NullPointerException in Hadoop's WritableComparator - java

I am trying to calculate word frequencies using the order inversion design pattern.
Here is my Java code:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;
import java.io.*;
import java.util.*;
public class WordFreq2 {
static enum StatusCounters {MAP_COUNTER, REDUCE_COUNTER, TOTAL_WORDS}
static enum MyExceptions {IO_EXCEPTION, INTERRUPTED_EXCEPTION, NULL_POINTER_EXCEPTION}
public static class MyComparator extends WritableComparator {
public int compare(WritableComparable a, WritableComparable b)
{
if (a.toString().equals("special_key0") && b.toString().equals("special_key1") )
return 0;
else
if ( a.toString().equals("special_key0") || a.toString().equals("special_key1") )
return -1;
else
if ( b.toString().equals("special_key0") || a.toString().equals("special_key1") )
return 1;
else
return a.toString().compareTo(b.toString());
}
}
public static class MyPartitioner extends Partitioner<Text,IntWritable>
{
public int getPartition(Text key, IntWritable value, int num)
{
if ( key.toString().equals("special_key0") )
return 0;
else
if ( key.toString().equals("special_key1") )
return 1;
else
return key.hashCode() % num;
}
}
/**
 * Mapper that counts tokens in a small in-memory table, flushing it to the output whenever
 * it is full, and that emits one {@code special_key<N>} record per reduce task at the end
 * of the task carrying the number of tokens this mapper has seen.
 */
public static class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text word = new Text();
    private final int MEMORYHASHSIZE = 7;
    private final HashMap<String,Integer> memoryHash = new HashMap<String,Integer>(MEMORYHASHSIZE);
    private int special_key_count = 0;

    protected void setup(Context context) throws IOException, InterruptedException {
        // nothing to prepare
    }

    /** Flushes the remaining counts, then writes the token total once per reduce task. */
    protected void cleanup(Context context) throws IOException, InterruptedException {
        flushMap(context);
        final int reducers = context.getNumReduceTasks();
        int index = 0;
        while (index < reducers) {
            word.set("special_key" + index);
            context.write(word, new IntWritable(special_key_count));
            index++;
        }
    }

    /** Writes every buffered (token, count) pair and removes it from the table. */
    private void flushMap(Context context) throws IOException, InterruptedException {
        for (Iterator<Map.Entry<String, Integer>> it = memoryHash.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry<String, Integer> pending = it.next();
            word.set(pending.getKey());
            context.write(word, new IntWritable(pending.getValue()));
            it.remove();
        }
    }

    /** Tokenizes one input line on whitespace and counts each token in the in-memory table. */
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer tokens = new StringTokenizer(value.toString());
        context.progress(); // in case of long running code, report that something is happening
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            Integer seen = memoryHash.get(token);
            if (seen != null) {
                // Token already present in the in-memory hash table: bump its counter.
                memoryHash.put(token, seen + 1);
            } else {
                // New token: make room first if the table has reached its size limit.
                if (memoryHash.size() == MEMORYHASHSIZE) {
                    flushMap(context);
                }
                memoryHash.put(token, 1);
            }
            special_key_count++;
            context.getCounter(StatusCounters.MAP_COUNTER).increment(1);
        }
    }
}
/**
 * Reducer that turns word counts into relative frequencies. It relies on the special
 * total-count key reaching it before any regular word (see the sort comparator and
 * partitioner configured in {@code main}).
 */
public static class Reduce extends Reducer<Text, IntWritable, Text, FloatWritable>
{
    /** Total number of words, accumulated from the special keys. */
    int total_words;

    protected void setup(Context context) throws IOException, InterruptedException {
        total_words = 0;
    }

    protected void cleanup(Context context) throws IOException, InterruptedException {
        context.getCounter(StatusCounters.TOTAL_WORDS).increment(total_words);
    }

    /**
     * Sums all partial counts for {@code key}. For a special key the sum is added to the
     * running total; for a regular word one (word, count / total) record is written.
     *
     * @throws IllegalStateException if a regular word arrives before any total was seen
     */
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Mappers flush partial counts, so one word can arrive with several values.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        String text = key.toString();
        // These literals must match the keys written by the mapper's cleanup().
        if (text.equals("special_key0") || text.equals("special_key1")) {
            total_words = total_words + sum;
        } else {
            if (total_words == 0) {
                throw new IllegalStateException(
                        "word '" + text + "' reached the reducer before the total word count");
            }
            // Cast before dividing; int / int would truncate every frequency to 0.
            float frequency = (float) sum / total_words;
            context.write(key, new FloatWritable(frequency));
        }
        context.progress(); // in case of long running code, report that something is happening
        context.getCounter(StatusCounters.REDUCE_COUNTER).increment(1);
    }
}
/**
 * Recursively deletes {@code p} when it exists and is a directory.
 *
 * @param job job whose configuration is used to resolve the file system of {@code p}
 * @param p   path to remove
 * @return the result of the delete call, or {@code false} when nothing was deleted
 * @throws IOException if the file system cannot be reached
 */
private static boolean deleteOutputDir(Job job, Path p) throws IOException {
    FileSystem fs = p.getFileSystem(job.getConfiguration());
    if (!fs.exists(p) || !fs.isDirectory(p)) {
        return false;
    }
    return fs.delete(p, true);
}
/**
 * Configures and runs the word-frequency job.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output directory
 *             (deleted first if it already exists)
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("usage: WordFreq2 <input path> <output path>");
        System.exit(2);
    }
    Job job = Job.getInstance();
    job.setJarByClass(WordFreq2.class);
    job.setJobName("wordfreq");
    /* type of map output */
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    /* type of reduce output */
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FloatWritable.class);
    /* specify input/output directories */
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    deleteOutputDir(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    /* How to read and write inputs/outputs */
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    /* specify program components */
    job.setMapperClass(MyMap.class);
    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(2); // the mapper emits one special key per reducer
    job.setSortComparatorClass(MyComparator.class);
    job.setPartitionerClass(MyPartitioner.class);
    boolean result = job.waitForCompletion(true);
    System.exit(result ? 0 : 1);
}
}
However, I constantly hit this error :
Error: java.lang.NullPointerException
at org.apache.hadoop.io.WritableComparator.compare(WritableComparator.java:128)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.compare(MapTask.java:1245)
at org.apache.hadoop.util.QuickSort.sortInternal(QuickSort.java:74)
at org.apache.hadoop.util.QuickSort.sort(QuickSort.java:63)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1575)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1462)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:700)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:340)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)
I am unable to figure out the issue. Can anyone point me in the right direction?

Related

why is the first output line in map reduce null in java

I don't understand why the first output line of my MapReduce job is an empty key with the value 0.
The output format is: url ; number of visits
Here is the mapper class:
public class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable>
{
public void map(LongWritable cle, Text valeur, Context sortie)
throws IOException
{
String url="";
int nbVisites=0;
Pattern httplogPattern = Pattern.compile("([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");
String ligne = valeur.toString();
if (ligne.length()>0) {
Matcher matcher = httplogPattern.matcher(ligne);
if (matcher.matches()) {
url = matcher.group(1);
nbVisites = Integer.parseInt(matcher.group(5));
}
}
Text urlText = new Text(url);
IntWritable value = new IntWritable(nbVisites);
try
{
sortie.write(urlText, value);
System.out.println(urlText + " ; " + value);
}
catch (InterruptedException e)
{
e.printStackTrace();
}
}
and reducer :
public class WordCountReducer extends
Reducer<Text, IntWritable, Text, IntWritable>
{
public void reduce(Text key, Iterable<IntWritable> values, Context sortie) throws IOException, InterruptedException
{
Iterator<IntWritable> it = values.iterator();
int nb=0;
while (it.hasNext()) {
nb = nb + it.next().get();
}
try {
sortie.write(key, new IntWritable(nb));
System.out.println(key.toString() + ";" + nb);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
Each line of the input file looks like this :
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
and here is the output :
0
04-dynamic-c.rotterdam.luna.net 4
06-dynamic-c.rotterdam.luna.net 1
10.salc.wsu.edu 3
11.ts2.mnet.medstroms.se 1
128.100.183.222 4
128.102.149.149 4
As you can see, the first line is an empty key with the value 0.
Thank you
You get an empty key (not null) because the mapper's default Text is an empty string, and it is written even when the line does not match the pattern. The reducer then sums the default values for that key, which gives 0.
It works fine if you check that your lines actually match before writing the output.
Here's a refactored version of your code:
public class WebLogDriver extends Configured implements Tool {
public static final String APP_NAME = WebLogDriver.class.getSimpleName();
public static void main(String[] args) throws Exception {
final int status = ToolRunner.run(new Configuration(), new WebLogDriver(), args);
System.exit(status);
}
#Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
Job job = Job.getInstance(conf, APP_NAME);
job.setJarByClass(WebLogDriver.class);
// outputs for mapper and reducer
job.setOutputKeyClass(Text.class);
// setup mapper
job.setMapperClass(WebLogDriver.WebLogMapper.class);
job.setMapOutputValueClass(IntWritable.class);
// setup reducer
job.setReducerClass(WebLogDriver.WebLogReducer.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
final Path outputDir = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outputDir);
return job.waitForCompletion(true) ? 0 : 1;
}
static class WebLogMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
static final Pattern HTTP_LOG_PATTERN = Pattern.compile("(\\S+) - - \\[(.+)] \"(\\S+) (/\\S*) HTTP/\\S+\" \\S+ (\\d+)");
final Text keyOut = new Text();
final IntWritable valueOut = new IntWritable();
#Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String line = value.toString();
if (line.isEmpty()) return;
Matcher matcher = HTTP_LOG_PATTERN.matcher(line);
if (matcher.matches()) {
keyOut.set(matcher.group(1));
try {
valueOut.set(Integer.parseInt(matcher.group(5)));
context.write(keyOut, valueOut);
} catch (NumberFormatException e) {
e.printStackTrace();
}
}
}
}
static class WebLogReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
static final IntWritable valueOut = new IntWritable();
#Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int nb = StreamSupport.stream(values.spliterator(), true)
.mapToInt(IntWritable::get)
.sum();
valueOut.set(nb);
context.write(key, valueOut);
}
}
}

Java Hadoop weird join behaviour

Aim
I have two CSV files that I am trying to join. One contains movieId, title and the other contains userId, movieId, comment-tag. I want to find out how many comment tags each movie has, by printing title, comment_count. So my code:
Driver
/**
 * Configures and runs the movie/tag join job. The job is started from the constructor and
 * the JVM exits with the job's status.
 */
public class Driver
{
    /**
     * @param args {@code args[1]} is the tag input, {@code args[2]} the movie input and
     *             {@code args[3]} the output directory; {@code args[0]} is not read
     */
    public Driver(String[] args)
    {
        // Indices 1..3 are read below, so at least four arguments are required.
        if (args.length < 4) {
            System.err.println("usage: <args[0] unused> <tag input path> <movie input path> <output path>");
            return;
        }
        try {
            Job job = Job.getInstance();
            job.setJobName("movie tag count");
            // set file input/output path
            MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, TagMapper.class);
            MultipleInputs.addInputPath(job, new Path(args[2]), TextInputFormat.class, MovieMapper.class);
            FileOutputFormat.setOutputPath(job, new Path(args[3]));
            // set jar class name
            job.setJarByClass(Driver.class);
            // set mapper and reducer to job
            job.setReducerClass(Reducer.class);
            // both mappers emit (Text, Text)
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            // the reducer emits (Text, IntWritable)
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            int returnValue = job.waitForCompletion(true) ? 0 : 1;
            System.out.println(job.isSuccessful());
            System.exit(returnValue);
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            e.printStackTrace();
        }
    }
}
MovieMapper
public class MovieMapper extends org.apache.hadoop.mapreduce.Mapper<Object, Text, Text, Text>
{
#Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] items = line.split("(?!\\B\"[^\"]*),(?![^\"]*\"\\B)"); //comma not in quotes
String movieId = items[0].trim();
if(tryParseInt(movieId))
{
context.write(new Text(movieId), new Text(items[1].trim()));
}
}
private boolean tryParseInt(String s)
{
try {
Integer.parseInt(s);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
TagMapper
public class TagMapper extends org.apache.hadoop.mapreduce.Mapper<Object, Text, Text, Text>
{
#Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException
{
String line = value.toString();
String[] items = line.split("(?!\\B\"[^\"]*),(?![^\"]*\"\\B)");
String movieId = items[1].trim();
if(tryParseInt(movieId))
{
context.write(new Text(movieId), new Text("_"));
}
}
private boolean tryParseInt(String s)
{
try {
Integer.parseInt(s);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
Reducer
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, IntWritable>
{
#Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
{
int noOfFrequency = 0;
Text movieTitle = new Text();
for (Text o : values)
{
if(o.toString().trim().equals("_"))
{
noOfFrequency++;
}
else
{
System.out.println(o.toString());
movieTitle = o;
}
}
context.write(movieTitle, new IntWritable(noOfFrequency));
}
}
The problem
The result I get is something like this:
title, count
_, count
title, count
title, count
_, count
title, count
_, count
How does this _ get to be the key? I can't understand it. There is an if statement that checks for _, counts it, and does not use it as the title. Is there something wrong with the toString() method, so that the equals comparison fails? Any ideas?
It is not weird: while you iterate through values, o is a reference to the current element, and Hadoop reuses the same Text object for every element. At some point you make movieTitle refer to that object with movieTitle = o. In a later iteration the object's contents become "_", so movieTitle now reads "_" as well.
If you change your code like this, everything works fine:
// Number of "_" marker values seen for this key.
int noOfFrequency = 0;
// Stays null when no non-marker value arrives for this key.
Text movieTitle = null;
for (Text o : values)
{
if(o.toString().trim().equals("_"))
{
noOfFrequency++;
}
else
{
// Copy the contents into a fresh Text so later iterations cannot change the title.
movieTitle = new Text(o.toString());
}
}
context.write(movieTitle, new IntWritable(noOfFrequency));

In MapReduce program, reducer is not getting called by Driver

Following the MapReduce programming model, I wrote this program; the driver code is as follows:
MY DRIVER CLASS
public class MRDriver extends Configured implements Tool
{
#Override
public int run(String[] strings) throws Exception {
if(strings.length != 2)
{
System.err.println("usage : <inputlocation> <inputlocation> <outputlocation>");
System.exit(0);
}
Job job = new Job(getConf(), "multiple files");
job.setJarByClass(MRDriver.class);
job.setMapperClass(MRMapper.class);
job.setReducerClass(MRReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(strings[0]));
FileOutputFormat.setOutputPath(job, new Path(strings[1]));
return job.waitForCompletion(true) ? 0 : 1;
//throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
System.exit(ToolRunner.run(conf, new MRDriver(), args));
}
}
MY MAPPER CLASS
class MRMapper extends Mapper<LongWritable, Text, Text, Text>
{
#Override
public void map(LongWritable key, Text value, Context context)
{
try
{
StringTokenizer iterator;
String idsimval = null;
iterator = new StringTokenizer(value.toString(), "\t");
String id = iterator.nextToken();
String sentival = iterator.nextToken();
if(iterator.hasMoreTokens())
idsimval = iterator.nextToken();
context.write(new Text("unique"), new Text(id + "_" + sentival + "_" + idsimval));
} catch (IOException | InterruptedException e)
{
System.out.println(e);
}
}
MY REDUCER CLASS
class MRReducer extends Reducer<Text, Text, Text, Text> {
String[] records;
HashMap<Long, String> sentiMap = new HashMap<>();
HashMap<Long, String> cosiMap = new HashMap<>();
private String leftIdStr;
private ArrayList<String> rightIDList, rightSimValList, matchingSimValList, matchingIDList;
private double leftVal;
private double rightVal;
private double currDiff;
private double prevDiff;
private int finalIndex;
Context newContext;
private int i;
/**
 * Collects every "id_sentiment_similarities" record into the two lookup maps and triggers
 * {@code newfun()} once the 2588th record has been read.
 *
 * The method must be named {@code reduce} to override {@code Reducer.reduce}; under any
 * other name the framework runs the inherited identity reducer and this code is never called.
 */
@Override
public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
    for (Text string : value) {
        records = string.toString().split("_");
        if (records.length < 2) {
            // Not in the id_sentiment[_similarities] form written by the mapper.
            continue;
        }
        sentiMap.put(Long.parseLong(records[0]), records[1]);
        // split() never yields null elements; a missing third field arrives as the text "null".
        if (records.length > 2 && !"null".equals(records[2])) {
            cosiMap.put(Long.parseLong(records[0]), records[2]);
        }
        if(++i == 2588)
        {
            newContext = context;
            newfun();
        }
        context.write(new Text("hello"), new Text("hii"));
    }
    context.write(new Text("hello"), new Text("hii"));
}
/**
 * For every entry of cosiMap, finds the right-hand ids whose sentiMap value is closest to
 * the left id's value and writes (left id, " rightId:similarity") for the one selected by
 * Collections.max over the matching similarity strings.
 * Output goes to newContext, which the caller must have set.
 */
void newfun() throws IOException, InterruptedException
{
for (HashMap.Entry<Long, String> firstEntry : cosiMap.entrySet()) {
try {
leftIdStr = firstEntry.getKey().toString();
rightIDList = new ArrayList<>();
rightSimValList = new ArrayList<>();
matchingSimValList = new ArrayList<>();
matchingIDList = new ArrayList<>();
// Each space-separated item: characters 0-17 are stored as the id and everything from
// index 19 as the similarity value (index 18 is skipped - presumably a separator; confirm
// against the input format).
for (String strTmp : firstEntry.getValue().split(" ")) {
rightIDList.add(strTmp.substring(0, 18));
rightSimValList.add(strTmp.substring(19));
}
// The literal text "NULL" in sentiMap is treated as 0.
String tmp = sentiMap.get(Long.parseLong(leftIdStr));
if ("NULL".equals(tmp)) {
leftVal = Double.parseDouble("0");
} else {
leftVal = Double.parseDouble(tmp);
}
tmp = sentiMap.get(Long.parseLong(rightIDList.get(0)));
if ("NULL".equals(tmp)) {
rightVal = Double.parseDouble("0");
} else {
rightVal = Double.parseDouble(tmp);
}
// Start from the difference to the first right id.
prevDiff = Math.abs(leftVal - rightVal);
int oldIndex = 0;
// First pass: prevDiff ends up as the smallest |leftVal - rightVal| over all right ids.
for (String s : rightIDList) {
try {
oldIndex++;
tmp = sentiMap.get(Long.parseLong(s));
if ("NULL".equals(tmp)) {
rightVal = Double.parseDouble("0");
} else {
rightVal = Double.parseDouble(tmp);
}
currDiff = Math.abs(leftVal - rightVal);
if (prevDiff > currDiff) {
prevDiff = currDiff;
}
} catch (Exception e) {
// NOTE(review): any failure for this id is silently ignored and the id is skipped.
}
}
oldIndex = 0;
// Second pass: collect every right id whose difference equals that minimum.
for (String s : rightIDList) {
tmp = sentiMap.get(Long.parseLong(s));
if ("NULL".equals(tmp)) {
rightVal = Double.parseDouble("0");
} else {
rightVal = Double.parseDouble(tmp);
}
currDiff = Math.abs(leftVal - rightVal);
if (Objects.equals(prevDiff, currDiff)) {
matchingSimValList.add(rightSimValList.get(oldIndex));
matchingIDList.add(rightIDList.get(oldIndex));
}
oldIndex++;
}
// NOTE(review): the similarity values are Strings, so Collections.max compares them
// lexicographically, not numerically; it throws NoSuchElementException on an empty list.
finalIndex = rightSimValList.indexOf(Collections.max(matchingSimValList));
newContext.write(new Text(leftIdStr), new Text(" " + rightIDList.get(finalIndex) + ":" + rightSimValList.get(finalIndex)));
} catch (NumberFormatException nfe) {
// NOTE(review): an entry with an unparsable number is dropped without any record of it.
}
}
}
}
What is the problem, and does it lie in the MapReduce program or in the Hadoop system configuration? Whenever I run this program, it only writes the mapper output into HDFS.
Inside your Reducer class you must override the reduce method. You are declaring a method named reducer, which the framework never calls.
Try changing the method declaration inside the Reducer class to:
#Override
public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {

Hadoop mapper is never called, custom input format might be the issue

So I am writing a little test program just to get the hang of Hadoop's InputFormat classes. I already had a word search built which took in lines as values and searched for the word line by line. I wanted to see if I could get Hadoop to take in values word by word. Hadoop doesn't seem to like that and keeps giving me results from the default mapper. My mapper's initialization function is never even called.
I do know that my record reader is called and that it does more or less what it is supposed to, and I'm pretty sure the output of the record reader is what my mapper is searching for, so why does Hadoop decide not to call my mapper?
Here is the relevant code
Input Format Class
public class WordReader extends FileInputFormat<Text, Text> {
#Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) {
return new MyWholeFileReader();
}
}
Record Reader
public class MyWholeFileReader extends RecordReader<Text, Text> {
private long start;
private LineReader in;
private Text key = null;
private Text value = null;
private ArrayList<String> outputvalues;
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
outputvalues = new ArrayList<String>();
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
start = split.getStart();
final Path file = split.getPath();
// open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
in = new LineReader(fileIn, job);
if (key == null) {
key = new Text();
}
key.set(split.getPath().getName());
if (value == null) {
value = new Text();
}
}
public boolean nextKeyValue() throws IOException {
if (outputvalues.size() == 0) {
Text buffer = new Text();
int i = in.readLine(buffer);
String str = buffer.toString();
for (String vals : str.split(" ")) {
outputvalues.add(vals);
}
if (i == 0 || outputvalues.size() == 0) {
key = null;
value = null;
return false;
}
}
value.set(outputvalues.remove(0));
System.out.println(value.toString());
return true;
}
#Override
public Text getCurrentKey() {
return key;
}
#Override
public Text getCurrentValue() {
return value;
}
/**
*
* Get the progress within the split
*/
public float getProgress() {
return 0.0f;
}
public synchronized void close() throws IOException {
if (in != null) {
in.close();
}
}
}
Mapper
public class WordSearchMapper extends Mapper<Text, Text, OutputCollector<Text,IntWritable>, Reporter> {
static String keyword;
BloomFilter<String> b;
public void configure(JobContext jobConf) {
keyword = jobConf.getConfiguration().get("keyword");
System.out.println("keyword>> " + keyword);
b = new BloomFilter<String>(.01,10000);
b.add(keyword);
System.out.println(b.getExpectedBitsPerElement());
}
public void map(Text key, Text value, OutputCollector<Text,IntWritable> output,
Reporter reporter) throws IOException {
int wordPos;
System.out.println("value.toString()>> " + value.toString());
System.out.println(((FileSplit) reporter.getInputSplit()).getPath()
.getName());
String[] tokens = value.toString().split("[\\p{P} \\t\\n\\r]");
for (String st :tokens) {
if (b.contains(st)) {
if (value.toString().contains(keyword)) {
System.out.println("Found one");
wordPos = ((Text) value).find(keyword);
output.collect(value, new IntWritable(wordPos));
}
}
}
}
}
Driver:
public class WordSearch {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf,"WordSearch");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(WordSearchMapper.class);
job.setInputFormatClass( WordReader.class);
job.setOutputFormatClass(TextOutputFormat.class);
conf.set("keyword", "the");
FileInputFormat.setInputPaths(job, new Path("search.txt"));
FileOutputFormat.setOutputPath(job, new Path("outputs"+System.currentTimeMillis()));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
And I figured it out... this is why Hadoop needs to stop supporting multiple versions of itself, or why I should stop jamming multiple tutorials together. It turns out my mapper needs to be declared like this for it to work with my record reader:
`public class WordSearchMapper extends Mapper<Text, Text, Text, IntWritable> { static String keyword;`
I only realized this after looking at my imports and seeing that Reporter was from the package org.apache.hadoop.mapred as opposed to org.apache.hadoop.mapreduce.

ClassCast Error while writing to Cassandra from hadoop job

I am running a hadoop job and trying to write the output to Cassandra. I am getting following exception:
java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to java.nio.ByteBuffer
at org.apache.cassandra.hadoop.ColumnFamilyRecordWriter.write(ColumnFamilyRecordWriter.java:60)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:514)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at org.apache.hadoop.mapreduce.Reducer.reduce(Reducer.java:156)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:572)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:414)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
I modeled my map reduce code on the WordCount example given at https://wso2.org/repos/wso2/trunk/carbon/dependencies/cassandra/contrib/word_count/src/WordCount.java
Here's my MR code:
public class SentimentAnalysis extends Configured implements Tool {
static final String KEYSPACE = "Travel";
static final String OUTPUT_COLUMN_FAMILY = "Keyword_PtitleId";
/**
 * Parses each input line as a JSON {@code Sentiment}; for records flagged as "like" it emits
 * (normalised keyword, ptitle id). Lines that fail to parse are skipped.
 */
public static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final String json = value.toString();
        final Sentiment parsed;
        try {
            parsed = (Sentiment) PojoMapper.fromJson(json, Sentiment.class);
        } catch(Exception e) {
            // unparsable line: ignore it
            return;
        }
        if (parsed == null || !parsed.isLike()) {
            return;
        }
        word.set(parsed.getNormKeyword());
        context.write(word, new LongWritable(parsed.getPtitleId()));
    }
}
public static class Reduce extends Reducer<Text, LongWritable, ByteBuffer, List<Mutation>> {
private ByteBuffer outputKey;
public void reduce(Text key, Iterator<LongWritable> values, Context context) throws IOException, InterruptedException {
List<Long> ptitles = new ArrayList<Long>();
java.util.Map<Long, Integer> ptitleToFrequency = new HashMap<Long, Integer>();
while (values.hasNext()) {
Long value = values.next().get();
ptitles.add(value);
}
for(Long ptitle : ptitles) {
if(ptitleToFrequency.containsKey(ptitle)) {
ptitleToFrequency.put(ptitle, ptitleToFrequency.get(ptitle) + 1);
}
else {
ptitleToFrequency.put(ptitle, 1);
}
}
byte[] keyBytes = key.getBytes();
outputKey = ByteBuffer.wrap(Arrays.copyOf(keyBytes, keyBytes.length));
for(Long ptitle : ptitleToFrequency.keySet()) {
context.write(outputKey, Collections.singletonList(getMutation(new Text(ptitle.toString()), ptitleToFrequency.get(ptitle))));
}
}
private static Mutation getMutation(Text word, int sum)
{
Column c = new Column();
byte[] wordBytes = word.getBytes();
c.name = ByteBuffer.wrap(Arrays.copyOf(wordBytes, wordBytes.length));
c.value = ByteBuffer.wrap(String.valueOf(sum).getBytes());
c.timestamp = System.currentTimeMillis() * 1000;
Mutation m = new Mutation();
m.column_or_supercolumn = new ColumnOrSuperColumn();
m.column_or_supercolumn.column = c;
return m;
}
}
/** Runs the tool through ToolRunner and exits with its return code. */
public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new SentimentAnalysis(), args));
}
#Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "SentimentAnalysis");
job.setJarByClass(SentimentAnalysis.class);
String inputFile = args[0];
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(ByteBuffer.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
job.setInputFormatClass(TextInputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
FileInputFormat.setInputPaths(job, inputFile);
ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
}
If you look at the Reduce class, you can see that I am converting the Text field (key) to a ByteBuffer properly.
I would appreciate some pointers on how to fix this.
After some trial and error, I was able to figure out how to solve this particular issue. In my reduce method signature I was using Iterator instead of Iterable, so my reduce method was never called and the default identity reducer ran instead. Hadoop was therefore trying to write my mapper output (Text, LongWritable) to Cassandra, while the output key/value classes declared for the reducer were (ByteBuffer, List). This caused the ClassCastException.
Changing the reduce method signature to use Iterable solved the issue.

Categories

Resources