java.lang.reflect.InvocationTargetException in Reduce DataJoin ( Hadoop In Action ) - java

I am having a problem running the DataJoin example from Hadoop in Action. While the job is running, a java.lang.reflect.InvocationTargetException is thrown. I have been trying for a day and still cannot get it to work. What did I do wrong?
Below is the exception
12/12/30 01:54:06 INFO mapred.JobClient: Task Id : attempt_201212280853_0032_m_000000_2, Status : FAILED
java.lang.RuntimeException: Error in configuring object
at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:106)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:72)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:130)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:389)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:327)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1332)
at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
Below is the code, taken from Hadoop in Action. I have customers.txt and orders.txt in my input directory. I tried renaming these two files in the input directory to part-0000.txt and part-0001.txt, and it still doesn't work.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
/**
* DataJoinMapperBase has a method method - http://hadoop.apache.org/docs/mapreduce/r0.21.0/api/org/apache/hadoop/contrib/utils/join/DataJoinMapperBase.html
*
* map(Object key, Object value, OutputCollector output, Reporter reporter)
*
*/
public class DataJoin extends Configured implements Tool {
public static class MapClass extends DataJoinMapperBase {
protected Text generateInputTag(String inputFile) {
String datasource = inputFile.split("-")[0];
return new Text(datasource);
}
protected Text generateGroupKey(TaggedMapOutput aRecord) {
String line = ((Text) aRecord.getData()).toString();
String[] tokens = line.split(",");
String groupKey = tokens[0];
return new Text(groupKey);
}
protected TaggedMapOutput generateTaggedMapOutput(Object value) {
TaggedWritable retv = new TaggedWritable();
retv.setData((Text) value);
retv.setTag(this.inputTag);
return retv;
}
}
public static class Reduce extends DataJoinReducerBase {
protected TaggedMapOutput combine(Object[] tags, Object[] values) {
if (tags.length < 2) return null;
String joinedStr = "";
for (int i=0; i<values.length; i++) {
if (i > 0) joinedStr += ",";
TaggedWritable tw = (TaggedWritable) values[i];
String line = ((Text) tw.getData()).toString();
String[] tokens = line.split(",", 2);
joinedStr += tokens[1];
}
TaggedWritable retv = new TaggedWritable();
retv.setData(new Text(joinedStr));
retv.setTag((Text) tags[0]);
return retv;
}
}
public static class TaggedWritable extends TaggedMapOutput {
private Writable data;
// public TaggedWritable(Writable data) {
// this.tag = new Text("");
// this.data = data;
// }
public TaggedWritable() {
this.tag = new Text();
}
public Writable getData() {
return data;
}
public void setData(Writable data) {
this.data = data;
}
public void write(DataOutput out) throws IOException {
this.tag.write(out);
this.data.write(out);
}
public void readFields(DataInput in) throws IOException {
this.tag.readFields(in);
String dataClz = in.readUTF();
if (this.data == null
|| !this.data.getClass().getName().equals(dataClz)) {
try {
this.data = (Writable) ReflectionUtils.newInstance(
Class.forName(dataClz), null);
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
this.data.readFields(in);
}
// public void readFields(DataInput in) throws IOException {
// this.tag.readFields(in);
// this.data.readFields(in);
// }
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, DataJoin.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("DataJoin");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TaggedWritable.class);
job.set("mapred.textoutputformat.separator", ",");
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(),
new DataJoin(),
args);
System.exit(res);
}
}
I hit a similar problem, and java.lang.reflect.InvocationTargetException was the root cause. What did I do wrong?
12/12/30 01:54:06 INFO mapred.JobClient: Task Id : attempt_201212280853_0032_m_000000_2, Status : FAILED
java.lang.RuntimeException: Error in configuring object
at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:106)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:72)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:130)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:389)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:327)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1332)
at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
Below is the code taken from the Hadoop In Action
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
/**
* DataJoinMapperBase has a method method - http://hadoop.apache.org/docs/mapreduce/r0.21.0/api/org/apache/hadoop/contrib/utils/join/DataJoinMapperBase.html
*
* map(Object key, Object value, OutputCollector output, Reporter reporter)
*
*/
public class DataJoin extends Configured implements Tool {
public static class MapClass extends DataJoinMapperBase {
protected Text generateInputTag(String inputFile) {
String datasource = inputFile.split("-")[0];
return new Text(datasource);
}
protected Text generateGroupKey(TaggedMapOutput aRecord) {
String line = ((Text) aRecord.getData()).toString();
String[] tokens = line.split(",");
String groupKey = tokens[0];
return new Text(groupKey);
}
protected TaggedMapOutput generateTaggedMapOutput(Object value) {
TaggedWritable retv = new TaggedWritable();
retv.setData((Text) value);
retv.setTag(this.inputTag);
return retv;
}
}
public static class Reduce extends DataJoinReducerBase {
protected TaggedMapOutput combine(Object[] tags, Object[] values) {
if (tags.length < 2) return null;
String joinedStr = "";
for (int i=0; i<values.length; i++) {
if (i > 0) joinedStr += ",";
TaggedWritable tw = (TaggedWritable) values[i];
String line = ((Text) tw.getData()).toString();
String[] tokens = line.split(",", 2);
joinedStr += tokens[1];
}
TaggedWritable retv = new TaggedWritable();
retv.setData(new Text(joinedStr));
retv.setTag((Text) tags[0]);
return retv;
}
}
public static class TaggedWritable extends TaggedMapOutput {
private Writable data;
// public TaggedWritable(Writable data) {
// this.tag = new Text("");
// this.data = data;
// }
public TaggedWritable() {
this.tag = new Text();
}
public Writable getData() {
return data;
}
public void setData(Writable data) {
this.data = data;
}
public void write(DataOutput out) throws IOException {
this.tag.write(out);
this.data.write(out);
}
/***
// public void readFields(DataInput in) throws IOException {
// this.tag.readFields(in);
// String dataClz = in.readUTF();
// if (this.data == null
// || !this.data.getClass().getName().equals(dataClz)) {
// try {
// this.data = (Writable) ReflectionUtils.newInstance(
// Class.forName(dataClz), null);
// } catch (ClassNotFoundException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
//this.data.readFields(in);
//}
*****/
public void readFields(DataInput in) throws IOException {
this.tag.readFields(in);
this.data.readFields(in);
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, DataJoin.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("DataJoin");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TaggedWritable.class);
job.set("mapred.textoutputformat.separator", ",");
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(),
new DataJoin(),
args);
System.exit(res);
}
}

Related

Pass variables to mapper and reducer in Hadoop (old api)

I am not an expert in Hadoop and I have the following problem. I have a job that has to run on a cluster with Hadoop version 0.20.2.
When I start the job I specify some parameters. I want to pass two of them to the mapper and reducer classes because I need them there.
I have tried different solutions, and now my code looks like this:
package bigdata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeMap;
import org.apache.commons.math3.stat.regression.SimpleRegression;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.vividsolutions.jts.geom.Geometry;
import com.vividsolutions.jts.io.ParseException;
import com.vividsolutions.jts.io.WKTReader;
public class BoxCount extends Configured implements Tool{
private static String mbr;
private static double cs;
public static class Map extends Mapper<LongWritable, Text, IntWritable, Text> implements JobConfigurable
{
public void configure(JobConf job) {
mbr = job.get(mbr);
cs = job.getDouble("cellSide", 0.1);
}
protected void setup(Context context)
throws IOException, InterruptedException {
// metodo in cui leggere l'MBR passato come parametro
System.out.println("mbr: " + mbr + "\ncs: " + cs);
// ...
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// some code here
}
protected void cleanup(Context context) throws IOException, InterruptedException
{
// other code
}
}
public static class Reduce extends Reducer<IntWritable,Text,IntWritable,IntWritable>implements JobConfigurable
{
private static String mbr;
private static double cs;
public void configure(JobConf job) {
mbr = job.get(mbr);
cs = job.getDouble("cellSide", 0.1);
}
protected void setup(Context context) throws IOException, InterruptedException
{
System.out.println("mbr: " + mbr + " cs: " + cs);
}
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
//the reduce code
}
#SuppressWarnings("unused")
protected void cleanup(Context context)
throws IOException, InterruptedException {
// cleanup code
}
public BoxCount (String[] args) {
if (args.length != 4) {
// 0 1 2 3
System.out.println("Usage: OneGrid <mbr (Rectangle: (xmin,ymin)-(xmax,ymax))> <cell_Side> <input_path> <output_path>");
System.out.println("args.length = "+args.length);
for(int i = 0; i< args.length;i++)
System.out.println("args["+i+"]"+" = "+args[i]);
System.exit(0);
}
this.numReducers = 1;
//this.mbr = new String(args[0]);
// this.mbr = "Rectangle: (0.01,0.01)-(99.99,99.99)";
// per sierpinski_jts
this.mbr = "Rectangle: (0.0,0.0)-(100.01,86.6125)";
// per diagonale
//this.mbr = "Rectangle: (1.5104351688932738,1.0787616413335854)-(99999.3453727045,99999.98043392139)";
// per uniforme
// this.mbr = "Rectangle: (0.3020720559407146,0.2163091760095974)-(99999.68881210628,99999.46079314972)";
this.cellSide = Double.parseDouble(args[1]);
this.inputPath = new Path(args[2]);
this.outputDir = new Path(args[3]);
// Ricalcola la cellSize in modo da ottenere
// almeno minMunGriglie (10) griglie!
Grid g = new Grid(mbr, cellSide);
if ((this.cellSide*(Math.pow(2,minNumGriglie))) > g.width)
this.cellSide = g.width/(Math.pow(2,minNumGriglie));
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new BoxCount(args), args);
System.exit(res);
}
public int run(String[] args) throws Exception
{
// define new job instead of null using conf
Configuration conf = getConf();
#SuppressWarnings("deprecation")
Job job = new Job(conf, "BoxCount");
// conf.set("mapreduce.framework.name", "local");
// conf.set("mapreduce.jobtracker.address", "local");
// conf.set("fs.defaultFS","file:///");
// passo il valore mbr per creare la griglia
conf.set("mbr", mbr);
// passo lato cella
conf.setDouble("cellSide", cellSide);
job.setJarByClass(BoxCount.class);
// set job input format
job.setInputFormatClass(TextInputFormat.class);
// set map class and the map output key and value classes
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(Map.class);
// set reduce class and the reduce output key and value classes
job.setReducerClass(Reduce.class);
// set job output format
job.setOutputFormatClass(TextOutputFormat.class);
// add the input file as job input (from HDFS) to the variable
// inputFile
TextInputFormat.setInputPaths(job, inputPath);
// set the output path for the job results (to HDFS) to the variable
// outputPath
TextOutputFormat.setOutputPath(job, outputDir);
// set the number of reducers using variable numberReducers
job.setNumReduceTasks(numReducers);
// set the jar class
job.setJarByClass(BoxCount.class);
return job.waitForCompletion(true) ? 0 : 1; // this will execute the job
}
}
But the job does not run. What is the correct solution?

CombineFileInputFormat implementation for XML files

I have to process 250 XML files each of which is 25 MB in size. For processing XML files, I am using XMLInputFormat from Apache Mahout and generating a sequence file. The key is filename and value is entire file contents in sequence file. But problem with this approach is that 250 Mappers are launched which makes the MapReduce job slower.
I have come across CombineFileInputFormat (while going through Tom White book) using which 250 Mappers wouldn't be launched for 250 files. But CombineFileInputFormat is an abstract class and I am facing difficulty implementing it for XML files as I am new to Java as well as Hadoop.
So, can someone please provide me implementation of CombineFileInputFormat for XML files.
Driver Code:
package com.ericsson.sequencefile;
//A MapReduce program for packaging a collection of small files as a single SequenceFile.
//hadoop jar sequencefiles.jar com.ericsson.sequencefile.SmallFilesToSequenceFileConverter -D xmlinput.start="<XMLstart>" -D xmlinput.end="</XMLstart>" /IRIS_NG/pfinder2/ccn/archive /IRIS_NG/pfinder2/output
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {
public static class SequenceFileMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text filenameKey;
#Override
public void setup(Context context) throws IOException, InterruptedException {
InputSplit split = context.getInputSplit();
Path path = ((FileSplit) split).getPath();
filenameKey = new Text(path.toString() + "\n");
}
#Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String document = value.toString();
context.write(filenameKey, new Text(document));
}
}
#Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Configuration conf = getConf();
Job job = Job.getInstance(conf,"SmallFilesToSequenceFile");
job.setJarByClass(getClass());
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setInputFormatClass(XmlInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(SequenceFileMapper.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
System.exit(exitCode);
}
}
XmlInputFormat.java
package com.ericsson.sequencefile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.slf4j.*;
import java.io.IOException;
/**
* Reads records that are delimited by a specific begin/end tag.
*/
public class XmlInputFormat extends TextInputFormat {
private static final Logger log =
LoggerFactory.getLogger(XmlInputFormat.class);
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
#Override
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
try {
return new XmlRecordReader((FileSplit) split,
context.getConfiguration());
} catch (IOException ioe) {
log.warn("Error while creating XmlRecordReader", ioe);
return null;
}
}
/**
* XMLRecordReader class to read through a given xml document to
* output xml blocks as records as specified
* by the start tag and end tag
*/
public static class XmlRecordReader
extends RecordReader<LongWritable, Text> {
private final byte[] startTag;
private final byte[] endTag;
private final long start;
private final long end;
private final FSDataInputStream fsin;
private final DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable currentKey;
private Text currentValue;
public XmlRecordReader(FileSplit split, Configuration conf)
throws IOException {
startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");
// open the file and seek to the start of the split
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(split.getPath());
fsin.seek(start);
}
private boolean next(LongWritable key, Text value)
throws IOException {
if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0, buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
return false;
}
#Override
public void close() throws IOException {
fsin.close();
}
#Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1) {
return false;
}
// save to buffer:
if (withinBlock) {
buffer.write(b);
}
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length) {
return true;
}
} else {
i = 0;
}
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end) {
return false;
}
}
}
#Override
public LongWritable getCurrentKey()
throws IOException, InterruptedException {
return currentKey;
}
#Override
public Text getCurrentValue()
throws IOException, InterruptedException {
return currentValue;
}
#Override
public void initialize(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
}
#Override
public boolean nextKeyValue()
throws IOException, InterruptedException {
currentKey = new LongWritable();
currentValue = new Text();
return next(currentKey, currentValue);
}
}
}

MapReduce Job hangs

I am new to Hadoop's MapReduce. I have written a MapReduce job and I am trying to run it on my local machine, but the job hangs after map 100%.
Below is the code; I don't understand what I am missing.
I have a custom key class:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class AirlineMonthKey implements WritableComparable<AirlineMonthKey>{
Text airlineName;
Text month;
public AirlineMonthKey(){
super();
}
public AirlineMonthKey(Text airlineName, Text month) {
super();
this.airlineName = airlineName;
this.month = month;
}
public Text getAirlineName() {
return airlineName;
}
public void setAirlineName(Text airlineName) {
this.airlineName = airlineName;
}
public Text getMonth() {
return month;
}
public void setMonth(Text month) {
this.month = month;
}
#Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
this.airlineName.readFields(in);
this.month.readFields(in);
}
#Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
this.airlineName.write(out);
this.month.write(out);
}
#Override
public int compareTo(AirlineMonthKey airlineMonthKey) {
// TODO Auto-generated method stub
int diff = getAirlineName().compareTo(airlineMonthKey.getAirlineName());
if(diff != 0){
return diff;
}
int m1 = Integer.parseInt(getMonth().toString());
int m2 = Integer.parseInt(airlineMonthKey.getMonth().toString());
if(m1>m2){
return -1;
}
else
return 1;
}
}
and the mapper and reducer classes that use the custom key are below.
package com.mapresuce.secondarysort;
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.opencsv.CSVReader;
public class FlightDelayByMonth {
public static class FlightDelayByMonthMapper extends
Mapper<Object, Text, AirlineMonthKey, Text> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str = value.toString();
// Reading Line one by one from the input CSV.
CSVReader reader = new CSVReader(new StringReader(str));
String[] split = reader.readNext();
reader.close();
String airlineName = split[6];
String month = split[2];
String year = split[0];
String delayMinutes = split[37];
String cancelled = split[41];
if (!(airlineName.equals("") || month.equals("") || delayMinutes
.equals(""))) {
if (year.equals("2008") && cancelled.equals("0.00")) {
AirlineMonthKey airlineMonthKey = new AirlineMonthKey(
new Text(airlineName), new Text(month));
Text delay = new Text(delayMinutes);
context.write(airlineMonthKey, delay);
System.out.println("1");
}
}
}
}
public static class FlightDelayByMonthReducer extends
Reducer<AirlineMonthKey, Text, Text, Text> {
public void reduce(AirlineMonthKey key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
for(Text val : values){
context.write(new Text(key.getAirlineName().toString()+" "+key.getMonth().toString()), val);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage:<in> <out>");
System.exit(2);
}
Job job = new Job(conf, "Average monthly flight dealy");
job.setJarByClass(FlightDelayByMonth.class);
job.setMapperClass(FlightDelayByMonthMapper.class);
job.setReducerClass(FlightDelayByMonthReducer.class);
job.setOutputKeyClass(AirlineMonthKey.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I have also created a job and a configuration in main. I don't know what I am missing. I am running all of this in a local environment.
Try writing custom implementations of toString, equals and hashCode in your AirlineMonthKey class.
Read below link.
http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/io/WritableComparable.html
It is important for key types to implement hashCode().
Hope this could help you.
The issue was that I had to provide a default constructor in AirlineMonthKey (which I did) and initialize the instance variables in the custom key class (which I didn't).

java.lang.ClassCastException: class org.json.JSONObject in MapReduce program

I have an input text file as given below (partial):
{"author":"Martti Paturi","book":"Aiotko oppikouluun"}
{"author":"International Meeting of Neurobiologists Amsterdam 1959.","book":"Structure and function of the cerebral cortex"}
{"author":"Paraná (Brazil : State). Comissão de Desenvolvimento Municipal.","book":"Plano diretor de desenvolvimento de Maringá"}
I need to perform MapReduce on this file to get as output a JSON object which has all the books from the same author in a JSON array, in the form:
{"author": "Ian Fleming", "books": [{"book": "Goldfinger"},{"book": "Moonraker"}]}
My code is as follows:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.json.*;
public class CombineBooks {
//TODO define variables and implement necessary components
/*public static class MyTuple implements Writable{
private String author;
private String book;
public void readFields(DataInput in){
JSONObject obj = new JSONObject(in.readLine());
author = obj.getString("author");
book = obj.getString("book");
}
public void write(DataOutput out){
out.writeBytes(author);
out.writeBytes(book);
}
public static MyTuple read(DataInput in){
MyTuple tup = new MyTuple();
tup.readFields(in);
return tup;
}
}*/
public static class Map extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
String author;
String book;
String line = value.toString();
String[] tuple = line.split("\\n");
try{
for(int i=0;i<tuple.length; i++){
JSONObject obj = new JSONObject(tuple[i]);
author = obj.getString("author");
book = obj.getString("book");
context.write(new Text(author), new Text(book));
}
}catch(JSONException e){
e.printStackTrace();
}
}
}
public static class Combine extends Reducer<Text, Text, Text, Text>{
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
String booklist = null;
int i = 0;
for(Text val : values){
if(booklist.equals(null)){
booklist = booklist + val.toString();
}
else{
booklist = booklist + "," + val.toString();
}
i++;
}
context.write(key, new Text(booklist));
}
}
public static class Reduce extends Reducer<Text,Text,JSONObject,NullWritable>{
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
try{
JSONArray ja = new JSONArray();
String[] book = null;
for(Text val : values){
book = val.toString().split(",");
}
for(int i=0; i<book.length; i++){
JSONObject jo = new JSONObject().put("book", book[i]);
ja.put(jo);
}
JSONObject obj = new JSONObject();
obj.put("author", key.toString());
obj.put("books", ja);
context.write(obj, NullWritable.get());
}catch(JSONException e){
e.printStackTrace();
}
}
}
/**
 * Configures and runs the CombineBooks job.
 *
 * @param args generic Hadoop options followed by the input and output paths
 * @throws Exception if the job cannot be configured or submitted
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args)
            .getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: CombineBooks <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "CombineBooks");
    job.setJarByClass(CombineBooks.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
    // The mapper and combiner emit (Text, Text). Without these two calls the
    // map output classes default to the final output classes below, and the
    // map task fails with "ClassCastException: class org.json.JSONObject"
    // while looking up the key comparator.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(JSONObject.class);
    job.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
When I try to run it, I get the following error:
java.lang.ClassCastException: class org.json.JSONObject
at java.lang.Class.asSubclass(Class.java:3165)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:795)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.<init>(MapTask.java:964)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:673)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:756)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:364)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
I am using java-json.jar as an external dependency. I am not sure what the error is here. Any help is appreciated!
The JSON jar file has to be saved in the Hadoop lib folder; then try executing the program again.
Have a look at: Hadoop Writable. You are indeed telling Hadoop to set the class of the output key, but JSONObject doesn't implement the Writable interface.
Why don't you just output text?
context.write(new Text(jo.toString()), NullWritable.get());

FileNotFoundException when using Hadoop distributed cache

This time, could someone please reply?
I am struggling to run my code using the distributed cache. I already have the files on HDFS, but when I run this code:
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.awt.image.Raster;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.imageio.ImageIO;
import org.apache.hadoop.filecache.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import java.lang.String;
import java.lang.Runtime;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
public class blur2 {
public static class BlurMapper extends MapReduceBase implements Mapper<Text, BytesWritable, LongWritable, BytesWritable>
{
OutputCollector<LongWritable, BytesWritable> goutput;
int IMAGE_HEIGHT = 240;
int IMAGE_WIDTH = 320;
public BytesWritable Gmiu;
public BytesWritable Gsigma;
public BytesWritable w;
byte[] bytes = new byte[IMAGE_HEIGHT*IMAGE_WIDTH*3];
public BytesWritable emit = new BytesWritable(bytes);
int count = 0;
int initVar = 125;
public LongWritable l = new LongWritable(1);
byte[] byte1 = new byte[IMAGE_HEIGHT*IMAGE_WIDTH];
byte[] byte2 = new byte[IMAGE_HEIGHT*IMAGE_WIDTH];
byte[] byte3 = new byte[IMAGE_HEIGHT*IMAGE_WIDTH];
public void map(Text key, BytesWritable file,OutputCollector<LongWritable, BytesWritable> output, Reporter reporter) throws IOException {
goutput = output;
BufferedImage img = ImageIO.read(new ByteArrayInputStream(file.getBytes()));
Raster ras=img.getData();
DataBufferByte db= (DataBufferByte)ras.getDataBuffer();
byte[] data = db.getData();
if(count==0){
for(int i=0;i<IMAGE_HEIGHT*IMAGE_WIDTH;i++)
{
byte1[i]=20;
byte2[i]=125;
}
Gmiu = new BytesWritable(data);
Gsigma = new BytesWritable(byte1);
w = new BytesWritable(byte2);
count++;
}
else{
byte1 = Gmiu.getBytes();
byte2 = Gsigma.getBytes();
byte3 = w.getBytes();
for(int i=0;i<IMAGE_HEIGHT*IMAGE_WIDTH;i++)
{
byte pixel = data[i];
Double tempmiu=new Double(0.0);
Double tempsig=new Double(0.0);
double temp1=0.0; double alpha = 0.05;
tempmiu = (1-alpha)*byte1[i] + alpha*pixel;
temp1=temp1+(pixel-byte1[i])*(pixel-byte1[i]);
tempsig=(1-alpha)*byte2[i]+ alpha*temp1;
byte1[i] = tempmiu.byteValue();
byte2[i]= tempsig.byteValue();
Double w1=new Double((1-alpha)*byte3[i]+alpha*100);
byte3[i] = w1.byteValue();
}
Gmiu.set(byte1,0,IMAGE_HEIGHT*IMAGE_WIDTH);
Gsigma.set(byte2,0,IMAGE_HEIGHT*IMAGE_WIDTH);
w.set(byte3,0,IMAGE_HEIGHT*IMAGE_WIDTH);
}
byte1 = Gsigma.getBytes();
for(int i=0;i<IMAGE_HEIGHT*IMAGE_WIDTH;i++)
{
bytes[i]=byte1[i];
}
byte1 = Gsigma.getBytes();
for(int i=0;i<IMAGE_HEIGHT*IMAGE_WIDTH;i++)
{
bytes[IMAGE_HEIGHT*IMAGE_WIDTH+i]=byte1[i];
}
byte1 = w.getBytes();
for(int i=0;i<IMAGE_HEIGHT*IMAGE_WIDTH;i++)
{
bytes[2*IMAGE_HEIGHT*IMAGE_WIDTH+i]=byte1[i];
}
emit.set(bytes,0,3*IMAGE_HEIGHT*IMAGE_WIDTH);
}
#Override
public void close(){
try{
goutput.collect(l, emit);
}
catch(Exception e){
e.printStackTrace();
System.exit(-1);
}
}
}
//end of first job , this is running perfectly
/**
 * Runs the two map-only jobs in sequence.
 *
 * @param args input path, output path of the first job, output path of the second job
 * @throws URISyntaxException kept for signature compatibility
 */
public static void main(String[] args) throws URISyntaxException {
    if (args.length != 3) {
        // Three arguments are required; the old message listed only two.
        System.err.println("Usage: blurvideo input output output2");
        System.exit(-1);
    }

    // First job: BlurMapper over the input sequence files.
    // JobClient.runJob is static, so no JobClient instance is needed.
    JobConf conf = new JobConf(blur2.class);
    conf.setOutputValueClass(BytesWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(conf, new Path(args[0]));
    TextOutputFormat.setOutputPath(conf, new Path(args[1]));
    conf.setMapperClass(BlurMapper.class);
    conf.setNumReduceTasks(0);
    try {
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Second job: BlurMapper2 over the same input, with cache files attached.
    JobConf conf2 = new JobConf(blur2.class);
    conf2.setOutputValueClass(BytesWritable.class);
    conf2.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(conf2, new Path(args[0]));
    SequenceFileOutputFormat.setOutputPath(conf2, new Path(args[2]));
    conf2.setMapperClass(BlurMapper2.class);
    conf2.setNumReduceTasks(0);
    try {
        // "~" is expanded by the shell, never by Java or Hadoop; the literal
        // path "~/ayush/output/..." caused the FileNotFoundException.
        // Resolve the files against the file system's home directory instead.
        addHomeRelativeCacheFile("ayush/output/part-00000", conf2);
        addHomeRelativeCacheFile("ayush/output/part-00001", conf2);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    try {
        JobClient.runJob(conf2);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/** Registers a cache file given relative to the home directory of the job's default file system. */
private static void addHomeRelativeCacheFile(String relativePath, JobConf jobConf) throws IOException {
    FileSystem fileSystem = FileSystem.get(jobConf);
    Path resolved = new Path(fileSystem.getHomeDirectory(), relativePath);
    DistributedCache.addCacheFile(resolved.toUri(), jobConf);
}
public static class BlurMapper2 extends MapReduceBase implements Mapper<Text, BytesWritable, LongWritable, BytesWritable>
{
int IMAGE_HEIGHT = 240;
int T =60;
int IMAGE_WIDTH = 320;
public BytesWritable Gmiu;
public BytesWritable Gsigma;
public BytesWritable w;
byte[] bytes = new byte[IMAGE_HEIGHT*IMAGE_WIDTH];
public BytesWritable emit = new BytesWritable(bytes);
int initVar = 125;int gg=0;
int K=64;int k=0,k1=0,k2=0;
public LongWritable l = new LongWritable(1);
byte[] Gmiu1 = new byte[IMAGE_HEIGHT*IMAGE_WIDTH*K];
byte[] Gsigma1 = new byte[IMAGE_HEIGHT*IMAGE_WIDTH*K];
byte[] w1 = new byte[IMAGE_HEIGHT*IMAGE_WIDTH*K];
public Path[] localFiles=new Path[2];
private FileSystem fs;
#Override
public void configure(JobConf conf2)
{
try {
fs = FileSystem.getLocal(new Configuration());
localFiles = DistributedCache.getLocalCacheFiles(conf2);
//System.out.println(localFiles[0].getName());
} catch (IOException ex) {
Logger.getLogger(blur2.class.getName()).log(Level.SEVERE, null, ex);
}
}
public void map(Text key, BytesWritable file,OutputCollector<LongWritable, BytesWritable> output, Reporter reporter) throws IOException
{
if(gg==0){
//System.out.println(localFiles[0].getName());
String wrd; String line;
for(Path f:localFiles)
{
if(!f.getName().endsWith("crc"))
{
// FSDataInputStream localFile = fs.open(f);
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(fs.open(f)));
int c = 0;
try {
while ((line = br.readLine()) != null) {
StringTokenizer itr = new StringTokenizer(line, " ");
while (itr.hasMoreTokens()) {
wrd = itr.nextToken();
c++;
int i = Integer.parseInt(wrd, 16);
Integer I = new Integer(i);
byte b = I.byteValue();
if (c < IMAGE_HEIGHT * IMAGE_WIDTH) {
Gmiu1[k] = b;k++;
} else {
if ((c >= IMAGE_HEIGHT * IMAGE_WIDTH) && (c < 2 * IMAGE_HEIGHT * IMAGE_WIDTH)) {
Gsigma1[k] = b;k1++;
} else {
w1[k] = b;k2++;
}
}
}
}
} catch (IOException ex) {
Logger.getLogger(blur2.class.getName()).log(Level.SEVERE, null, ex);
}
} catch (FileNotFoundException ex) {
Logger.getLogger(blur2.class.getName()).log(Level.SEVERE, null, ex);
} finally {
try {
br.close();
} catch (IOException ex) {
Logger.getLogger(blur2.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
}
gg++;
}
}
}
}
I have struggled a lot with this; can anyone please tell me why I am getting this error:
java.io.FileNotFoundException: File does not exist: ~/ayush/output/part-00000
at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:394)
at org.apache.hadoop.filecache.DistributedCache.getTimestamp(DistributedCache.java:475)
at org.apache.hadoop.mapred.JobClient.configureCommandLineOptions(JobClient.java:676)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:774)
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1127)
at blur2.main(blur2.java:175)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.util.RunJar.main(RunJar.java:165)
at org.apache.hadoop.mapred.JobShell.run(JobShell.java:54)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
at org.apache.hadoop.mapred.JobShell.main(JobShell.java:68)
The problem is that the filename you are using, "~/ayush/output/part-00000", relies on Unix shell (sh, bash, ksh) tilde expansion to replace the "~" with the pathname of your home directory.
Java (and C, and C++, and most other programming languages) don't do tilde expansion. You need to provide the pathname as "/home/ayush/output/part-00000" ... or whatever absolute pathname it is that the tilded form expands to.
Strictly speaking, the URI should be created as follows:
new File("/home/ayush/output/part-00000").toURI()
not as
new URI("/home/ayush/output/part-00000")
The latter creates a URI without a "protocol", and that could be problematic.

Categories

Resources