MapReduce: String index out of Range - java

I have a problem with my MapReduce code. It says that my string index is out of range, but the string is definitely long enough.
Does anyone have any suggestions? Thanks!
This is my code:
package Test;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class TestMapper extends Mapper<Text,Text,IntWritable,IntWritable> {
private IntWritable date_int = null;
private IntWritable amount_int = null;
public void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
String date_str = value.toString().substring(4,5);
String amount_str = value.toString().substring(7,8);
date_int = new IntWritable(Integer.parseInt(date_str));
amount_int = new IntWritable(Integer.parseInt(amount_str));
// Collect the results
context.write(date_int, amount_int);
}
}
package Test;
import java.io.IOException;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
//import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
//import org.apache.hadoop.mapreduce.Reducer.Context;
public class TestReducer extends Reducer<IntWritable, IntWritable,
IntWritable, FloatWritable> {
public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
float sum = 0;
int count = 0;
for (IntWritable val : values) { sum +=val.get();
count +=1;
}
float result = sum / count;
context.write(key, new FloatWritable(result));
}
}
package Test;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class TestDriver extends Configured implements Tool {
private final static Logger log = Logger.getLogger(TestDriver.class.getName());
public static void main(String[] args) {
int res = 1; // if this stays 1, the job did not finish correctly
try {
res = ToolRunner.run(new Configuration(), new TestDriver(), args);
} catch (Exception e) {
log.log(Level.SEVERE, "Error while running the job!");
e.printStackTrace();
}
System.exit(res);
}
@Override
public int run(String[] args) {
log.log(Level.INFO, "Start Map-Reduce-Job 'TestDriver'... ");
Configuration conf = this.getConf();
Job job = null;
try {
job = Job.getInstance(conf);
} catch (IOException e1) {
log.log(Level.SEVERE, "Error while instantiating the job!");
e1.printStackTrace();
}
job.setJarByClass(TestDriver.class);
job.setMapperClass(TestMapper.class);
job.setReducerClass(TestReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(FloatWritable.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
try {
FileInputFormat.addInputPath(job, new Path(args[0]));
} catch (IllegalArgumentException e) {
log.log(Level.SEVERE, "Error (argument) while setting the input path!");
e.printStackTrace();
} catch (IOException e) {
log.log(Level.SEVERE, "Error (IO) while setting the input path!");
e.printStackTrace();
}
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean result = false;
try {
result = job.waitForCompletion(true);
} catch (ClassNotFoundException e) {
log.log(Level.SEVERE, "Error (ClassNotFound) while running the job!");
e.printStackTrace();
} catch (IOException e) {
log.log(Level.SEVERE, "Error (IOException) while running the job!");
e.printStackTrace();
} catch (InterruptedException e) {
log.log(Level.SEVERE, "Error (Interrupted) while running the job!");
e.printStackTrace();
}
log.log(Level.INFO, "Done!");
return result ? 0 : 1;
}
}
And this is the error message:
java.lang.Exception: java.lang.StringIndexOutOfBoundsException: String index out of range: 5
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:522)
My input file is a text file and it looks like this:
200912024
2009120420
2009120750
200912083
2009120912
2009121066
2009121170
2009121225
2009121430
2009121560
2009121621
2009121722
2009121818
2009122112
2009122213
Thanks!

That is because the input value passed to your mapper is empty.
You are using
job.setInputFormatClass( KeyValueTextInputFormat.class );
With KeyValueTextInputFormat, each line is divided into a key part and a value part by a separator byte. If no such byte exists, the key is the entire line and the value is empty.
Please see the KeyValueTextInputFormat class.
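For completeness, this is how the separator can be configured if you do want key/value splitting (a sketch; the property name is the Hadoop 2.x one, and the tab separator is only an assumption about how data could be laid out -- the input shown above contains no separator at all, so its values stay empty regardless):
// Illustration of how KeyValueTextInputFormat splits a line.
Configuration conf = new Configuration();
conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");
Job job = Job.getInstance(conf);
job.setInputFormatClass(KeyValueTextInputFormat.class);
// "20091202\t4"  -> key = Text("20091202"), value = Text("4")
// "200912024"    -> key = Text("200912024"), value = Text("")   (no separator byte)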
So, if you change your input format to the default:
job.setInputFormatClass( TextInputFormat.class );
And your mapper to:
import java.io.IOException;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
public class TestMapper extends
Mapper<LongWritable,Text,IntWritable,IntWritable> {
private IntWritable date_int = new IntWritable();
private IntWritable amount_int = new IntWritable();
/**
* @param key - Line offset - ignored.
* @param value - Value to process.
* @param context - MapperContext object for accessing output, configuration information, etc.
* @throws IOException, InterruptedException.
*/
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String date_str = value.toString().substring(4,5);
String amount_str = value.toString().substring(7,8);
int date = Integer.parseInt(date_str);
date_int.set(date);
int amount = Integer.parseInt(amount_str);
amount_int.set(amount);
// Collect the results
context.write(date_int, amount_int);
}
}
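As an extra safeguard (the index values below simply repeat the original substring calls, which embody an assumption about the record layout), the mapper could also skip records that are too short instead of throwing:
String line = value.toString();
if (line.length() < 8) {
    return;   // malformed or empty record -- skip instead of throwing StringIndexOutOfBoundsException
}
String date_str = line.substring(4, 5);
String amount_str = line.substring(7, 8);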
It should work.
Good luck!

Related

Can't access hashmap in mapper, MapReduce

I'd like to replace values of the input data in my mapper, using a dictionary (CSV) defined in another file. So I tried to put the CSV data into a HashMap and refer to it in the mapper.
The Java code and CSV below are a simplified version of my program. This code works in my local environment (Mac OS X, pseudo-distributed mode), but not on my EC2 instance (Ubuntu, pseudo-distributed mode).
In detail, I got this stdout during the run:
cat:4
human:2
flamingo:1
This means the file reader successfully put the CSV data into the HashMap.
However, the mapper mapped nothing and therefore I got empty output in the EC2 environment, although it mapped 3 * (the number of lines of the input file) elements and generated the following locally:
test,cat
test,flamingo
test,human
Does anyone have answers or hints?
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void get_list(String list_path){
try {
FileReader fr = new FileReader(list_path);
BufferedReader br = new BufferedReader(fr);
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
get_list(args[2]);
Job job = Job.getInstance(conf, "test");
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
list.csv (args[2])
name,legs
cat,4
human,2
flamingo,1
=================================
I referred to @Rahul Sharma's answer and modified my code as below. Now my code works in both environments.
Thank you very much @Rahul Sharma and @Serhiy for your precise answer and useful comments.
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.net.URI;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] files = context.getCacheFiles();
Path list_path = new Path(files[0]);
try {
FileSystem fs = list_path.getFileSystem(context.getConfiguration());
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(list_path)));
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
// Writer
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
Job job = Job.getInstance(conf, "test");
job.addCacheFile(new Path(args[2]).toUri());
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
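For reference, submitting the revised job could look roughly like this (the jar name and paths are placeholders; the three arguments match the check in main()):
hadoop jar test.jar Test /user/hadoop/input /user/hadoop/output /user/hadoop/list.csv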
First, you need to learn more about the MapReduce framework.
Your program behaves as expected in local mode because the Mapper, the Reducer, and the Job are all launched in the same JVM. In pseudo-distributed or fully distributed mode, a separate JVM is allocated for each component, so the values you put into the HashMap using get_list are not visible to the mapper and the reducer.
Use the distributed cache to make it work in cluster mode.
In the job's main class, add the file to the distributed cache:
JobConf job = new JobConf();
DistributedCache.addCacheArchive(new URI(args[2]), job);
Access the file in the mapper or reducer:
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.getLocal(conf);
Path[] dataFile = DistributedCache.getLocalCacheFiles(conf);
BufferedReader cacheReader = new BufferedReader(new InputStreamReader(fs.open(dataFile[0])));
// Implement the get_list method functionality here
}
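On newer Hadoop versions the same thing can be done without the deprecated DistributedCache class, which is essentially what the revised Test.java above does (the variable names here are just illustrative):
// In the driver, before submitting the job:
job.addCacheFile(new Path(args[2]).toUri());

// In the mapper:
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    URI[] cacheFiles = context.getCacheFiles();
    Path listPath = new Path(cacheFiles[0]);
    FileSystem fs = listPath.getFileSystem(context.getConfiguration());
    try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(listPath)))) {
        // parse the CSV lines here and fill the HashMap, as in get_list
        String line;
        while ((line = br.readLine()) != null) {
            // ...
        }
    }
}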

Hadoop - Reducer class in java

I am developing a Hadoop project in Java. I want to find the customers with the maximum consumption on a certain day. I have managed to find the customers for the date I want, but I am facing a problem in my Reducer class. Here is the code:
Mapper Class
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class alicanteMapperC extends
Mapper<LongWritable, Text, Text, IntWritable> {
String Customer = new String();
SimpleDateFormat ft = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date t = new Date();
IntWritable Consumption = new IntWritable();
int counter = 0;
//new vars
int max=0;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Date d2 = null;
try {
d2 = ft.parse("2013-07-01 01:00:00");
} catch (ParseException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
if (counter > 0) {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line, ",");
while (itr.hasMoreTokens()) {
Customer = itr.nextToken();
try {
t = ft.parse(itr.nextToken());
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Consumption.set(Integer.parseInt(itr.nextToken()));
}
if (t.compareTo(d2) == 0) {
context.write(new Text(Customer), Consumption);
}
}
counter++;
}
}
Reducer class
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class alicanteReducerC extends
Reducer<Text, IntWritable, Text, IntWritable> {
IntWritable maximum = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int max = 0;
for (IntWritable val : values) {
if (val.get() > max) {
max = val.get();
}
}
for (IntWritable val : values) {
if (val.get() == max) {
context.write(key, val);
}
}
}
}
Do you have any idea why the reducer won't write to the output file? In other words, why doesn't the second for loop work?
EDIT
In my mapper class I find the customers for a specific date, and thus their consumption, and I pass these values to the reducer class.
In the reducer class I want to find the maximum consumption and the customer associated with it.
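One thing worth knowing here (not stated in the question, but it matches the symptom): in the new MapReduce API the values Iterable can only be traversed once, so the second loop sees nothing. A minimal sketch that buffers the values on the first pass, assuming the values for one key fit in memory:
// requires java.util.ArrayList and java.util.List
public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    List<Integer> buffered = new ArrayList<Integer>();
    int max = 0;
    for (IntWritable val : values) {      // first and only pass over the Hadoop iterable
        buffered.add(val.get());
        if (val.get() > max) {
            max = val.get();
        }
    }
    for (int v : buffered) {              // second pass over the in-memory copy
        if (v == max) {
            context.write(key, new IntWritable(v));
        }
    }
}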

MapReduce Hadoop 2.4.1 Reducer not running

For some reason my Reducer doesn't seem to be running.
My Driver is
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class PageRank {
public static void main(String[] args) throws Exception {
PageRank pageRanking = new PageRank();
//In and Out dirs in HDFS
pageRanking.runXmlParsing(args[0], args[1]);
System.out.println("finished");
}
public void runXmlParsing(String inputPath, String outputPath) throws IOException {
Configuration conf = new Configuration();
conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
Job job1 = Job.getInstance(conf);
job1.setJarByClass(PageRank.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
// Our class to parse links from content.
job1.setMapperClass(WikiPageXMLMapper.class);
job1.setReducerClass(WikiLinksReducer.class);
job1.setInputFormatClass(XmlInputFormat.class);
job1.setOutputFormatClass(TextOutputFormat.class);
// Remove output if already exists
FileSystem.getLocal(conf).delete(new Path(outputPath), true);
FileInputFormat.setInputPaths(job1, new Path(inputPath));
FileOutputFormat.setOutputPath(job1, new Path(outputPath));
System.out.println("BEFORE RUN");
try {
job1.waitForCompletion(true);
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void deleteDir(File dir) {
File[] files = dir.listFiles();
for (File myFile: files) {
if (myFile.isDirectory()) {
deleteDir(myFile);
}
myFile.delete();
}
}
}
My Mapper is
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
public class WikiPageXMLMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
public void map(LongWritable key, Text value, Context output) throws IOException {
String[] titleAndText = parseTitleAndText(value.toString());
String pageString = titleAndText[0];
Text page = new Text(pageString.replace(' ', '_'));
String[] parts = titleAndText[1].split("\\[\\[");
String pages = "!##$ ";
for (int i = 1; i < parts.length; i++) {
int lastIndexBrackets = parts[i].lastIndexOf("]]");
// This checks and skips the first part of the outer link
if (lastIndexBrackets == -1)
continue;
String insideLinkPlusExtra = parts[i].substring(0, lastIndexBrackets);
int multipleClosingBrackets = insideLinkPlusExtra.indexOf("]]");
String otherPage = insideLinkPlusExtra;
if (multipleClosingBrackets != -1) {
otherPage = insideLinkPlusExtra.substring(0, multipleClosingBrackets);
}
otherPage = otherPage.split("\\|")[0];
otherPage = checkForDuplicates(otherPage, pages);
otherPage = (otherPage.indexOf(":") == -1) ? otherPage : "";
otherPage = (otherPage.indexOf("#") == -1) ? otherPage : "";
otherPage = checkForSubpageLinks(otherPage);
otherPage = checkForRedLink(otherPage);
if (otherPage == "")
continue;
Text oP = new Text(otherPage.replace(' ', '_'));
pages += oP + " ";
// taking each outlink and making it its own key (ingraph)
try {
output.write(new Text(oP), new Text(page));
} catch (InterruptedException e) {
e.printStackTrace();
}
}
// Designate this page as not a redlink
try {
output.write(new Text(page), new Text("!##$"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return;
}
}
And my Reducer is:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException {
System.out.println("REDUCER");
String links = "";
boolean isNotRedLink = false;
System.out.println("Starting reduce");
// Brett concern (and zach's): if n pages link to a redlink
// we will iterate n times and it could be wasteful
while(values.hasNext()){
String v = values.next().toString();
// Check first outlink is not #, if so, it is a redlink
if (v.equals("!##$")) {
isNotRedLink = true;
continue;
} else {
links += v;
continue;
}
}
// If the key is not a redlink, send it to the output
if (isNotRedLink) {
try {
output.write(key, new Text(links));
output.write(key, new Text("TESTING!"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(links);
} else {
System.out.println(output);
try {
output.write(key, new Text("BLEG"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(key + " IS A RED LINK");
return;
}
}
}
None of the System.out.println(...)s in my Reducer output to the console, and my output from the program (the file it leaves on my HDD) only has the results from the Mapper.
I feel silly. I tried Iterable instead of Iterator in the line public void reduce(Text key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException { and my issue was solved.
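In other words, with Iterator the method does not override Reducer.reduce, so the default (identity) reduce runs and only the mapper output appears. A sketch of the matching signature; adding @Override makes the compiler catch this kind of mismatch:
@Override
public void reduce(Text key, Iterable<Text> values, Context output)
        throws IOException, InterruptedException {
    for (Text value : values) {
        // ... existing redlink/links logic ...
    }
}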

Parsing of Stackoverflow's posts.xml on Hadoop

I am following this article by Anoop Madhusudanan on CodeProject to build a recommendation engine, not on a cluster but on my own system.
The problem is when I try to parse posts.xml, whose structure is as follows:
<row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="<blockquote>
<p>The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. </p>
</blockquote>
<p>I obtained this answer from <a href="http://www.informit.com/guides/content.aspx?g=cplusplus&amp;seqNum=272" rel="nofollow">High Resolution Time Measurement and Timers, Part I</a></p>" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />
Now I need to parse this file (size 1.4 GB) on Hadoop, for which I have written code in Java and created its jar.
The Java class is as follows:
import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
public class Recommend {
static class Map extends Mapper<Text, Text, Text, Text> {
Path path;
String fXmlFile;
DocumentBuilderFactory dbFactory;
DocumentBuilder dBuilder;
Document doc;
/**
* Parse the XML in the input value and emit (OwnerUserId, "OwnerUserId ParentId") pairs.
*/
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
try{
fXmlFile=value.toString();
dbFactory = DocumentBuilderFactory.newInstance();
dBuilder= dbFactory.newDocumentBuilder();
doc= dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList nList = doc.getElementsByTagName("row");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
Element eElement = (Element) nNode;
Text keyWords =new Text(eElement.getAttribute("OwnerUserId"));
Text valueWords = new Text(eElement.getAttribute("ParentId"));
String val=keyWords.toString()+" "+valueWords.toString();
// Write the sentence
if(keyWords != null && valueWords != null){
output.collect(keyWords, new Text(val));
}
}
}catch (Exception e) {
e.printStackTrace();
}
}
}
/**
*
* @throws IOException
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (args.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}*/
// FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Recommend");
job.setJarByClass(Recommend.class);
// the keys are words (strings)
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
// the values are counts (ints)
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
//conf.setReducerClass(Reduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
Path outPath = new Path(args[1]);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}
}
}
I expect the output to be a file in Hadoop containing OwnerUserId ParentId pairs,
but instead I get output like:
1599788 <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="<p>The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.</p>
<p>Simply writing your improved code and then giving it to them may result in your code being rejected.</p>" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />
I don't know where the 1599788 that appears as a key value from the mapper comes from.
I don't know much about writing mapper classes for Hadoop; I need help modifying my code to get the desired output.
Thanks in advance.
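A likely explanation for the 1599788 key, inferred from the posted code rather than stated anywhere in the thread: the map method takes OutputCollector and Reporter (old-API parameters), so it never overrides the new-API Mapper.map(key, value, context). Hadoop then runs the default identity map, which with TextInputFormat emits the byte offset of the line (for example 1599788) as the key and the raw XML line as the value. A sketch of a signature that actually overrides:
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // parse value.toString() here and emit context.write(ownerUserId, parentId)
    }
}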
After a lot of research and experiments, I finally learned how to write a mapper for parsing XML files with the syntax I provided. I changed my approach, and this is my new mapper code... It's working for my use case.
Hope it helps someone and saves them some time :)
import java.io.IOException;
import java.util.StringTokenizer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.xml.sax.SAXException;
public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
NullWritable obj;
@Override
public void map(LongWritable key, Text value, Context context) throws InterruptedException {
StringTokenizer tok= new StringTokenizer(value.toString());
String pa=null,ow=null,pi=null,v;
while (tok.hasMoreTokens()) {
String[] arr;
String val = (String) tok.nextToken();
if(val.contains("PostTypeId")){
arr= val.split("[\"]");
pi=arr[arr.length-1];
if(pi.equals("2")){
continue;
}
else break;
}
if(val.contains("ParentId")){
arr= val.split("[\"]");
pa=arr[arr.length-1];
}
else if(val.contains("OwnerUserId") ){
arr= val.split("[\"]");
ow=arr[arr.length-1];
try {
if(pa!=null && ow != null){
v = String.format("%s,%s", ow, pa);
context.write(obj,new Text(v));
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
}
Here is the mapper I've written to parse the SO posts XML and create tab-separated file(s) on Hadoop, to be used by other MapReduce jobs, Hive, or Pig.
Mapper
package com.aravind.learning.hadoop.mapred.techtalks;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
public class StackoverflowDataWranglerMapper extends Mapper<LongWritable, Text, Text, Text>
{
static enum BadRecordCounters
{
NO_CREATION_DATE, UNKNOWN_USER_ID, UNPARSEABLE_RECORD, UNTAGGED_POSTS
}
private final Text outputKey = new Text();
private final Text outputValue = new Text();
private final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder;
private static final Joiner TAG_JOINER = Joiner.on(",").skipNulls();
// 2008-07-31T21:42:52.667
private static final DateFormat DATE_PARSER = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
private static final SimpleDateFormat DATE_BUILDER = new SimpleDateFormat("yyyy-MM-dd");
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
try
{
builder = factory.newDocumentBuilder();
}
catch (ParserConfigurationException e)
{
throw new IOException(e);
}
}
@Override
protected void map(LongWritable inputKey, Text inputValue, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException
{
try
{
String entry = inputValue.toString();
if (entry.contains("<row "))
{
Document doc = builder.parse(new InputSource(new StringReader(entry)));
Element rootElem = doc.getDocumentElement();
String id = rootElem.getAttribute("Id");
String postedBy = rootElem.getAttribute("OwnerUserId").trim();
String viewCount = rootElem.getAttribute("ViewCount");
String postTypeId = rootElem.getAttribute("PostTypeId");
String score = rootElem.getAttribute("Score");
String title = rootElem.getAttribute("Title");
String tags = rootElem.getAttribute("Tags");
String answerCount = rootElem.getAttribute("AnswerCount");
String commentCount = rootElem.getAttribute("CommentCount");
String favoriteCount = rootElem.getAttribute("FavoriteCount");
String creationDate = rootElem.getAttribute("CreationDate");
Date parsedDate = null;
if (creationDate != null && creationDate.trim().length() > 0)
{
try
{
parsedDate = DATE_PARSER.parse(creationDate);
}
catch (ParseException e)
{
context.getCounter("Bad Record Counters", "Posts missing CreationDate").increment(1);
}
}
if (postedBy.length() == 0 || postedBy.trim().equals("-1"))
{
context.getCounter("Bad Record Counters", "Posts with either empty UserId or UserId contains '-1'")
.increment(1);
try
{
parsedDate = DATE_BUILDER.parse("2100-00-01");
}
catch (ParseException e)
{
// ignore
}
}
tags = tags.trim();
String tagTokens[] = null;
if (tags.length() > 1)
{
tagTokens = tags.substring(1, tags.length() - 1).split("><");
}
else
{
context.getCounter("Bad Record Counters", "Untagged Posts").increment(1);
}
outputKey.clear();
outputKey.set(id);
StringBuilder sb = new StringBuilder(postedBy).append("\t").append(parsedDate.getTime()).append("\t")
.append(postTypeId).append("\t").append(title).append("\t").append(viewCount).append("\t").append(score)
.append("\t");
if (tagTokens != null)
{
sb.append(TAG_JOINER.join(tagTokens)).append("\t");
}
else
{
sb.append("").append("\t");
}
sb.append(answerCount).append("\t").append(commentCount).append("\t").append(favoriteCount).toString();
outputValue.set(sb.toString());
context.write(outputKey, outputValue);
}
}
catch (SAXException e)
{
context.getCounter("Bad Record Counters", "Unparsable records").increment(1);
}
finally
{
builder.reset();
}
}
}
Driver
public class StackoverflowDataWranglerDriver extends Configured implements Tool
{
@Override
public int run(String[] args) throws Exception
{
if (args.length != 2)
{
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf());
job.setJobName("Tech Talks - Stackoverflow Forum Posts - Data Wrangler");
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setJarByClass(StackoverflowDataWranglerMapper.class);// required for mr1
job.setMapperClass(StackoverflowDataWranglerMapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String args[]) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new StackoverflowDataWranglerDriver(), args);
System.exit(exitCode);
}
}
Job submit command
hadoop jar ./hadoop-examples-0.0.1-SNAPSHOT.jar com.aravind.learning.hadoop.mapred.techtalks.StackoverflowDataWranglerDriver data/stackoverflow-posts.xml data/so-posts-tsv

java.lang.reflect.InvocationTargetException in Reduce DataJoin ( Hadoop In Action )

I am having a problem running the DataJoin example from Hadoop in Action. It seems that while running the job, a java.lang.reflect.InvocationTargetException was thrown. I have tried for a day and it still doesn't work. What did I do wrong?
Below is the exception
12/12/30 01:54:06 INFO mapred.JobClient: Task Id : attempt_201212280853_0032_m_000000_2, Status : FAILED
java.lang.RuntimeException: Error in configuring object
at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:106)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:72)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:130)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:389)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:327)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1332)
at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
Below is the code taken from Hadoop in Action. I have customers.txt and orders.txt in my input directory. I tried renaming these two files to part-0000.txt and part-0001.txt, and it still doesn't work.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
/**
* DataJoinMapperBase has a method - http://hadoop.apache.org/docs/mapreduce/r0.21.0/api/org/apache/hadoop/contrib/utils/join/DataJoinMapperBase.html
*
* map(Object key, Object value, OutputCollector output, Reporter reporter)
*
*/
public class DataJoin extends Configured implements Tool {
public static class MapClass extends DataJoinMapperBase {
protected Text generateInputTag(String inputFile) {
String datasource = inputFile.split("-")[0];
return new Text(datasource);
}
protected Text generateGroupKey(TaggedMapOutput aRecord) {
String line = ((Text) aRecord.getData()).toString();
String[] tokens = line.split(",");
String groupKey = tokens[0];
return new Text(groupKey);
}
protected TaggedMapOutput generateTaggedMapOutput(Object value) {
TaggedWritable retv = new TaggedWritable();
retv.setData((Text) value);
retv.setTag(this.inputTag);
return retv;
}
}
public static class Reduce extends DataJoinReducerBase {
protected TaggedMapOutput combine(Object[] tags, Object[] values) {
if (tags.length < 2) return null;
String joinedStr = "";
for (int i=0; i<values.length; i++) {
if (i > 0) joinedStr += ",";
TaggedWritable tw = (TaggedWritable) values[i];
String line = ((Text) tw.getData()).toString();
String[] tokens = line.split(",", 2);
joinedStr += tokens[1];
}
TaggedWritable retv = new TaggedWritable();
retv.setData(new Text(joinedStr));
retv.setTag((Text) tags[0]);
return retv;
}
}
public static class TaggedWritable extends TaggedMapOutput {
private Writable data;
// public TaggedWritable(Writable data) {
// this.tag = new Text("");
// this.data = data;
// }
public TaggedWritable() {
this.tag = new Text();
}
public Writable getData() {
return data;
}
public void setData(Writable data) {
this.data = data;
}
public void write(DataOutput out) throws IOException {
this.tag.write(out);
this.data.write(out);
}
public void readFields(DataInput in) throws IOException {
this.tag.readFields(in);
String dataClz = in.readUTF();
if (this.data == null
|| !this.data.getClass().getName().equals(dataClz)) {
try {
this.data = (Writable) ReflectionUtils.newInstance(
Class.forName(dataClz), null);
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
this.data.readFields(in);
}
// public void readFields(DataInput in) throws IOException {
// this.tag.readFields(in);
// this.data.readFields(in);
// }
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, DataJoin.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("DataJoin");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TaggedWritable.class);
job.set("mapred.textoutputformat.separator", ",");
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(),
new DataJoin(),
args);
System.exit(res);
}
}
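Independent of the stack trace, one thing that stands out in the posted TaggedWritable: write() serializes only the tag and the data, while readFields() additionally expects a class name via in.readUTF() before the data, so the two are not symmetric. A sketch of a matching pair (an illustration, not the book's original code):
public void write(DataOutput out) throws IOException {
    this.tag.write(out);
    out.writeUTF(this.data.getClass().getName());   // written so readFields() can read it back
    this.data.write(out);
}

public void readFields(DataInput in) throws IOException {
    this.tag.readFields(in);
    String dataClz = in.readUTF();
    try {
        if (this.data == null || !this.data.getClass().getName().equals(dataClz)) {
            this.data = (Writable) ReflectionUtils.newInstance(Class.forName(dataClz), null);
        }
    } catch (ClassNotFoundException e) {
        throw new IOException("Unknown data class: " + dataClz, e);
    }
    this.data.readFields(in);
}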
I hit a similar problem, and java.lang.reflect.InvocationTargetException was the root cause... what did I do wrong?
12/12/30 01:54:06 INFO mapred.JobClient: Task Id : attempt_201212280853_0032_m_000000_2, Status : FAILED
java.lang.RuntimeException: Error in configuring object
at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:106)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:72)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:130)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:389)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:327)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1332)
at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
Below is the code taken from Hadoop in Action:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
/**
* DataJoinMapperBase has a method - http://hadoop.apache.org/docs/mapreduce/r0.21.0/api/org/apache/hadoop/contrib/utils/join/DataJoinMapperBase.html
*
* map(Object key, Object value, OutputCollector output, Reporter reporter)
*
*/
public class DataJoin extends Configured implements Tool {
public static class MapClass extends DataJoinMapperBase {
protected Text generateInputTag(String inputFile) {
String datasource = inputFile.split("-")[0];
return new Text(datasource);
}
protected Text generateGroupKey(TaggedMapOutput aRecord) {
String line = ((Text) aRecord.getData()).toString();
String[] tokens = line.split(",");
String groupKey = tokens[0];
return new Text(groupKey);
}
protected TaggedMapOutput generateTaggedMapOutput(Object value) {
TaggedWritable retv = new TaggedWritable();
retv.setData((Text) value);
retv.setTag(this.inputTag);
return retv;
}
}
public static class Reduce extends DataJoinReducerBase {
protected TaggedMapOutput combine(Object[] tags, Object[] values) {
if (tags.length < 2) return null;
String joinedStr = "";
for (int i=0; i<values.length; i++) {
if (i > 0) joinedStr += ",";
TaggedWritable tw = (TaggedWritable) values[i];
String line = ((Text) tw.getData()).toString();
String[] tokens = line.split(",", 2);
joinedStr += tokens[1];
}
TaggedWritable retv = new TaggedWritable();
retv.setData(new Text(joinedStr));
retv.setTag((Text) tags[0]);
return retv;
}
}
public static class TaggedWritable extends TaggedMapOutput {
private Writable data;
// public TaggedWritable(Writable data) {
// this.tag = new Text("");
// this.data = data;
// }
public TaggedWritable() {
this.tag = new Text();
}
public Writable getData() {
return data;
}
public void setData(Writable data) {
this.data = data;
}
public void write(DataOutput out) throws IOException {
this.tag.write(out);
this.data.write(out);
}
/***
// public void readFields(DataInput in) throws IOException {
// this.tag.readFields(in);
// String dataClz = in.readUTF();
// if (this.data == null
// || !this.data.getClass().getName().equals(dataClz)) {
// try {
// this.data = (Writable) ReflectionUtils.newInstance(
// Class.forName(dataClz), null);
// } catch (ClassNotFoundException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
//this.data.readFields(in);
//}
*****/
public void readFields(DataInput in) throws IOException {
this.tag.readFields(in);
this.data.readFields(in);
}
}
public int run(String[] args) throws Exception {
Configuration conf = getConf();
JobConf job = new JobConf(conf, DataJoin.class);
Path in = new Path(args[0]);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
job.setJobName("DataJoin");
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormat(TextInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TaggedWritable.class);
job.set("mapred.textoutputformat.separator", ",");
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(),
new DataJoin(),
args);
System.exit(res);
}
}
