Can't access hashmap in mapper, MapReduce - java

I'd like to replace values of the input data in my mapper, using dictionaries (CSV) defined in another file. So I tried to load the CSV data into a HashMap and refer to it in the mapper.
The Java code and CSV below are a simplified version of my program. This code works in my local environment (Mac OS X, pseudo-distributed mode), but doesn't work on my EC2 instance (Ubuntu, pseudo-distributed mode).
In detail, I got this stdout in process:
cat:4
human:2
flamingo:1
This means the file reader successfully put the CSV data into the HashMap.
However, the mapper emitted nothing, so I got empty output in the EC2 environment, whereas in the local environment it emitted 3 * (the number of lines in the input file) records and generated the following:
test,cat
test,flamingo
test,human
Does anyone have answers or hints?
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
#Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
#Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
#Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void get_list(String list_path){
try {
FileReader fr = new FileReader(list_path);
BufferedReader br = new BufferedReader(fr);
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
get_list(args[2]);
Job job = Job.getInstance(conf, "test");
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formtter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
list.csv (args[2])
name,legs
cat,4
human,2
flamingo,1
=================================
I referred to @Rahul Sharma's answer and modified my code as below. Now my code works in both environments.
Thank you very much, @Rahul Sharma and @Serhiy, for your precise answer and useful comments.
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.net.URI;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
#Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] files = context.getCacheFiles();
Path list_path = new Path(files[0]);
try {
FileSystem fs = list_path.getFileSystem(context.getConfiguration());
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(list_path)));
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
#Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
#Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
// Writer
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
#Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
Job job = Job.getInstance(conf, "test");
job.addCacheFile(new Path(args[2]).toUri());
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formtter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}

First, you need to learn more about the MapReduce framework.
Your program behaves as expected in local mode because the mapper, reducer and job are launched in the same JVM. In pseudo-distributed or fully distributed mode, a separate JVM is allocated for each component. The values you put into the HashMap using get_list are not visible to the mapper and reducer because they run in separate JVMs.
Use the distributed cache to make it work in cluster mode.
In the job's main class, add the file to the distributed cache:
JobConf job = new JobConf();
// Register the list file as a plain cache *file* (not an archive) so that it is the entry
// returned by DistributedCache.getLocalCacheFiles(conf) in the task's setup().
DistributedCache.addCacheFile(new URI(args[2]), job);
Access file in mapper or reducer:
/**
 * Opens the first file of the distributed cache from the task-local file system so the
 * dictionary can be loaded before any record is mapped.
 *
 * @throws IOException if the cache holds no file or the file cannot be opened
 */
public void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path[] dataFile = DistributedCache.getLocalCacheFiles(conf);
    if (dataFile == null || dataFile.length == 0) {
        throw new IOException("No file found in the distributed cache");
    }
    // try-with-resources guarantees the reader is closed once the file has been consumed
    try (BufferedReader cacheReader =
            new BufferedReader(new InputStreamReader(fs.open(dataFile[0])))) {
        // Implement here get_list method functionality
    }
}

Related

How can I compile Java source code?

I try to build a project that find the maximum of Average temperature of each month. Here is my code:
File Map.java
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Map extends Mapper<LongWritable, Text, Text, FloatWritable> {
private FloatWritable average = new FloatWritable();
private float maxFloat, minFloat, averageFloat;
private Text word = new Text();
#Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer line = new StringTokenizer(value.toString(), ",");
if (line.countTokens() > 0) {
word.set(line.nextToken().substring(2,8));
if (line.hasMoreTokens()) {
maxFloat = Float.parseFloat(line.nextToken());
}
if (line.hasMoreTokens()) {
minFloat = Float.parseFloat(line.nextToken());
}
averageFloat = (minFloat + maxFloat) / 2;
average.set(averageFloat);
context.write(word, average);
}
}
}
File Reduce.java
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class Reduce extends Reducer<Text, FloatWritable, Text, FloatWritable> {
private float max_temp = Float.MIN_VALUE;
private float temp = 0;
#Override
protected void reduce(Text key, Iterable<FloatWritable> values, Context context)
throws IOException, InterruptedException {
Iterator<FloatWritable> itr = values.iterator();
while (itr.hasNext()) {
temp = itr.next().get();
if (temp > max_temp) {
max_temp = temp;
}
}
context.write(key, new FloatWritable(max_temp));
}
}
File MaxTempDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/** Driver for the max-average-temperature job. */
public class MaxTempDriver {
    /**
     * Configures and runs the job, exiting with 0 on success and 1 on failure.
     *
     * @param args {@code <input path> <output path>}
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: MaxTempDriver <input path> <output path>");
            System.exit(2);
        }
        // Job.getInstance() replaces the deprecated Job constructor
        Job job = Job.getInstance();
        // The jar containing this class is shipped to the cluster
        job.setJarByClass(MaxTempDriver.class);
        job.setJobName("Max Temperature");
        // Set input and output Path, note that we use the default input format
        // which is TextInputFormat (each record is a line of input)
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Set Mapper and Reducer class; Reduce is also used as the combiner
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        // Set Output key and value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Now I don't know how to compile these 3 files. I have read some tutorials on the internet, but they all seem to have only one file containing both the map and reduce classes. How do I compile these files?

javac do not generate any Error or jar file in hadoop

I am studying HDFS, so I am testing a simple MapReduce program that computes the average of the "overall" field in a JSON file.
Json form's sample is in here(click me)
GetAverage.java
package some.package.path.here.bigdata;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.json.*;
public class GetAverage extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
int res = ToolRunner.run(new Configuration(), new GetAverage(), args);
System.exit(res);
}
#Override
public int run(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
Job job = Job.getInstance(getConf());
job.setJarByClass(GetAverage.class);
job.setOutputKeyClass(Text.class);
//Overall value type is float.
job.setOutputValueClass(FloatWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
return 0;
}
public static class Map extends Mapper<LongWritable, Text, Text, FloatWritable> {
private Text asin_value = new Text();
private FloatWritable overall_value = new FloatWritable();
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String asin;
float overall;
String line = value.toString();
String[] tuple = line.split("\\n");
try{
for(int i=0; i<tuple.length; i++){
JSONObject obj = new JSONObject(tuple[i]);
System.out.println(obj);
asin = obj.getString("asin");
overall = Float.parseFloat(obj.getString("overall"));
asin_value.set(asin);
overall_value.set(overall);
context.write(asin_value, overall_value);
}
}catch(JSONException e){
e.printStackTrace();
}
}
}
public static class Reduce extends Reducer<Text, FloatWritable, Text, FloatWritable> {
#Override
public void reduce(Text key, Iterable<FloatWritable> values, Context context)
throws IOException, InterruptedException {
float sum = 0;
int length = 0;
for (FloatWritable val : values) {
sum += val.get();
length++;
}
float average = sum / length;
context.write(key, new FloatWritable(average));
}
}
}
When I compile this, the compiler does not show any message and does not generate any jar file.
I compile this Java file with the command below.
cat ./compile_command
javac -classpath /usr/local/hadoop/share/hadoop/common/hadoop-common-2.8.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.8.0.jar:../org.json.jar -d getaverage_classes/ GetAverage.java
And my directory structure is...
~/test |- GetAverage.java
|- compile_command
|- getaverage_classes - some/package/path/here/bigdata |-GetAverage$Map.class
|- GetAverage$Reduce.class
|- GetAverage.class
Please answer my question anyone knows this problem. :)

ClassNotFound Exception while running MapReduce program

I am writing a mapreduce program for matrix addition. Since it requires 2 input files, i am using MultipleInputs. I have these following classes
MatAddMapper1.java
package mapred;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MatAddMapper1 extends Mapper<LongWritable, Text, Text, IntWritable>
{
//private static final int MISSING = 9999;
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String line = value.toString();
String[] content = line.split (" ");
String key1 = content[0] + " " + content[1];
int val = Integer.parseInt(content[2]);
// Key is (i,j)
context.write(new Text(key1), new IntWritable(val));
}
}
MatAddMapper2.java is similar.
MatAddReducer.java
package mapred;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MatAddReducer
extends Reducer<Text, IntWritable, Text, IntWritable>
{
#Override
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException
{
int val = 0;
for (IntWritable value : values)
{
val = val + value.get();
}
context.write(key, new IntWritable(val));
}
}
MatAddApp.java (Main class)
package mapred;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MatAddApp extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
Configuration conf = new Configuration();
#SuppressWarnings("deprecation")
Job job = new Job(conf, "Matrix Addition");
job.setJarByClass(MatAddApp.class);
MultipleInputs.addInputPath(job,new Path(args[0]),TextInputFormat.class,MatAddMapper1.class);
MultipleInputs.addInputPath(job,new Path(args[1]),TextInputFormat.class,MatAddMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setReducerClass(MatAddReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception
{
int ecode = ToolRunner.run(new MatAddApp(), args);
System.exit(ecode);
}
}
I am using eclipse and created a jar file MatAddition.jar. M.txt and N.txt are input matrices. When I tried to run the program in my hadoop cluster, I got the following error
Exception in thread "main" java.lang.ClassNotFoundException: MatAddApp
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:278)
at org.apache.hadoop.util.RunJar.run(RunJar.java:214)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
The issue is caused by the class name. The driver class name should be fully qualified when it is set in the configuration, as follows:
job.setJarByClass(mapred.MatAddApp.class);
Input.txt
A,0|0,1.0
A,0|1,2.0
A,0|2,3.0
A,0|3,4.0
A,1|0,5.0
A,1|1,6.0
A,1|2,7.0
A,1|3,8.0
B,0|0,1.0
B,0|1,2.0
B,0|2,3.0
B,0|3,4.0
B,1|0,5.0
B,1|1,6.0
B,1|2,7.0
B,1|3,8.0
Here, the first column represents the name of the matrix, second column represents the index and the third represents the value.
MatrixAdd.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import java.util.ArrayList;
import java.util.Iterator;
import java.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MatrixAdd {
public static class MatMapper extends Mapper<Object, Text, Text, DoubleWritable>{
private Text index = new Text();
private final static DoubleWritable num = new DoubleWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
String record = value.toString();
String[] parts = record.split(",");
index.set(parts[1]);
num.set(Double.parseDouble(parts[2]));
context.write(index, num);
}
}
public static class MatReducer extends Reducer<Text,DoubleWritable,Text,DoubleWritable> {
private DoubleWritable result = new DoubleWritable();
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
double sumValue = 0;
for(DoubleWritable val: values) {
sumValue += val.get();
}
result.set(sumValue);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "max temp");
job.setJarByClass(MatrixAdd.class);
job.setMapperClass(MatMapper.class);
job.setCombinerClass(MatReducer.class);
job.setReducerClass(MatReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Output:
0|0 2.0
0|1 4.0
0|2 6.0
0|3 8.0
1|0 10.0
1|1 12.0
1|2 14.0
1|3 16.0

Hadoop jar command error for multiple mapper inputs and 1 reducer output (Join 2 values from 2 files)

Here is my sample program joining 2 datasets.
The program has 2 mappers and 1 reducer joining the values obtained from 2 different mappers having 2 different files as input.
I am getting an error in the hadoop jar command.
command:
hadoop jar /home/rahul/Downloads/testjars/datajoin.jar DataJoin
/user/rahul/cust.txt /user/rahul/delivery.txt /user/rahul/output
Error: Invalid number of arguments Datajoin
It is actually expecting only 1 input path and 1 output path whereas in my command I have 2 inputs for 2 different mappers and 1 output.
Can anyone help me out ?
Code:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DataJoin {
public static class TokenizerMapper1 extends Mapper {
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String itr[] = value.toString().split("::");
word.set(itr[0].trim());
context.write(word, new Text("CD~" + itr[1]));
}
}
public static class TokenizerMapper2 extends Mapper {
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String itr[] = value.toString().split("::");
word.set(itr[0].trim());
context.write(word, new Text("DD~" + itr[1]));
}
}
public static class IntSumReducer extends Reducer {
private Text result = new Text();
public void reduce(Text key, Iterable values, Context context)
throws IOException, InterruptedException {
String sum = "";
for (Text val : values) {
sum += val.toString();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: DataJoin ");
System.exit(2);
}
Job job = new Job(conf, "Data Join");
job.setJarByClass(DataJoin.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
TextInputFormat.class, TokenizerMapper1.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
TextInputFormat.class, TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
You have error in this portion
// Quoted from the question: this check rejects the three-argument invocation
// (two inputs + one output) that the job actually needs.
if (otherArgs.length != 2) {
System.err.println("Usage: DataJoin ");
System.exit(2);
}
Your argument array has length 3: 2 inputs and 1 output.
The length is a count (1, 2, 3, ...), not a zero-based index (0, 1, 2, ...).
Change to
// Two input paths plus one output path are required.
if (otherArgs.length != 3) {
    System.err.println("Usage: DataJoin <in1> <in2> <out>");
    // Non-zero status so that callers and scripts can detect the usage error.
    System.exit(2);
}
This solves your issue.

Parsing of Stackoverflow`s posts.xml on hadoop

I am following this article by Anoop Madhusudanan on CodeProject to build a recommendation engine, on my own machine rather than on a cluster.
The problem arises when I try to parse posts.xml, whose structure is as follows:
<row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="<blockquote>
<p>The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. </p>
</blockquote>
<p>I obtained this answer from <a href="http://www.informit.com/guides/content.aspx?g=cplusplus&amp;seqNum=272" rel="nofollow">High Resolution Time Measurement and Timers, Part I</a></p>" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />
Now I need to parse this file(size 1.4 gb) on hadoop for which i have written code in java and created its jar.
Java class is as follows:
import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
public class Recommend {
static class Map extends Mapper<Text, Text, Text, Text> {
Path path;
String fXmlFile;
DocumentBuilderFactory dbFactory;
DocumentBuilder dBuilder;
Document doc;
/**
* Given an output filename, write a bunch of random records to it.
*/
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
try{
fXmlFile=value.toString();
dbFactory = DocumentBuilderFactory.newInstance();
dBuilder= dbFactory.newDocumentBuilder();
doc= dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList nList = doc.getElementsByTagName("row");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
Element eElement = (Element) nNode;
Text keyWords =new Text(eElement.getAttribute("OwnerUserId"));
Text valueWords = new Text(eElement.getAttribute("ParentId"));
String val=keyWords.toString()+" "+valueWords.toString();
// Write the sentence
if(keyWords != null && valueWords != null){
output.collect(keyWords, new Text(val));
}
}
}catch (Exception e) {
e.printStackTrace();
}
}
}
/**
*
* #throws IOException
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (args.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}*/
// FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Recommend");
job.setJarByClass(Recommend.class);
// the keys are words (strings)
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
// the values are counts (ints)
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
//conf.setReducerClass(Reduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
Path outPath = new Path(args[1]);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}
}
}
I expect the output to be as a file in hadoop containing output as OwnerUserId ParentId
but instead I get output as:
1599788 <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="<p>The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.</p>
<p>Simply writing your improved code and then giving it to them may result in your code being rejected.</p>" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />
I don't know where the 1599788 that appears as the key emitted by the mapper comes from.
I don't know much about writing mapper classes for Hadoop, and I need help modifying my code to get the desired output.
Thanks in advance.
After a lot of research and experiments, I finally learnt how to write a mapper for parsing XML files with a syntax like the one I provided. I changed my approach, and this is my new mapper code. It works for my use case.
I hope it helps someone and saves them some time :)
import java.io.IOException;
import java.util.StringTokenizer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.xml.sax.SAXException;
public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
NullWritable obj;
#Override
public void map(LongWritable key, Text value, Context context) throws InterruptedException {
StringTokenizer tok= new StringTokenizer(value.toString());
String pa=null,ow=null,pi=null,v;
while (tok.hasMoreTokens()) {
String[] arr;
String val = (String) tok.nextToken();
if(val.contains("PostTypeId")){
arr= val.split("[\"]");
pi=arr[arr.length-1];
if(pi.equals("2")){
continue;
}
else break;
}
if(val.contains("ParentId")){
arr= val.split("[\"]");
pa=arr[arr.length-1];
}
else if(val.contains("OwnerUserId") ){
arr= val.split("[\"]");
ow=arr[arr.length-1];
try {
if(pa!=null && ow != null){
v=String.format("{0},{1}", ow,pa);
context.write(obj,new Text(v));
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
}
Here is the mapper I've written to parse the Stack Overflow posts XML and create tab-separated files on Hadoop, to be used by other MapReduce jobs, Hive, or Pig.
Mapper
package com.aravind.learning.hadoop.mapred.techtalks;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
public class StackoverflowDataWranglerMapper extends Mapper<LongWritable, Text, Text, Text>
{
static enum BadRecordCounters
{
NO_CREATION_DATE, UNKNOWN_USER_ID, UNPARSEABLE_RECORD, UNTAGGED_POSTS
}
private final Text outputKey = new Text();
private final Text outputValue = new Text();
private final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder;
private static final Joiner TAG_JOINER = Joiner.on(",").skipNulls();
// 2008-07-31T21:42:52.667
private static final DateFormat DATE_PARSER = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
private static final SimpleDateFormat DATE_BUILDER = new SimpleDateFormat("yyyy-MM-dd");
#Override
protected void setup(Context context) throws IOException, InterruptedException
{
try
{
builder = factory.newDocumentBuilder();
}
catch (ParserConfigurationException e)
{
new IOException(e);
}
}
#Override
protected void map(LongWritable inputKey, Text inputValue, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException
{
try
{
String entry = inputValue.toString();
if (entry.contains("<row "))
{
Document doc = builder.parse(new InputSource(new StringReader(entry)));
Element rootElem = doc.getDocumentElement();
String id = rootElem.getAttribute("Id");
String postedBy = rootElem.getAttribute("OwnerUserId").trim();
String viewCount = rootElem.getAttribute("ViewCount");
String postTypeId = rootElem.getAttribute("PostTypeId");
String score = rootElem.getAttribute("Score");
String title = rootElem.getAttribute("Title");
String tags = rootElem.getAttribute("Tags");
String answerCount = rootElem.getAttribute("AnswerCount");
String commentCount = rootElem.getAttribute("CommentCount");
String favoriteCount = rootElem.getAttribute("FavoriteCount");
String creationDate = rootElem.getAttribute("CreationDate");
Date parsedDate = null;
if (creationDate != null && creationDate.trim().length() > 0)
{
try
{
parsedDate = DATE_PARSER.parse(creationDate);
}
catch (ParseException e)
{
context.getCounter("Bad Record Counters", "Posts missing CreationDate").increment(1);
}
}
if (postedBy.length() == 0 || postedBy.trim().equals("-1"))
{
context.getCounter("Bad Record Counters", "Posts with either empty UserId or UserId contains '-1'")
.increment(1);
try
{
parsedDate = DATE_BUILDER.parse("2100-00-01");
}
catch (ParseException e)
{
// ignore
}
}
tags = tags.trim();
String tagTokens[] = null;
if (tags.length() > 1)
{
tagTokens = tags.substring(1, tags.length() - 1).split("><");
}
else
{
context.getCounter("Bad Record Counters", "Untagged Posts").increment(1);
}
outputKey.clear();
outputKey.set(id);
StringBuilder sb = new StringBuilder(postedBy).append("\t").append(parsedDate.getTime()).append("\t")
.append(postTypeId).append("\t").append(title).append("\t").append(viewCount).append("\t").append(score)
.append("\t");
if (tagTokens != null)
{
sb.append(TAG_JOINER.join(tagTokens)).append("\t");
}
else
{
sb.append("").append("\t");
}
sb.append(answerCount).append("\t").append(commentCount).append("\t").append(favoriteCount).toString();
outputValue.set(sb.toString());
context.write(outputKey, outputValue);
}
}
catch (SAXException e)
{
context.getCounter("Bad Record Counters", "Unparsable records").increment(1);
}
finally
{
builder.reset();
}
}
}
Driver
public class StackoverflowDataWranglerDriver extends Configured implements Tool
{
#Override
public int run(String[] args) throws Exception
{
if (args.length != 2)
{
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf());
job.setJobName("Tech Talks - Stackoverflow Forum Posts - Data Wrangler");
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setJarByClass(StackoverflowDataWranglerMapper.class);// required for mr1
job.setMapperClass(StackoverflowDataWranglerMapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String args[]) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new StackoverflowDataWranglerDriver(), args);
System.exit(exitCode);
}
}
Job submit command
hadoop jar ./hadoop-examples-0.0.1-SNAPSHOT.jar com.aravind.learning.hadoop.mapred.techtalks.StackoverflowDataWranglerDriver data/stackoverflow-posts.xml data/so-posts-tsv

Categories

Resources