I am trying to build a project that finds the maximum of the average temperature for each month. Here is my code:
File Map.java
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Map extends Mapper<LongWritable, Text, Text, FloatWritable> {
private FloatWritable average = new FloatWritable();
private float maxFloat, minFloat, averageFloat;
private Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
StringTokenizer line = new StringTokenizer(value.toString(), ",");
if (line.countTokens() > 0) {
word.set(line.nextToken().substring(2,8));
if (line.hasMoreTokens()) {
maxFloat = Float.parseFloat(line.nextToken());
}
if (line.hasMoreTokens()) {
minFloat = Float.parseFloat(line.nextToken());
}
averageFloat = (minFloat + maxFloat) / 2;
average.set(averageFloat);
context.write(word, average);
}
}
}
File Reduce.java
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class Reduce extends Reducer<Text, FloatWritable, Text, FloatWritable> {
@Override
protected void reduce(Text key, Iterable<FloatWritable> values, Context context)
throws IOException, InterruptedException {
// Keep the running maximum local so it is reset for every key; note that
// Float.MIN_VALUE is the smallest positive float, so NEGATIVE_INFINITY is
// the correct initial value for a maximum.
float max_temp = Float.NEGATIVE_INFINITY;
float temp;
Iterator<FloatWritable> itr = values.iterator();
while (itr.hasNext()) {
temp = itr.next().get();
if (temp > max_temp) {
max_temp = temp;
}
}
context.write(key, new FloatWritable(max_temp));
}
}
File MaxTempDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTempDriver {
public static void main(String[] args) throws Exception {
// Create a new job
Job job = Job.getInstance();
// Set job name to locate it in the distributed environment
job.setJarByClass(MaxTempDriver.class);
job.setJobName("Max Temperature");
// Set input and output Path, note that we use the default input format
// which is TextInputFormat (each record is a line of input)
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Set Mapper and Reducer class
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
// Set Output key and value
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Now I don't know how to compile these 3 files. I have read some tutorials on the internet, but they all seem to use a single file containing both the map and reduce classes. How do I compile these files?
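For reference, since all three classes are in the default package, they compile together exactly like a single file would; a minimal sketch (assuming a Hadoop 2.x installation with the hadoop command on the PATH; the jar name maxtemp.jar and the input/output paths are placeholders):
mkdir -p classes
javac -classpath "$(hadoop classpath)" -d classes/ Map.java Reduce.java MaxTempDriver.java
jar -cvf maxtemp.jar -C classes/ .
hadoop jar maxtemp.jar MaxTempDriver <input path> <output path>
javac compiles every source file you list into its own .class file, and the jar tool bundles them all (Map.class, Reduce.class, MaxTempDriver.class) into one jar that setJarByClass can locate, so having 3 files instead of 1 changes nothing about the process.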
I am running the MapReduce code below to calculate the count and average length of words starting with each English letter.
For example, if the doc only contains the word 'and' 5 times:
letter | total words | average length
a      | 5           | 3
The mapreduce program is as below:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LetterWiseAvgLengthV1
{
public static class TokenizerMapper
extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException
{
String st [] = value.toString().split("\\s+");
for(String word : st) {
String wordnew = word.replaceAll("[^a-zA-Z]","");
// Check for emptiness before calling substring; tokens that contain
// no letters become "" and substring(0, 1) would throw
// StringIndexOutOfBoundsException, killing the task.
if (!wordnew.isEmpty()) {
String firstLetter = wordnew.substring(0, 1);
// write ('a', 3) if the word is "and"
context.write(new Text(firstLetter), new Text(String.valueOf(wordnew.length())));
}
}
}
}
public static class IntSumReducer
extends Reducer<Text,Text,Text,Text>
{
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException
{
int sum=0,count=0;
for (Text val : values)
{
sum += Integer.parseInt(val.toString());
count+= 1;
}
float avg=(sum/(float)count);
String op="Average length of " + count + " words = " + avg;
context.write(new Text(key), new Text(op));
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "wordLenAvgCombiner");
job.setJarByClass(LetterWiseAvgLengthV1.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
// Both the mapper and reducer emit Text keys and Text values; without
// these settings the job assumes LongWritable keys and fails with a
// type mismatch at runtime.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
When I execute the program above on a text document, it creates an empty output directory in HDFS. There are no failures during execution, but the output folder is always empty.
I am studying HDFS, so I am testing simple MapReduce code that computes the average of the overall field in a JSON file.
A sample of the JSON format is linked here (click me).
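The linked sample is not reproduced here; based on the fields the mapper below reads, a hypothetical record might look like this (the asin and overall field names come from the code, everything else is an assumption):
{"asin": "B000123456", "overall": "4.0", "reviewText": "works great"}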
GetAverage.java
package some.package.path.here.bigdata;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.json.*;
public class GetAverage extends Configured implements Tool {
public static void main(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
int res = ToolRunner.run(new Configuration(), new GetAverage(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
System.out.println(Arrays.toString(args));
Job job = Job.getInstance(getConf());
job.setJarByClass(GetAverage.class);
job.setOutputKeyClass(Text.class);
//Overall value type is float.
job.setOutputValueClass(FloatWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
return 0;
}
public static class Map extends Mapper<LongWritable, Text, Text, FloatWritable> {
private Text asin_value = new Text();
private FloatWritable overall_value = new FloatWritable();
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String asin;
float overall;
String line = value.toString();
String[] tuple = line.split("\\n");
try{
for(int i=0; i<tuple.length; i++){
JSONObject obj = new JSONObject(tuple[i]);
System.out.println(obj);
asin = obj.getString("asin");
overall = Float.parseFloat(obj.getString("overall"));
asin_value.set(asin);
overall_value.set(overall);
context.write(asin_value, overall_value);
}
}catch(JSONException e){
e.printStackTrace();
}
}
}
public static class Reduce extends Reducer<Text, FloatWritable, Text, FloatWritable> {
@Override
public void reduce(Text key, Iterable<FloatWritable> values, Context context)
throws IOException, InterruptedException {
float sum = 0;
int length = 0;
for (FloatWritable val : values) {
sum += val.get();
length++;
}
float average = sum / length;
context.write(key, new FloatWritable(average));
}
}
}
When I compile this, the compiler does not print any message, but no jar file is generated.
I compile the java file with the command below.
cat ./compile_command
javac -classpath /usr/local/hadoop/share/hadoop/common/hadoop-common-2.8.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.8.0.jar:../org.json.jar -d getaverage_classes/ GetAverage.java
And my directory structure is...
~/test
|- GetAverage.java
|- compile_command
|- getaverage_classes
   |- some/package/path/here/bigdata
      |- GetAverage$Map.class
      |- GetAverage$Reduce.class
      |- GetAverage.class
If anyone knows the cause of this problem, please answer my question. :)
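For what it's worth, javac alone never produces a jar: it only emits the .class files that the directory listing above shows were generated successfully. Packaging and running are separate steps, sketched below (the jar name getaverage.jar and the input/output paths are assumptions; -libjars works here because GetAverage runs through ToolRunner):
jar -cvf getaverage.jar -C getaverage_classes/ .
hadoop jar getaverage.jar some.package.path.here.bigdata.GetAverage -libjars ../org.json.jar <input path> <output path>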
I'd like to replace values of my input data in my mapper, using dictionaries (CSV) defined in another file. So I tried to load the CSV data into a HashMap and refer to it in the mapper.
The Java code and CSV below are a simplified version of my program. This code works in my local environment (Mac OS X, pseudo-distributed mode), but does not in my EC2 instance (Ubuntu, pseudo-distributed mode).
In detail, I got this stdout during the process:
cat:4
human:2
flamingo:1
This means the file reader successfully put the CSV data into the HashMap.
However, the mapper mapped nothing, so I got empty output in the EC2 environment, although locally it mapped 3 * (the number of lines of the input file) elements and generated the following:
test,cat
test,flamingo
test,human
Does anyone have answers or hints?
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void get_list(String list_path){
try {
FileReader fr = new FileReader(list_path);
BufferedReader br = new BufferedReader(fr);
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
get_list(args[2]);
Job job = Job.getInstance(conf, "test");
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
list.csv (args[2])
name,legs
cat,4
human,2
flamingo,1
=================================
Following @Rahul Sharma's answer, I modified my code as below, and now it works in both environments.
Thank you very much @Rahul Sharma and @Serhiy for your precise answer and useful comments.
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.net.URI;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] files = context.getCacheFiles();
Path list_path = new Path(files[0]);
try {
FileSystem fs = list_path.getFileSystem(context.getConfiguration());
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(list_path)));
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
// Writer
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
Job job = Job.getInstance(conf, "test");
job.addCacheFile(new Path(args[2]).toUri());
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
First, you need to learn more about the MapReduce framework.
Your program behaves as expected in local mode because the Mapper, the Reducer, and the Job are launched in the same JVM. In pseudo-distributed or fully distributed mode, a separate JVM is allocated for each component, so the values you put into the HashMap in get_list are not visible to the mapper and reducer, which live in different JVMs.
Use the distributed cache to make it work in cluster mode.
Job Main class add file to distributed cache:
JobConf job = new JobConf();
DistributedCache.addCacheArchive(new URI(args[2]), job);
Access file in mapper or reducer:
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.getLocal(conf);
Path[] dataFile = DistributedCache.getLocalCacheFiles(conf);
BufferedReader cacheReader = new BufferedReader(new InputStreamReader(fs.open(dataFile[0])));
// Implement here get_list method functionality
}
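For comparison, the Hadoop 2.x mapreduce API used in the asker's revised code above exposes the same mechanism; a sketch taken from that code:
// Driver (new API): ship the list file with the job
job.addCacheFile(new Path(args[2]).toUri());
// Mapper setup() (new API): get the cached file back
URI[] files = context.getCacheFiles();
Path list_path = new Path(files[0]);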
I am writing a mapreduce program for matrix addition. Since it requires 2 input files, I am using MultipleInputs. I have the following classes:
MatAddMapper1.java
package mapred;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MatAddMapper1 extends Mapper<LongWritable, Text, Text, IntWritable>
{
//private static final int MISSING = 9999;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String line = value.toString();
String[] content = line.split (" ");
String key1 = content[0] + " " + content[1];
int val = Integer.parseInt(content[2]);
// Key is (i,j)
context.write(new Text(key1), new IntWritable(val));
}
}
MatAddMapper2.java is similar.
MatAddReducer.java
package mapred;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MatAddReducer
extends Reducer<Text, IntWritable, Text, IntWritable>
{
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException
{
int val = 0;
for (IntWritable value : values)
{
val = val + value.get();
}
context.write(key, new IntWritable(val));
}
}
MatAddApp.java (Main class)
package mapred;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MatAddApp extends Configured implements Tool
{
public int run(String[] args) throws Exception
{
Configuration conf = new Configuration();
@SuppressWarnings("deprecation")
Job job = new Job(conf, "Matrix Addition");
job.setJarByClass(MatAddApp.class);
MultipleInputs.addInputPath(job,new Path(args[0]),TextInputFormat.class,MatAddMapper1.class);
MultipleInputs.addInputPath(job,new Path(args[1]),TextInputFormat.class,MatAddMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setReducerClass(MatAddReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception
{
int ecode = ToolRunner.run(new MatAddApp(), args);
System.exit(ecode);
}
}
I am using Eclipse and created a jar file, MatAddition.jar. M.txt and N.txt are the input matrices. When I tried to run the program on my Hadoop cluster, I got the following error:
Exception in thread "main" java.lang.ClassNotFoundException: MatAddApp
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:278)
at org.apache.hadoop.util.RunJar.run(RunJar.java:214)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
The issue is because of the class name. The driver class name should be fully qualified when it is set in the configuration, as follows:
job.setJarByClass(mapred.MatAddApp.class);
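The same fully qualified name is also what must be passed on the command line when launching the jar, since RunJar (visible in the stack trace) resolves the main class by that name; a sketch, with the three argument paths assumed to match the driver's MultipleInputs setup:
hadoop jar MatAddition.jar mapred.MatAddApp <input M> <input N> <output dir>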
Input.txt
A,0|0,1.0
A,0|1,2.0
A,0|2,3.0
A,0|3,4.0
A,1|0,5.0
A,1|1,6.0
A,1|2,7.0
A,1|3,8.0
B,0|0,1.0
B,0|1,2.0
B,0|2,3.0
B,0|3,4.0
B,1|0,5.0
B,1|1,6.0
B,1|2,7.0
B,1|3,8.0
Here, the first column represents the name of the matrix, the second column represents the index, and the third represents the value.
MatrixAdd.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import java.util.ArrayList;
import java.util.Iterator;
import java.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MatrixAdd {
public static class MatMapper extends Mapper<Object, Text, Text, DoubleWritable>{
private Text index = new Text();
private final static DoubleWritable num = new DoubleWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
String record = value.toString();
String[] parts = record.split(",");
index.set(parts[1]);
num.set(Double.parseDouble(parts[2]));
context.write(index, num);
}
}
public static class MatReducer extends Reducer<Text,DoubleWritable,Text,DoubleWritable> {
private DoubleWritable result = new DoubleWritable();
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
double sumValue = 0;
for(DoubleWritable val: values) {
sumValue += val.get();
}
result.set(sumValue);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "max temp");
job.setJarByClass(MatrixAdd.class);
job.setMapperClass(MatMapper.class);
job.setCombinerClass(MatReducer.class);
job.setReducerClass(MatReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Output:
0|0 2.0
0|1 4.0
0|2 6.0
0|3 8.0
1|0 10.0
1|1 12.0
1|2 14.0
1|3 16.0
I am trying to learn MapReduce and am working through this task.
My input is as below (State, Sport, Amount in USD):
California Football 69.09
California Swimming 31.5
Illinois Golf 8.31
Illinois Tennis 15.75
Oklahoma Golf 15.44
Oklahoma Tennis 8.33
Texas Golf 16.71
Texas Swimming 71.59
Washington Football 50.32000000000001
I expect the output to display which sport is most popular in each state (based on the highest sales of sports items). For example:
California Football 69.09
Illinois Tennis 15.75
Oklahoma Golf 15.44
and so on
Below are my Mapper, Reducer and Driver codes:
Mapper code:
package org.assignment.sports;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Sports_Mapper2 extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{
String[] s= value.toString().split(" ");
String Sport_State = s[0];
String other = s[1]+" "+s[2];
context.write(new Text(Sport_State), new Text(other));
}
}
Reducer Code:
package org.assignment.sports;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Sports_Reducer2 extends Reducer<Text, Text, Text, DoubleWritable>{
private static double MAX=0.00;
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
{
//String[] k= values.toString().split(" ");
for (Text value:values){
String[] k= value.toString().split(" ");
DoubleWritable price = new DoubleWritable(Double.parseDouble(k[1]));
if(price.get()>MAX){
MAX = price.get();
}
else{
continue;
}
String ss = key.toString()+" "+ k[0];
context.write(new Text(ss), new DoubleWritable(MAX));
}
}
}
Driver Code:
package org.assignment.sports;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Sports_Driver2 {
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "Sports_Driver2");
String[] otherArgs =new GenericOptionsParser(conf, args).getRemainingArgs();
job.setJarByClass(Sports_Driver2.class);
job.setMapperClass(Sports_Mapper2.class);
job.setReducerClass(Sports_Reducer2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)? 0: 1);
}
}
I am getting the output as below:
California Football 69.09
Texas Swimming 71.59
Where am I going wrong? Any help is appreciated.
The problem is that the MAX value in the Reducer is not being reset after each particular state is written.
String ss = key.toString()+" "+ k[0];
context.write(new Text(ss), new DoubleWritable(MAX));
MAX = 0.00;
To take the max of the values in the Reducer for each key, you need to keep track of the sport's name as well; otherwise it will produce wrong results.
Please try the code below.
Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Sports_Driver2 {
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Sports_Driver2");
String[] otherArgs =new GenericOptionsParser(conf, args).getRemainingArgs();
job.setJarByClass(Sports_Driver2.class);
job.setMapperClass(Sports_Mapper2.class);
job.setReducerClass(Sports_Reducer2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
if(fs.exists(new Path(otherArgs[1]))){
fs.delete(new Path(otherArgs[1]), true);
}
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)? 0: 1);
}
}
Mapper
public class Sports_Mapper2 extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{
String[] s= value.toString().split(" ");
String Sport_State = s[0];
String other = s[1]+" "+s[2];
context.write(new Text(Sport_State), new Text(other));
}
}
Reducer
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class Sports_Reducer2 extends Reducer<Text, Text, Text, DoubleWritable>{
Text keyEmit = new Text();
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
{
Map<String,Double> getMax = new HashMap<>();
String sportName = "";
for (Text value:values){
String[] k= value.toString().split(" ");
sportName = k[0];
//store values
getMax.put(sportName, Double.parseDouble(k[1]));
}
/*
* Get maximum
*/
Map.Entry<String, Double> maxEntry = null;
for (Entry<String, Double> entry : getMax.entrySet())
{
if (maxEntry == null || entry.getValue().compareTo(maxEntry.getValue()) > 0)
{
maxEntry = entry;
}
}
keyEmit.set(key.toString()+" "+maxEntry.getKey());
context.write(keyEmit, new DoubleWritable(maxEntry.getValue()));
}
}
Output
California Football 69.09
Illinois Tennis 15.75
Oklahoma Golf 15.44
Texas Swimming 71.59
Washington Football 50.32000000000001
Hope this helps.