Hadoop map-reduce mapper programming - java

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class ADDMapper extends MapReduceBase implements Mapper<LongWritable,
Text,Text,LongWritable>
{ #Override
public void map(LongWritable key, Text value,OutputCollector<Text, LongWritable> output, Reporter r)throws IOException
{
String s=value.toString();
char[] words=s.toCharArray();
int wno=0;
int ino=0;
for(int i=0;i<words.length;i++)
{
String temp="";
for(int j=ino;j<words.length;j++)
{
if(words[j]!=' ')
{ temp+=words[j];
}
else
{
wno=j;
if(temp!="")
{
ino=ino + key; //////POINT OF ERROR
output.collect(new Text(temp),new LongWritable(ino));
}
temp="";
ino=wno+1;
break;
}
}
}
}
}
I want to get the index value of every string, sorted by string.
The above code is neither giving the index value nor shuffling the strings.
Suppose the input file is:
hi how are you
hi i am right.
how is your job.
hi are you ok.
output:
am 50
are 7,33
hi 0,30,44
how 3,14
.
.

Please run the code below; it runs fine and gives your expected output.
Provide the input and output paths as command-line arguments (args[0] and args[1]).
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
public class IndexCount {
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String str=value.toString();
String[] tokens = str.split(" "); //split into words
//create hashmap for unique word
HashMap<String,Integer> uniqueString = new HashMap<String,Integer>();
for(int i=0;i<tokens.length;i++){
uniqueString.put(tokens[i],1);
}
//for sorting create TreeMap from above hash map
TreeMap<String, Integer> map = new TreeMap<String,Integer>(uniqueString);
for (Entry<String, Integer> entry : map.entrySet()) {
int index=0;
//find the index of the word
index = str.indexOf((String)entry.getKey());
while (index >= 0) {
output.collect(new Text((String)entry.getKey()),new IntWritable(index));
index = str.indexOf((String)entry.getKey(), index + 1);
}
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
while (values.hasNext()) {
output.collect(key, new IntWritable(values.next().get()));
}
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("indexfinder");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}

Hi Shivendra, I wrote the mapper logic below, which will help you find the index of each string with sorted output.
The output of this code is the sorted strings with their indexes; you can then run a reducer on this output.
// Body of a mapper's map() method: emits (word, index) for every occurrence of
// every distinct word of the line, visiting the words in sorted order.
String str=value.toString();
String[] tokens = str.split(" "); //split into words
// NOTE(review): consecutive spaces make split(" ") return "" tokens, and
// indexOf("") never returns -1, so the while loop below would not terminate
// for such input - confirm the input never contains double spaces.
//create hashmap for unique word
Map<String,Integer> uniqueString = new HashMap<String,Integer>();
for(int i=0;i<tokens.length;i++){
uniqueString.put(tokens[i],1);
}
//for sorting create TreeMap from above hash map
Map<String,Integer> map = new TreeMap<String,Integer>(uniqueString);
// Raw Map.Entry is used here, hence the (String) casts on getKey() below.
for (Map.Entry entry : map.entrySet()) {
int index=0;
//find the index of the word
index = str.indexOf((String)entry.getKey());
// Walk through every occurrence; indexOf returns -1 once there are no more.
// NOTE(review): indexOf matches substrings as well, so a short word is also
// reported where it occurs inside a longer word (e.g. "i" inside "hi").
while (index >= 0) {
output.collect(new Text((String)entry.getKey()),new LongWritable(index));
index = str.indexOf((String)entry.getKey(), index + 1);
}
}
output of this logic:
am:20,
are:7,
are:50,
hi:0,
hi:15,
hi:47,
how:3,
how:30,
i:1,
i:16,
i:18,
i:24,
i:34,
i:48,
is:34,
job.:42,
ok.:58,
right.:23,
you:11,
you:37,
you:54,
your:37
It might help you.

Please run the code below; it gives the expected output.
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class Index {
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String str=value.toString();
String[] tokens = str.split(" "); //split into words
//create hashmap for unique word
HashMap<String,Integer> uniqueString = new HashMap<String,Integer>();
for(int i=0;i<tokens.length;i++){
uniqueString.put(tokens[i],1);
}
//for sorting create TreeMap from above hash map
TreeMap<String, Integer> map = new TreeMap<String,Integer>(uniqueString);
Configuration conf=context.getConfiguration();
int strIndex = 0;
for (Entry<String, Integer> entry : map.entrySet()) {
//int index=0;
strIndex=conf.getInt("index", 0);
//find the index of the word
int index = str.indexOf((String)entry.getKey());
while (index >= 0) {
index+=strIndex;
context.write(new Text((String)entry.getKey()),new IntWritable(index));
index = str.indexOf((String)entry.getKey(), index + 1);
}
}
conf.setInt("index", strIndex+str.length());
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
for (IntWritable val : values) {
context.write(key, new IntWritable(val.get()));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.setInt("index", 0);
Job job = new Job(conf, "index");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("input"));
FileOutputFormat.setOutputPath(job, new Path("output"));
job.waitForCompletion(true);
}
}

Related

Map reduce example beside word count

I followed the example here step by step: https://www.tutorialspoint.com/hadoop/hadoop_mapreduce.htm
I want to find max of each year in file like the following:
1320 23
1221 60
1320 33
1221 66
And the result that I expected is:
1320 33
1221 66
And I did like the following in java:
import java.util.*;
import java.io.IOException;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class ProcessUnits {
//Mapper class
public static class E_EMapper extends MapReduceBase implements
Mapper<LongWritable ,/*Input key Type */
Text, /*Input value Type*/
Text, /*Output key Type*/
IntWritable> /*Output value Type*/
{
//Map function
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException {
String line = value.toString();
String lasttoken = null;
StringTokenizer s = new StringTokenizer(line," ");
String year = s.nextToken();
while(s.hasMoreTokens()) {
lasttoken = s.nextToken();
}
int avgprice = Integer.parseInt(lasttoken);
output.collect(new Text(year), new IntWritable(avgprice));
}
}
//Reducer class
public static class E_EReduce extends MapReduceBase implements Reducer< Text, IntWritable, Text, IntWritable > {
//Reduce function
public void reduce( Text key, Iterator <IntWritable> values,
OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int maxavg = 0 ;
int val = Integer.MIN_VALUE;
while (values.hasNext()) {
val = values.next().get();
if(val > maxavg) {
maxavg = val ;
}
}
output.collect(key, new IntWritable(maxavg));
}
}
//Main function
public static void main(String args[])throws Exception {
JobConf conf = new JobConf(ProcessUnits.class);
conf.setJobName("max_eletricityunits");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(E_EMapper.class);
conf.setCombinerClass(E_EReduce.class);
conf.setReducerClass(E_EReduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
The error I got when I execute this program is the following:
Error: java.util.NoSuchElementException
at java.util.StringTokenizer.nextToken(StringTokenizer.java:349)
at ProcessUnits$E_EMapper.map(ProcessUnits.java:28)
at ProcessUnits$E_EMapper.map(ProcessUnits.java:14)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:465)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:349)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:178)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:172)
I think the problem is that my program does not map the file line by line; it seems to map the entire file at once:
// Excerpt of E_EMapper.map(): takes the first token of the line as the year and
// the last remaining token as the value, then emits (year, value).
String line = value.toString();
String lasttoken = null;
StringTokenizer s = new StringTokenizer(line," ");
// NOTE(review): nextToken() throws NoSuchElementException when the tokenizer has
// no tokens, i.e. when the line is empty or contains only spaces - presumably
// this is the call in the stack trace; confirm against line 28 of the file.
String year = s.nextToken();
// Keep overwriting lasttoken so it ends up holding the final token of the line.
while(s.hasMoreTokens()) {
lasttoken = s.nextToken();
}
// lasttoken is still null here if the line held only the year.
int avgprice = Integer.parseInt(lasttoken);
output.collect(new Text(year), new IntWritable(avgprice));
Any idea to solve this problem from you guys?
Try reading each line of the file once and splitting it into values. In the mapper, emit each year together with its corresponding prices. Then, in the reduce function, compare each price with the largest value seen so far and keep it if it is greater.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
public class E_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable ikey, Text ivalue, Context context)
throws IOException, InterruptedException {
String line= ivalue.toString();
String [] values = line.splitBy(" ");
for(String price:values)
{
context.write(new Text(year),price);
}}}
public class E_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int avg=0;
for (IntWritable val : values) {
if(val.get()>avg){
context.write(key,new IntWritable(sum));
}}}

Map Reduce program creates an empty directory on execution

I am running the below Map reduce code to calculate sum and average length of words starting with each english alphabet.
For example : If the doc only contains the word 'and' 5 times
letter | total words | average length
a 5 3
The mapreduce program is as below:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LetterWiseAvgLengthV1
{
public static class TokenizerMapper
extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException
{
String st [] = value.toString().split("\\s+");
for(String word : st) {
String wordnew=word.replaceAll("[^a-zA-Z]","");
String firstLetter = wordnew.substring(0, 1);
if(!wordnew.isEmpty()){
// write ('a',3) if the word is and
context.write(new Text(firstLetter), new Text(String.valueOf(wordnew.length())));
}
else continue;
}
}
}
public static class IntSumReducer
extends Reducer<Text,Text,Text,Text>
{
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException
{
int sum=0,count=0;
for (Text val : values)
{
sum += Integer.parseInt(val.toString());
count+= 1;
}
float avg=(sum/(float)count);
String op="Average length of " + count + " words = " + avg;
context.write(new Text(key), new Text(op));
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "wordLenAvgCombiner");
job.setJarByClass(LetterWiseAvgLengthV1.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
When I execute the program above on a text document, it creates an empty output directory in HDFS. There are no failures during execution, but the output folder is always empty.

Output file contains Mapper Output instead of Reducer output

Hi, I am trying to find the average of a few numbers using MapReduce in standalone mode. I have two input files. They contain the values file1: 25 25 25 25 25 and file2: 15 15 15 15 15.
My program runs without errors, but the output file contains the output of the mapper instead of the reducer output.
Here is my code :
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.Writable;
import java.io.*;
public class Average {
public static class SumCount implements Writable {
public int sum;
public int count;
#Override
public void write(DataOutput out) throws IOException {
out.writeInt(sum);
out.writeInt(count);
}
#Override
public void readFields(DataInput in) throws IOException {
sum = in.readInt();
count =in.readInt();
}
}
public static class TokenizerMapper extends Mapper<Object, Text, Text, Object>{
private final static IntWritable valueofkey = new IntWritable();
private Text word = new Text();
SumCount sc=new SumCount();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
int sum=0;
int count=0;
int v;
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
v=Integer.parseInt(word.toString());
count=count+1;
sum=sum+v;
}
word.set("average");
sc.sum=sum;
sc.count=count;
context.write(word,sc);
}
}
public static class IntSumReducer extends Reducer<Text,Object,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<SumCount> values,Context context) throws IOException, InterruptedException {
int sum = 0;
int count=0;
int wholesum=0;
int wholecount=0;
for (SumCount val : values) {
wholesum=wholesum+val.sum;
wholecount=wholecount+val.count;
}
int res=wholesum/wholecount;
result.set(res);
context.write(key, result );
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "");
job.setJarByClass(Average.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(SumCount.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
after i run the program my output file is like this:
average Average$SumCount#434ba039
average Average$SumCount#434ba039
You can't use your Reducer class IntSumReducer as a combiner. A combiner must receive and emit the same Key/Value types.
So i would remove job.setCombinerClass(IntSumReducer.class);.
Remember that the output of the combiner is the input to the reducer, so writing out Text and IntWritable isn't going to work.
If your output files looked like part-m-xxxxx, then the above issue could mean it only ran the map phase and stopped. Your counters would confirm this.
You also have Reducer<Text,Object,Text,IntWritable> which should be Reducer<Text,SumCount,Text,IntWritable>.

Java map reduce - count attributes in reduce

I am working on map reduce. I have two data sets. I have to combine these two based on an ID and count the number of occurrences of the ID separately for each context. (For example, if it lists the data from a travel agency that operates in a few states, the output I need is of the format : User ID - count of number of visits in NY, count of number of visits in IL). That data set contains the field state: 'NY'. I have a predefined set of states(NY, IL).
While reducing it, I am always getting the count as zero though there is data.
My output is UID 0 0 for all IDs.
Below is my code:
`import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
public class myMap {
/* Map*/
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokens = new StringTokenizer(line, ",");
Boolean eventFlag = false;
String UID = "", state = "";
while (tokens.hasMoreTokens()) {
String currToken = tokens.nextToken();
String[] keyValue = currToken.split(":");
if (keyValue[0].equals( "state")) {
state = keyValue[1].trim();
}
if (keyValue[0].equalsIgnoreCase( "user")) {
UID = keyValue[1];
}
}
output.collect(new Text(UID), new Text(state));
}
}
/* Reducer*/
public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text,Text> {
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
int nyCnt = 0;
int ilCnt = 0;
String currValue = new String();
while (values.hasNext()) {
currValue = values.next().toString();
if (currValue.equalsIgnoreCase("NY")) {
nyCnt+=1;
}
if (currValue.equalsIgnoreCase("IL")) {
ilCnt+=1;
}
output.collect(key , new Text(currValue));
}
String counts = Integer.toString(nyCnt) + " " + Integer.toString(ilCnt);
output.collect(key, new Text(counts) );
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(myMap.class);
conf.setJobName("myMap");
conf.setJarByClass(myMap.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
`
Any help regarding what is wrong will be useful. Thank you.

FileAlreadyExistsException while running MapReduce code

This program is supposed to accomplish the MapReduce job. The output of the first job has to be taken as the input of the second job.
When I run it, I get two errors:
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException
The mapping part is running 100% but the reducer is not running.
Here's my code:
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.LongWritable;
public class MaxPubYear {
public static class FrequencyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
Text word = new Text();
String delim = ";";
Integer year = 0;
String tokens[] = value.toString().split(delim);
if (tokens.length >= 4) {
year = TryParseInt(tokens[3].replace("\"", "").trim());
if (year > 0) {
word = new Text(year.toString());
context.write(word, new IntWritable(1));
}
}
}
}
public static class FrequencyReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
public static class MaxPubYearMapper extends
Mapper<LongWritable, Text, IntWritable, Text> {
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String delim = "\t";
Text valtosend = new Text();
String tokens[] = value.toString().split(delim);
if (tokens.length == 2) {
valtosend.set(tokens[0] + ";" + tokens[1]);
context.write(new IntWritable(1), valtosend);
}
}
}
public static class MaxPubYearReducer extends
Reducer<IntWritable, Text, Text, IntWritable> {
public void reduce(IntWritable key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
int maxiValue = Integer.MIN_VALUE;
String maxiYear = "";
for (Text value : values) {
String token[] = value.toString().split(";");
if (token.length == 2
&& TryParseInt(token[1]).intValue() > maxiValue) {
maxiValue = TryParseInt(token[1]);
maxiYear = token[0];
}
}
context.write(new Text(maxiYear), new IntWritable(maxiValue));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "Frequency");
job.setJarByClass(MaxPubYear.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(FrequencyMapper.class);
job.setCombinerClass(FrequencyReducer.class);
job.setReducerClass(FrequencyReducer.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1] + "_temp"));
int exitCode = job.waitForCompletion(true) ? 0 : 1;
if (exitCode == 0) {
Job SecondJob = new Job(conf, "Maximum Publication year");
SecondJob.setJarByClass(MaxPubYear.class);
SecondJob.setOutputKeyClass(Text.class);
SecondJob.setOutputValueClass(IntWritable.class);
SecondJob.setMapOutputKeyClass(IntWritable.class);
SecondJob.setMapOutputValueClass(Text.class);
SecondJob.setMapperClass(MaxPubYearMapper.class);
SecondJob.setReducerClass(MaxPubYearReducer.class);
FileInputFormat.addInputPath(SecondJob, new Path(args[1] + "_temp"));
FileOutputFormat.setOutputPath(SecondJob, new Path(args[1]));
System.exit(SecondJob.waitForCompletion(true) ? 0 : 1);
}
}
public static Integer TryParseInt(String trim) {
// TODO Auto-generated method stub
return(0);
}
}
Exception in thread "main"
org.apache.hadoop.mapred.FileAlreadyExistsException
A MapReduce job does not overwrite the contents of an existing directory. The output path of an MR job must be a directory path that does not exist yet. The MR job will create a directory at the specified path and write its files into it.
In your code:
FileOutputFormat.setOutputPath(job, new Path(args[1] + "_temp"));
Make sure this path does not exist when you run MR job.

Categories

Resources