MapReduce string index out of bounds error on string concatenation - Java

I am trying to write a MapReduce job that takes a table stored in a text file. The table has two attributes: an id and a name. The code should take all the values with the same id and concatenate them. For example, the input 1 xyz, 2 xyz, 1 abc should result in 1 xyzabc and 2 xyz.
Following is my version of the code. As a beginner, I modified the MaxTemperature example to learn how to do this.
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MaxTemperature {

    public static class MaxTemperatureMapper
            extends Mapper<Text, Text, Text, Text> {

        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String lastWord = line.substring(line.lastIndexOf(" ") + 1);
            Text valq = new Text();
            valq.set(line.substring(0, 4));
            context.write(new Text(lastWord), valq);
        }
    }

    public static class MaxTemperatureReducer
            extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String p = "";
            for (Text value : values) {
                p = p + value.toString();
            }
            Text aa = new Text();
            aa.set(p);
            context.write(key, new Text(aa));
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperature <input path> <output path>");
            System.exit(-1);
        }
        Job job = new Job();
        job.setJarByClass(MaxTemperature.class);
        job.setJobName("Max temperature");
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
My input file
123456 name
123456 name
123456 age
123456 age
123456 relation
132323 age
123565 name
258963 test
258963 age
254789 age
254259 age
652145 name
985745 name
523698 name
214569 ame
123546 name
123456 age
321456 age
123456 age
124589 hyderabad
Expected Output
123456 name,name,age (all values with index 123456)
124589 hyderabad (all values with index 124589)
I got the following error
java.lang.StringIndexOutOfBoundsException: String index out of range: 4
at java.lang.String.substring(String.java:1907)
at MaxTemperature$MaxTemperatureMapper.map(MaxTemperature.java:39)
at MaxTemperature$MaxTemperatureMapper.map(MaxTemperature.java:26)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:140)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:672)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:330)
at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
at org.apache.hadoop.mapred.Child.main(Child.java:262)

3 things:
You haven't described the expected input very well, especially in the context of your code.
You haven't described what you're trying to do with your map/reduce methods, even if I can understand what you're trying to do.
You should check out the Javadoc for String.substring(int, int): http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#substring(int, int)
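To make the Javadoc pointer concrete: String.substring(0, 4) throws exactly this exception whenever the string is shorter than four characters. With KeyValueTextInputFormat the key/value split happens on a tab by default, so a space-separated line arrives with the whole line as the key and an empty value, and substring(0, 4) on that empty value fails. A minimal sketch of a guarded map method, keeping the intent of the original mapper, could look like this:
    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Guard: skip records whose value is too short for substring(0, 4);
        // with space-separated input and the default tab separator, the value may be empty.
        if (line.length() < 4) {
            return;
        }
        String lastWord = line.substring(line.lastIndexOf(" ") + 1);
        Text valq = new Text();
        valq.set(line.substring(0, 4));
        context.write(new Text(lastWord), valq);
    }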

Related

Hadoop reducer ArrayIndexOutOfBoundsException when passing values from mapper

I'm trying to output two values from the mapper to the reducer by passing them as a string value, but when I parse the string in the reducer I get an out of bounds error. However, I built the string in the mapper, so I'm sure it has two values. What am I doing wrong? How can I pass two values from the mapper to the reducer? (Eventually I need to pass more variables to the reducer, but this keeps the problem a bit simpler.)
This is the error:
Error: java.lang.ArrayIndexOutOfBoundsException: 1
at TotalTime$TimeReducer.reduce(TotalTime.java:57)
at TotalTime$TimeReducer.reduce(TotalTime.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:628)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:390)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:174)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:168)
and this is my code
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TotalTime {
public static class TimeMapper extends Mapper<Object, Text, Text, Text> {
Text textKey = new Text();
Text textValue = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String data = value.toString();
String[] field = data.split(",");
if (null != field && field.length == 4) {
String strTimeIn[] = field[1].split(":");
String strTimeOout[] = field[2].split(":");
int timeOn = Integer.parseInt(strTimeIn[0]) * 3600 + Integer.parseInt(strTimeIn[1]) * 60 + Integer.parseInt(strTimeIn[2]);
int timeOff = Integer.parseInt(strTimeOout[0]) * 3600 + Integer.parseInt(strTimeOout[1]) * 60 + Integer.parseInt(strTimeOout[2]);
String v = String.valueOf(timeOn) + "," + String.valueOf(timeOff);
textKey.set(field[0]);
textValue.set(v);
context.write(textKey, textValue);
}
}
}
public static class TimeReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
Text textValue = new Text();
int sumTime = 0;
for (Text val : values) {
String line = val.toString();
// Split the string by commas
String[] field = line.split(",");
int timeOn = Integer.parseInt(field[0]);
int timeOff = Integer.parseInt(field[1]);
int time = timeOff - timeOn;
sumTime += time;
}
String v = String.valueOf(sumTime);
textValue.set(v);
context.write(key, textValue);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "User Score");
job.setJarByClass(TotalTime.class);
job.setMapperClass(TimeMapper.class);
job.setCombinerClass(TimeReducer.class);
job.setReducerClass(TimeReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The input file looks like this:
ID2347,15:40:51,16:21:44,20
ID4568,14:27:57,14:58:04,72
ID8755,13:40:49,13:42:31,99
ID3258,13:12:48,13:37:11,73
ID9666,13:44:34,15:53:36,114
ID8755,09:43:59,10:47:52,123
ID3258,10:25:22,10:41:12,14
ID9666,09:40:10,11:44:01,15
It seems it is the combiner that causes your code to fail. Remember that a combiner is a piece of code that runs before the reducer. Now imagine this scenario:
Your mapper processes this line:
ID2347,15:40:51,16:21:44,20
and writes the following output to the context:
[ID2347, (56451,58904)]
Now the combiner comes into play, processes the output of your mapper before the reducer, and produces this:
[ID2347, 2453]
That line then goes to the reducer, and it fails because your code assumes the value looks like val1,val2.
If you want your code to work, just remove the combiner [or change your logic].
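For reference, the driver change described above amounts to no more than dropping the setCombinerClass line; a sketch of the relevant part of main (everything else unchanged) would be:
    Job job = Job.getInstance(conf, "User Score");
    job.setJarByClass(TotalTime.class);
    job.setMapperClass(TimeMapper.class);
    // job.setCombinerClass(TimeReducer.class);  // removed: the reducer emits a single sum,
    //                                            // not the "timeOn,timeOff" pair it expects as input
    job.setReducerClass(TimeReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);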

MapReduce Mapper explanation

There is an NCDC weather data set example in Hadoop: The Definitive Guide.
The Mapper class code is as follows
Example 2-3. Mapper for maximum temperature example
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MaxTemperatureMapper
extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING = 9999;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String year = line.substring(15, 19);
int airTemperature;
if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs
airTemperature = Integer.parseInt(line.substring(88, 92));
} else {
airTemperature = Integer.parseInt(line.substring(87, 92));
}
String quality = line.substring(92, 93);
if (airTemperature != MISSING && quality.matches("[01459]")) {
context.write(new Text(year), new IntWritable(airTemperature));
}
}
}
And the driver code is:
Example 2-5. Application to find the maximum temperature in the weather dataset
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MaxTemperature {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: MaxTemperature <input path> <output path>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(MaxTemperature.class);
job.setJobName("Max temperature");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I'm not able to understand why there is no iteration over lines, since we pass a file containing multiple lines. The code looks as if it is processing only a single line.
The book explains what Mapper<LongWritable, Text, ...> means: the key is the offset within the file, and the value is a line.
It also mentions that TextInputFormat is the default MapReduce input format, which is a type of FileInputFormat:
public class TextInputFormat
extends FileInputFormat<LongWritable,Text>
Therefore, the default input types are LongWritable, Text pairs.
As the JavaDoc says
Files are broken into lines. Either linefeed or carriage-return are used to signal end of line. Keys are the position in the file, and values are the line of text..
The book also has sections on defining custom RecordReaders
You need to call job.setInputFormatClass to change it to read anything other than single lines
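As a small sketch (not from the book) of the point above: the input format is what decides which (key, value) pairs each call to map() receives, and changing it is a one-line driver change.
    // Default behaviour: one call to map() per line, keyed by byte offset.
    // This is what the book's example relies on implicitly.
    job.setInputFormatClass(TextInputFormat.class);

    // Alternative, for example: split each line on the first tab into key and value,
    // in which case the Mapper would be declared as Mapper<Text, Text, ...>.
    // job.setInputFormatClass(KeyValueTextInputFormat.class);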

Getting NumberFormatException exception still after all input data is good

I am surprised to see this issue while parsing the file in the Mapper. My code is pretty simple: I read the file line by line and split each line on "::".
For example (input):
1::Toy Story (1995)::2077
Below is the snippet of mapper code that I usually use in practice:
String tokens[]= value.toString().split("::");
int empId = Integer.parseInt(tokens[0]) ;
int count = Integer.parseInt(tokens[2]) ;
Technically, the line should split as below:
1            Toy Story (1995)    2077
tokens[0]    tokens[1]           tokens[2]
So if I am only using tokens[0] and tokens[2], why is the job also picking up tokens[1] and throwing the NumberFormatException below? That exception would be expected if I were trying to parse a non-numeric string to an int. Could you please help me out with this?
17/09/05 19:06:49 INFO mapreduce.Job: Task Id : attempt_1500305785265_0095_m_000000_2, Status : FAILED
Error: java.lang.NumberFormatException: For input string: "1::Toy Story (1995)::2077"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:580)
at java.lang.Integer.parseInt(Integer.java:615)
at com.dataflair.comparableTest.ValueSortExp$MapTask.map(ValueSortExp.java:93)
at com.dataflair.comparableTest.ValueSortExp$MapTask.map(ValueSortExp.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:784)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)
CODE
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.IntWritable.Comparator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class ValueSortExp2 {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration(true);
String arguments[] = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "Test commond");
job.setJarByClass(ValueSortExp.class);
// Setup MapReduce
job.setMapperClass(ValueSortExp.MapTask.class);
job.setReducerClass(ValueSortExp.ReduceTask.class);
job.setNumReduceTasks(1);
// Specify key / value
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
//job.setSortComparatorClass(IntComparator.class);
// Input
FileInputFormat.addInputPath(job, new Path(arguments[0]));
job.setInputFormatClass(TextInputFormat.class);
// Output
FileOutputFormat.setOutputPath(job, new Path(arguments[1]));
job.setOutputFormatClass(TextOutputFormat.class);
/*
* // Delete output if exists FileSystem hdfs = FileSystem.get(conf); if
* (hdfs.exists(outputDir)) hdfs.delete(outputDir, true);
*
* // Execute job int code = job.waitForCompletion(true) ? 0 : 1;
* System.exit(code);
*/
// Execute job
int code = job.waitForCompletion(true) ? 0 : 1;
System.exit(code);
}
/*public static class IntComparator extends WritableComparator {
public IntComparator() {
super(IntWritable.class);
}
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
Integer v1 = ByteBuffer.wrap(b1, s1, l1).getInt();
Integer v2 = ByteBuffer.wrap(b2, s2, l2).getInt();
return v1.compareTo(v2) * (-1);
}
}*/
public static class MapTask extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
public void map(LongWritable key,Text value, Context context) throws IOException, InterruptedException {
String tokens[]= value.toString().split("::");
int empId = Integer.parseInt(tokens[0]) ;
int count = Integer.parseInt(tokens[2]) ;
context.write(new IntWritable(count), new IntWritable(empId));
}
}
public static class ReduceTask extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
public void reduce(IntWritable key, Iterable<IntWritable> list, Context context)
throws java.io.IOException, InterruptedException {
for (IntWritable value : list) {
context.write(key, value);
}
}
}
}
INPUT DATA
1::Toy Story (1995)::2077
10::GoldenEye (1995)::888
100::City Hall (1996)::128
1000::Curdled (1996)::20
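One way to narrow this down, offered only as a sketch under the assumption that the posted MapTask is the class actually packaged in the job jar, is to guard the parse so a record that does not split into three fields is reported and skipped instead of failing the whole task:
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] tokens = value.toString().split("::");
        // Defensive check: report and skip records that do not have the expected 3 fields.
        if (tokens.length < 3) {
            System.err.println("Unexpected record, did not split on '::': " + value);
            return;
        }
        int empId = Integer.parseInt(tokens[0].trim());
        int count = Integer.parseInt(tokens[2].trim());
        context.write(new IntWritable(count), new IntWritable(empId));
    }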

Empty reduce output while using Custom Partitioner in MapReduce

Problem Statement:
Input
Monami 45000 A
Tarun 34000 B
Riju 25000 C
Rita 42000 A
Mithun 40000 A
Archana 21000 C
Shovik 32000 B
I want to use Custom Partitioner in Mapreduce to separate employee records with grade A, B and C in three different output files.
Output 1
Monami 45000 A
Rita 42000 A
Mithun 40000 A
Output 2
Tarun 34000 B
Shovik 32000 B
Output 3
Riju 25000 C
Archana 21000 C
Map Code:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
//import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
public class Map
extends Mapper<LongWritable,Text,Text,Text>
{
//private Text key1 = new Text();
//private Text value1 = new Text();
@Override
protected void map(LongWritable key,Text value,Context context)
throws IOException,InterruptedException
{
String line = value.toString();
String[] part = line.split("\t");
int len = part.length;
//System.out.println(len);
if (len == 3)
{
context.write(new Text(part[2]), new Text(part[0]+"\t"+part[1]));
//System.out.println(part[0]+part[1]+part[2]);
}
}
}
Partitioner Code
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class CustomPartitioner
extends Partitioner<Text,Text>
{
@Override
public int getPartition(Text key, Text value, int numReduceTasks)
{
if(numReduceTasks==0)
return 0;
if(key.equals(new Text("A")))
return 0;
if(key.equals(new Text("B")))
return 1;
else
return 2;
}
}
Reduce Code
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
//import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce
extends Reducer<Text,Text,Text,Text>
{
@Override
protected void reduce(Text key,Iterable<Text> values,Context context)
throws IOException,InterruptedException
{
Iterator<Text> itr = values.iterator();
while(itr.hasNext())
{
context.write(new Text(itr.next().getBytes()),new Text(key));
}
}
}
Driver Class
import org.apache.hadoop.fs.Path;
//import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MapReduceDriver
{
public static void main(String[] args) throws Exception
{
Job job = new Job();
job.setJarByClass(MapReduceDriver.class);
job.setJobName("Custom Partitioner");
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(Map.class);
job.setPartitionerClass(CustomPartitioner.class);
job.setReducerClass(Reduce.class);
job.setNumReduceTasks(3);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true)?0:1);
}
}
The code runs without any errors, but the three reduce output files are empty. Also, when the job runs, it shows the map output bytes as zero, so I believe the map is not generating any key-value pairs. But I cannot find the reason. Can you help me find the mistake?
I also have one more point of confusion: in the Map class, when the variable len is checked for > 0, I get an ArrayIndexOutOfBoundsException, but it runs fine without any exception when checked with == 3. Why does it throw an exception with > 0?
The problem is that your input data (as pasted here) is not tab-separated, but space-separated. It should work fine if you replace this line:
String[] part = line.split("\t");
with this line:
String[] part = line.split(" ");
The reason you get an exception when you check for len > 0 is that the string is not split into any sub-parts, so len is 1. That satisfies the if condition, and the code then tries to access position 2 of part, which does not exist.
In the existing code, len is not 3, so the if block is never entered; hence, no exception is thrown.
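If the separator in the real file is uncertain (tabs in some copies, spaces in others), a slightly more tolerant variant, offered here only as a sketch, is to split on any run of whitespace:
    // Splits on any run of whitespace, so both "Monami 45000 A" and a
    // tab-separated equivalent produce the three expected parts.
    String[] part = line.trim().split("\\s+");
    if (part.length == 3) {
        context.write(new Text(part[2]), new Text(part[0] + "\t" + part[1]));
    }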

How to override the default sorting of Hadoop

I have a map-reduce job in which the keys are numbers from 1-200. My intended output was (number,value) in the number order.
But I'm getting the output as :
1 value
10 value
11 value
:
:
2 value
20 value
:
:
3 value
I know this is due to the default behavior of Map-Reduce to sort keys in ascending order.
I want my keys to be sorted in numerical order only. How can I achieve this?
If I had to take a guess, I'd say that you are storing your numbers as Text objects and not IntWritable objects.
Either way, once you have more than one reducer, only the items within a reducer will be sorted, but it won't be totally sorted.
The default WritableComparator in the MapReduce framework would normally handle your numerical ordering if the key were an IntWritable. I suspect it is getting a Text key, which results in lexicographical ordering in your case. Please have a look at the sample code below, which uses an IntWritable key to emit the values:
1) Mapper Implementation
package com.stackoverflow.answers.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SourceFileMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
private static final String DEFAULT_DELIMITER = "\t";
private IntWritable keyToEmit = new IntWritable();
private Text valueToEmit = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
keyToEmit.set(Integer.parseInt(line.split(DEFAULT_DELIMITER)[0]));
valueToEmit.set(line.split(DEFAULT_DELIMITER)[1]);
context.write(keyToEmit, valueToEmit);
}
}
2) Reducer Implementation
package com.stackoverflow.answers.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class SourceFileReducer extends Reducer<IntWritable, Text, IntWritable, Text> {
public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException,
InterruptedException {
for (Text value : values) {
context.write(key, value);
}
}
}
3) Driver Implementation
package com.stackoverflow.answers.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class SourceFileDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Path inputPath = new Path(args[0]);
Path outputDir = new Path(args[1]);
// Create configuration
Configuration conf = new Configuration(true);
// Create job
Job job = new Job(conf, "SourceFileDriver");
job.setJarByClass(SourceFileDriver.class);
// Setup MapReduce
job.setMapperClass(SourceFileMapper.class);
job.setReducerClass(SourceFileReducer.class);
job.setNumReduceTasks(1);
// Specify key / value
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
// Input
FileInputFormat.addInputPath(job, inputPath);
job.setInputFormatClass(TextInputFormat.class);
// Output
FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormatClass(TextOutputFormat.class);
// Delete output if exists
FileSystem hdfs = FileSystem.get(conf);
if (hdfs.exists(outputDir))
hdfs.delete(outputDir, true);
// Execute job
int code = job.waitForCompletion(true) ? 0 : 1;
System.exit(code);
}
}
Thank you!
