My Map Reduce Structure
/**
 * Driver plus mapper/reducer skeletons for two MapReduce jobs run back to back.
 * The second job is configured to read the first job's part-r-00000 output file.
 */
public class ChainingMapReduce {
/** Mapper skeleton for the first job; declared as (Object, Text) in, (Text, IntWritable) out. */
public static class ChainingMapReduceMapper
extends Mapper<Object, Text, Text, IntWritable>{
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
// code
}
}
// NOTE(review): the brace below closes ChainingMapReduce itself, leaving every declaration
// after it outside the class. Looks like a paste artifact - confirm against the real file.
}
/** Reducer skeleton for the first job; main also registers it as that job's combiner. */
public static class ChainingMapReduceReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
//code
}
}
/** Mapper skeleton for the second job; same type parameters as the first mapper. */
public static class ChainingMapReduceMapper1
extends Mapper<Object, Text, Text, IntWritable>{
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
//code
}
}
// NOTE(review): another unmatched closing brace, same pattern as after the first mapper.
}
/** Reducer skeleton for the second job; main also registers it as that job's combiner. */
public static class ChainingMapReduceReducer1
extends Reducer<Text,IntWritable,Text,IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
//code
}
}
/**
 * Runs job "First" and, if it reports success, configures and runs job "Second".
 * Both input and output locations are hard-coded; args is not read.
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
// ---- first job ----
Configuration conf = new Configuration();
Job job = new Job(conf, "First");
job.setJarByClass(ChainingMapReduce.class);
job.setMapperClass(ChainingMapReduceMapper.class);
job.setCombinerClass(ChainingMapReduceReducer.class);
job.setReducerClass(ChainingMapReduceReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("/home/Desktop/log"));
FileOutputFormat.setOutputPath(job, new Path("/home/Desktop/temp/output"));
// The second job is only set up after this call has returned.
job.waitForCompletion( true );
System.out.println("First Job Completed.....Starting Second Job");
System.out.println(job.isSuccessful());
/* FileSystem hdfs = FileSystem.get(conf);
Path fromPath = new Path("/home/Desktop/temp/output/part-r-00000");
Path toPath = new Path("/home/Desktop/temp/output1");
hdfs.rename(fromPath, toPath);
conf.clear();
*/
// ---- second job, only when the first one succeeded ----
if(job.isSuccessful()){
Configuration conf1 = new Configuration();
Job job1 = new Job(conf1,"Second");
job1.setJarByClass(ChainingMapReduce.class);
job1.setMapperClass(ChainingMapReduceMapper1.class);
job1.setCombinerClass(ChainingMapReduceReducer1.class);
job1.setReducerClass(ChainingMapReduceReducer1.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(IntWritable.class);
// NOTE(review): the next two calls are given `job` (the first job), not `job1`, so job1 has
// no input or output path when it is submitted below. Also, in the first call the closing
// quote of the path literal comes after the ')' so the parentheses do not balance as written.
FileInputFormat.addInputPath(job, new Path("/home/Desktop/temp/output/part-r-00000)");
FileOutputFormat.setOutputPath(job, new Path("/home/Desktop/temp/output1"));
System.exit(job1.waitForCompletion(true) ? 0 : 1);
}
// Reached only when the first job did not succeed, because the branch above always exits.
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
When I run this program, the first job executes perfectly, and after that the following error appears:
First Job Completed.....Starting Second Job true
12/01/27 15:24:21 INFO jvm.JvmMetrics: Cannot initialize JVM Metrics
with processName=JobTracker, sessionId= - already initialized 12/01/27
15:24:21 WARN mapred.JobClient: Use GenericOptionsParser for parsing
the arguments. Applications should implement Tool for the same.
12/01/27 15:24:21 WARN mapred.JobClient: No job jar file set. User
classes may not be found. See JobConf(Class) or
JobConf#setJar(String). 12/01/27 15:24:21 INFO mapred.JobClient:
Cleaning up the staging area
file:/tmp/hadoop/mapred/staging/4991311720439552/.staging/job_local_0002
Exception in thread "main"
org.apache.hadoop.mapred.InvalidJobConfException: Output directory not
set. at
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:123)
at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:872) at
org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:833) at
java.security.AccessController.doPrivileged(Native Method) at
javax.security.auth.Subject.doAs(Subject.java:396) at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
at
org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:833)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:476) at
org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:506) at
ChainingMapReduce.main(ChainingMapReduce.java:129)
I tried using the same "conf" for both jobs, and I also tried separate "conf" and "conf1" objects for the respective jobs.
Change
FileInputFormat.addInputPath(job, new Path("/home/Desktop/temp/output/part-r-00000)");
FileOutputFormat.setOutputPath(job, new Path("/home/Desktop/temp/output1"));
to
// Configure the second job's paths on job1 itself. The closing quote of the input path
// now sits inside the parenthesis so the call is balanced.
FileInputFormat.addInputPath(job1, new Path("/home/Desktop/temp/output/part-r-00000"));
FileOutputFormat.setOutputPath(job1, new Path("/home/Desktop/temp/output1"));
for the second job.
Also consider using org.apache.hadoop.mapred.jobcontrol.Job and Apache Oozie.
Related
I have been working on a MapReduce program, and it works well in the Hadoop HDFS environment in a virtual machine. But when I tried the same program on Windows with IntelliJ, I got this error.
WordCount.class // used this as sample program for testing whether it works or not.
/**
 * Word-count job: the mapper emits a count of one per token and the reducer
 * (also used as combiner) adds the counts up per token.
 */
public class WordCount {

    /** Splits every input value with a default {@link StringTokenizer} and emits {@code (token, 1)}. */
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The same Text instance is refilled for each token before it is written.
            for (StringTokenizer tokens = new StringTokenizer(value.toString());
                    tokens.hasMoreTokens(); ) {
                word.set(tokens.nextToken());
                context.write(word, one);
            }
        }
    }

    /** Adds up all counts received for a key and writes {@code (key, total)}. */
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable count : values) {
                total += count.get();
            }
            result.set(total);
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job, then exits with 0 on success and 1 otherwise.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);

        // Processing classes: the reducer doubles as the combiner.
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        // Output record types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
Intellij Error Log
2019-12-12 21:42:04,139 INFO [main] Configuration.deprecation (Configuration.java:warnOnceIfDeprecated(1181)) - session.id is deprecated. Instead, use dfs.metrics.session-id
2019-12-12 21:42:04,144 INFO [main] jvm.JvmMetrics (JvmMetrics.java:init(79)) - Initializing JVM Metrics with processName=JobTracker, sessionId=
2019-12-12 21:42:08,029 WARN [main] mapreduce.JobResourceUploader (JobResourceUploader.java:uploadFiles(64)) - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2019-12-12 21:42:08,089 INFO [main] mapreduce.JobSubmitter (JobSubmitter.java:submitJobInternal(251)) - Cleaning up the staging area file:/tmp/hadoop/mapred/staging/Abhishek1224360463/.staging/job_local1224360463_0001
Exception in thread "main" 0: No such file or directory
at org.apache.hadoop.io.nativeio.NativeIO$POSIX.chmod(NativeIO.java:236)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:767)
at org.apache.hadoop.fs.ChecksumFileSystem$1.apply(ChecksumFileSystem.java:506)
at org.apache.hadoop.fs.ChecksumFileSystem$FsOperation.run(ChecksumFileSystem.java:487)
at org.apache.hadoop.fs.ChecksumFileSystem.setPermission(ChecksumFileSystem.java:503)
at org.apache.hadoop.fs.FileSystem.mkdirs(FileSystem.java:619)
at org.apache.hadoop.mapreduce.JobResourceUploader.uploadFiles(JobResourceUploader.java:94)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:97)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:192)
at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1341)
at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1338)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1807)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1338)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1359)
at WordCount.main(WordCount.java:59)
I provide the input by passing directory names as arguments to the main class, i.e. by editing the run configuration and passing the name of the directory that contains the text file. (Input arguments: input output)
The input directory is under the project root folder.
Running IntelliJ in Administrator mode did the trick. That is weird, though. I would appreciate it if anyone could explain why this works.
I'm new to Hadoop and I need to read a Parquet file in the map stage of a MapReduce process. I found the following snippets of code at Cloudera:
public static class MyMap extends
Mapper<LongWritable, Group, NullWritable, Text> {
#Override
public void map(LongWritable key, Group value, Context context) throws IOException, InterruptedException {
NullWritable outKey = NullWritable.get();
String outputRecord = "";
// Get the schema and field values of the record
String inputRecord = value.toString();
// Process the value, create an output record
// ...
context.write(outKey, new Text(outputRecord));
}
}
Job configuration:
/**
 * Configures and runs a map-only job: {@code MyMap} as mapper, zero reduce tasks,
 * {@code ExampleInputFormat} for input and {@code TextOutputFormat} for output.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());

    // NOTE(review): MyMap is declared with NullWritable output keys, while the key classes
    // registered here are LongWritable and Text. Confirm which one is intended.
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMap.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ExampleInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Propagate the job outcome instead of reporting success unconditionally.
    return job.waitForCompletion(true) ? 0 : 1;
}
The question is: can I use my own types instead of these key and value types, and how would I implement that? I mean some sort of POJO that represents one record from the Parquet file.
I am learning Hadoop. I wrote a simple program in Java. The program is supposed to count words (and create a file with the words and the number of times each word appears), but it only creates a file with all the words and the number "1" next to every word. It looks like this:
rmd 1
rmd 1
rmd 1
rmd 1
rmdaxsxgb 1
But I want :
rmd 4
rmdaxsxgb 1
As far as I understand, only the map function runs. (I tried commenting out the reduce function and got the same result.)
My code (it is a typical example of a MapReduce program; it can easily be found on the internet or in books about Hadoop):
/**
 * Word-count job: Map emits (token, 1) for every token of a line, and Reduce is
 * written to add up the counts for each token.
 */
public class WordCount {
/** Tokenizes each input line with a default StringTokenizer and emits (token, 1). */
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
// The same Text instance is refilled for each token before it is written.
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
/** Sums the values handed in for a key and writes (key, sum). */
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
// NOTE(review): `values` is declared as Iterator here and the method carries no override
// annotation. If the base class's reduce takes an Iterable, this is only an overload and the
// framework would not call it - confirm against the Reducer API.
public void reduce(Text key, Iterator<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
context.write(key, new IntWritable(sum));
}
}
/**
 * Configures the job with Map and Reduce, text input/output formats, input path args[0]
 * and output path args[1], then waits for it to finish.
 */
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "wordcount");
job.setJarByClass(WordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// The boolean result of waitForCompletion is not used.
job.waitForCompletion(true);
} }
I use Hadoop on Amazon Web Services and don't understand why it doesn't work properly.
This could be because of mixing the two APIs. There are two APIs for Hadoop: the older one is mapred and the newer one is mapreduce.
In the newer API, the reducer receives the values as an Iterable, whereas your code uses an Iterator (as in the old API).
Try -
public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
#Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value:values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
}
}
It looks like no reducer is running in your Hadoop cluster.
You can set the number of reducers in three ways. You can set it in your mapred-site.xml by adding a property like
<!-- Sets the mapred.reduce.tasks property to 1. -->
<property>
<name>mapred.reduce.tasks</name>
<value>1</value>
</property>
OR by setting it in command line like
-D mapred.reduce.tasks=1
OR by defining it in your main class
job.setNumReduceTasks(1);
To set it permanently for all jobs, you should set the property in your mapred-site.xml.
I was trying to do a simple sort example with TotalOrderPartitioner. The input is a sequence file with IntWritable as key and NullWritable as value. I want to sort based on the key. The output is a sequence file with IntWritable as key and NullWritable as value. I'm running this job in a clustered environment. This is my driver class:
public class SortDriver extends Configured implements Tool {
#Override
public int run(String[] args) throws Exception {
Configuration conf = this.getConf();
Job job = Job.getInstance(conf);
job.setJobName("SORT-WITH-TOTAL-ORDER-PARTITIONER");
job.setJarByClass(SortDriver.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.setInputPaths(job, new Path("/user/client/seq-input"));
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(NullWritable.class);
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(conf, new Path("/user/client/partition.lst"));
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
SequenceFileOutputFormat.setOutputPath(job, new Path("/user/client/sorted-output"));
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(3);
InputSampler.Sampler<IntWritable, NullWritable> sampler = new InputSampler.RandomSampler<>(0.1, 200);
InputSampler.writePartitionFile(job, sampler);
boolean res = job.waitForCompletion(true);
return res ? 0 : 1;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new Configuration(), new SortDriver(), args));
}
}
Mapper class:
public class SortMapper extends Mapper<IntWritable, NullWritable, IntWritable, NullWritable>{
#Override
protected void map(IntWritable key, NullWritable value, Context context) throws IOException, InterruptedException {
context.write(key, value);
}
}
Reducer class:
public class SortReducer extends Reducer<IntWritable, NullWritable, IntWritable, NullWritable> {
#Override
protected void reduce(IntWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
When I run this job I get:
Error: java.lang.IllegalArgumentException: Can't read partitions file
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:116)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:73)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:678)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:747)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:340)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:167)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1557)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:162)
Caused by: java.io.FileNotFoundException: File file:/grid/hadoop/yarn/local/usercache/client/appcache/application_1406784047304_0002/container_1406784047304_0002_01_000003/_partition.lst does not exist
at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:511)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:724)
at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:501)
at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:397)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1749)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1773)
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.readPartitions(TotalOrderPartitioner.java:301)
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:88)
... 10 more
I found the partition file in my home directory (/user/client) with the name _partition.lst. The partition file name does not match the one in the code: TotalOrderPartitioner.setPartitionFile(conf, new Path("/user/client/partition.lst"));. Can anyone help me with this problem? I'm using Hadoop 2.4 in the HDP 2.1 distribution.
I think the problem is in the line:
TotalOrderPartitioner.setPartitionFile(conf, new Path("/user/client/partition.lst"));
You have to replace it with:
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/client/partition.lst"));
since you are using
InputSampler.writePartitionFile(job, sampler);
Otherwise, try replacing the last line only with:
InputSampler.writePartitionFile(conf, sampler);
But I am not sure if it works like that in the new API.
Hope it helps! Good luck!
I also got this error when I was using Hadoop MapReduce and the MapReduce service had not been installed and started. After installing MapReduce and starting it, the exception disappeared.
I got this error when I had job.setNumReduceTasks(3); and was running my code in standalone mode.
I changed it to job.setNumReduceTasks(1) and it worked fine in standalone mode.
I am trying to tweak an existing program to suit my needs.
Basically, the input is simple text.
I process it and pass key/value pairs to the reducer.
The reducer then creates a JSON string, so there is a key but no value.
So mapper:
Input: Text/Text
Output: Text/Text
Reducer: Text/Text
Output: Text/None
My signatures are as follows:
/**
 * Job sketch: the mapper forwards its input pair to the reducer, and the reducer turns
 * each value into a JSON string that is written as the output key with no value.
 */
public class AdvanceCounter {
/**
* Mapper that writes the incoming key and value to the output unchanged.
* NOTE(review): the input key is declared as Object while the output key type is Text,
* yet `key` is passed straight to context.write - confirm the intended key type.
*/
public static class TokenCounterMapper
extends Mapper<Object, Text, Text, Text> { // <--- See this signature
public void map(Object key, Text value, Context context) // <--- See this signature
throws IOException, InterruptedException {
context.write(key,value); //both are of type text OUTPUT TO REDUCER
}
}
/**
* Reducer that, for every value, splits it on tabs, splits the first field on commas into
* "targets", takes the second field as "source", and writes the resulting JSON text as key.
* NOTE(review): the ** markers around NullWritable below are emphasis from the post, not Java.
*/
public static class TokenCounterReducer
extends Reducer<Text, Text, Text, **NullWritable**> { // <--- See this signature Nullwritable here
public void reduce(Text key, Iterable<Text> values, Context context) // <--- See this signature
throws IOException, InterruptedException {
for (Text value : values) {
JSONObject jsn = new JSONObject();
//String output = "";
String[] vals = value.toString().split("\t");
// limit -1 keeps trailing empty strings in the split result
String[] targetNodes = vals[0].toString().split(",",-1);
try {
jsn.put("source",vals[1]);
jsn.put("targets",targetNodes);
// NOTE(review): a literal null is written as the value rather than a NullWritable instance.
context.write(new Text(jsn.toString()),null); // no value
} catch (JSONException e) {
// TODO Auto-generated catch block
// The exception is only printed; the loop then continues with the next value.
e.printStackTrace();
}
}
}
}
/**
* Parses generic options, configures the job and exits with 0 on success, 1 otherwise.
* The first two remaining arguments are used as input and output paths.
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "Example Hadoop 0.20.1 WordCount");
// ...
//
// Only the output value class is visible here; the rest of the configuration is elided above.
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
But on execution I am getting this error:
13/06/04 13:08:26 INFO mapred.JobClient: Task Id : attempt_201305241622_0053_m_000008_0, Status : FAILED
java.io.IOException: Type mismatch in value from map: expected org.apache.hadoop.io.NullWritable, recieved org.apache.hadoop.io.Text
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1019)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:691)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at org.sogou.Stinger$TokenCounterMapper.map(Stinger.java:72)
at org.sogou.Stinger$TokenCounterMapper.map(Stinger.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1093)
at org.apache.hadoop.mapred.Child.main(Child.java:249)
You haven't specified your map output types, so the job uses the same types you set for your reducer output, which are Text and NullWritable, and that is incorrect for your mapper. To avoid any confusion, it's better to specify all the types for both the mapper and the reducer, as follows:
// Types of the pairs the mapper emits.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Types of the pairs the reducer emits (the job's final output).
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);