How do I create a custom mapper class in MapReduce - Java

I have a unique requirement: I have to pass zip shell commands from a text file, and the mappers will process those commands to create zip files in parallel, using mappers only. I am thinking of executing each shell command using exec in Java. I am a bit stuck on how to implement the custom mapper, as my output would be in a compressed format.
Below is my mapper class -
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that breaks each input line into whitespace-delimited tokens and
 * emits every token as an output key paired with a {@link NullWritable} value.
 */
public class Map extends Mapper<LongWritable, Text, Text, NullWritable> {

    /**
     * Emits one (token, null) record per token found in the line.
     *
     * @param key     input key; not used by this method
     * @param value   the line to tokenize; also reused as the holder for each emitted token
     * @param context sink that receives the emitted records
     */
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final NullWritable nothing = NullWritable.get();
        // StringTokenizer with its default delimiters splits on whitespace.
        for (StringTokenizer tokens = new StringTokenizer(value.toString()); tokens.hasMoreTokens(); ) {
            value.set(tokens.nextToken());
            context.write(value, nothing);
        }
    }
}
Processor class -
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
public class ZipProcessor extends Configured implements Tool {
public static void main(String [] args) throws Exception{
int exitCode = ToolRunner.run(new ZipProcessor(), args);
System.exit(exitCode);
}
public int run(String[] args) throws Exception {
if(args.length!=2){
System.err.printf("Usage: %s needs two arguments, input and output files\n", getClass().getSimpleName());
return -1;
}
Configuration conf=new Configuration();
Job job = Job.getInstance(conf,"zipping");
job.setJarByClass(ZipProcessor.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapperClass(Map.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
int returnValue = job.waitForCompletion(true) ? 0:1;
if(job.isSuccessful()) {
System.out.println("Job was successful");
} else if(!job.isSuccessful()) {
System.out.println("Job was not successful");
}
return returnValue;
}
}
Sample mapr.txt
zip -r "/folder1/file.zip" "sourceFolder"
zip -r "/folder2/file.zip" "sourceFolder"
zip -r "/folder3/file.zip" "sourceFolder"

Related

Hadoop program (java) to read comma separated input file

I have an input file that looks like below
000001928162247ffaf63185cd8b2a244c78e7c6,2009324,abcat0101001,Sharp,"2011-09-05 12:25:37.42","2011-09-05 12:25:01.187"
0001be1731ee7d1c519bc7e87110c9eb880cb396,1649294,abcat0715001,"Gunnar eyewear","2011-09-23 17:13:36.175","2011-09-23 17:12:18.389"
0001bfa0c494c01f9f8c141c476c11bb4625a746,17240521,cat02015,refrigerator,"2011-10-19 23:43:51.71","2011-10-19 23:43:06.485"
0001fb09f03fea4d04e2267ed3194c806839d997,1271997,abcat0513004,Razer,"2011-09-07 09:20:07.11","2011-09-07 09:19:03.279"
0002965b083b6e508f7740c47c8f39e1072b4219,3562379,pcmcat209400050001,"I phone 4","2011-10-27 14:10:31.92","2011-10-27 14:09:33.327"
0002bb28a9ca07f5515b01996fd5d7ca84742e41,3230638,pcmcat177200050009,"hd antenna","2011-10-20 00:03:49.966","2011-10-20 00:02:01.458"
0002bd9c3d654698bb514194c4f4171ad6992266,9947181,pcmcat253300050012,printer,"2011-10-06 19:51:40.984","2011-10-06 19:47:13.803"
0002fee45e1c32eb94e82fc6c15c4db14e796248,3519969,pcmcat247400050000,vaio,"2011-10-19 23:31:51.015","2011-10-19 23:31:12.213"
00042033d355973baf9454b021a15c6b5b48f4a3,2677297,pcmcat212600050008,"desk top","2011-08-29 12:03:38.265","2011-08-29 12:03:12.348"
000433e0ef411c2cb8ee1727002d6ba15fe9426b,8959317,cat02015,"how i met your mother","2011-09-17 19:44:40.129","2011-09-17 19:43:37.564"
which contains the following information
user_id,product_id,category,query,click_time,query_time
I want to read this file in Hadoop and extract the user_id and the category (the 1st and 3rd fields). I have a basic Hadoop program, shown below, that I used for word count. In this task, the items are separated by commas, and I have to store them in an ArrayList.
Here is my starting program:
import java.io.IOException;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
public class popularcats extends Configured implements Tool {
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
job.setJarByClass(getClass());
job.setMapperClass(TokenCounterMapper.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String [] args) throws Exception {
int exitCode = ToolRunner.run(new popularcats(), args);
System.exit(exitCode);
}
}
I suppose Hadoop must have some classes for reading in a CSV file. I found this class CSVLineRecordReader from this address
https://github.com/mvallebr/CSVInputFormat/blob/master/src/main/java/org/apache/hadoop/mapreduce/lib/input/CSVLineRecordReader.java
So how should I process this file and extract the desired fields?

Hadoop WordCount code, the following errors are shown

I was reading and implementing this tutorial. At the end, I implemented the three classes: Mapper, Reducer and Driver. I copied the exact code given on the webpage for all three classes, but the following two errors didn't go away:
Mapper Class
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class WordCountMapper extends MapReduceBase // Here WordCountMapper was underlined as error source by Eclipse
implements Mapper<LongWritable, Text, Text, IntWritable> {
private final IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line.toLowerCase());
while(itr.hasMoreTokens()) {
word.set(itr.nextToken());
output.collect(word, one);
}
}
}
The error was:
The type WordCountMapper must implement the inherited abstract method
Mapper.map(LongWritable, Text,
OutputCollector, Reporter)
Driver Class (WordCount.java)
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
/**
 * Old-API (org.apache.hadoop.mapred) driver for the word-count job. Reads from
 * the "input" directory and writes to the "output" directory.
 */
public class WordCount {

    /**
     * Configures the job and runs it, blocking until it finishes.
     *
     * @param args command-line arguments; not used, the paths are fixed
     */
    public static void main(String[] args) {
        JobConf conf = new JobConf(WordCount.class);

        // specify output types
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // specify input and output dirs
        // The helper classes are FileInputFormat / FileOutputFormat; there are
        // no classes named FileInputPath / FileOutputPath, and the output
        // method is setOutputPath (a job has a single output directory).
        FileInputFormat.addInputPath(conf, new Path("input"));
        FileOutputFormat.setOutputPath(conf, new Path("output"));

        // specify a mapper
        conf.setMapperClass(WordCountMapper.class);

        // specify a reducer (also used as the combiner)
        conf.setReducerClass(WordCountReducer.class);
        conf.setCombinerClass(WordCountReducer.class);

        try {
            // runJob is static, so no JobClient instance is required.
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
            // Report the failure to the caller instead of exiting with status 0.
            System.exit(1);
        }
    }
}
The error was:
FileInputPath cannot be resolved
FileOutputPath cannot be resolved
Use this
FileInputFormat.addInputPath(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf,new Path(args[1]));
or
FileInputFormat.addInputPath(conf, new Path("inputfile.txt"));
FileOutputFormat.setOutputPath(conf,new Path("outputfile.txt"));
instead of this
// specify input and output dirs
FileInputPath.addInputPath(conf, new Path("input")); //////////FileInputPath was underlined
FileOutputPath.addOutputPath(conf, new Path("output")); ////////FileOutputPath as underlined
These are the classes to import in this case:
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;

When to use NLineInputFormat in Hadoop Map-Reduce?

I have a text-based input file of around 25 GB. In that file, a single record consists of 4 lines. The processing for every record is the same, but inside every record each of the four lines is processed differently.
I'm new to Hadoop, so I wanted guidance on whether to use NLineInputFormat in this situation or the default TextInputFormat. Thanks in advance!
Assuming you have the text file in the following format :
2015-8-02
error2014 blahblahblahblah
2015-8-02
blahblahbalh error2014
You could use NLineInputFormat.
With NLineInputFormat functionality, you can specify exactly how many lines should go to a mapper.
In your case you can use it to send 4 lines to each mapper.
EDIT:
Here is an example for using NLineInputFormat:
Mapper Class:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MapperNLine extends Mapper<LongWritable, Text, LongWritable, Text> {
#Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
Driver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Driver extends Configured implements Tool {
#Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out
.printf("Two parameters are required for DriverNLineInputFormat- <input dir> <output dir>\n");
return -1;
}
Job job = new Job(getConf());
job.setJobName("NLineInputFormat example");
job.setJarByClass(Driver.class);
job.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.addInputPath(job, new Path(args[0]));
job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", 4);
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MapperNLine.class);
job.setNumReduceTasks(0);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new Driver(), args);
System.exit(exitCode);
}
}

Caused by: java.lang.ClassNotFoundException: com.example.Example$ReduceTask while running mapreduce job

I am trying to run the mapreduce job shown below, but it's giving me a ClassNotFoundException even though this inner class is present in the jar. Can anyone give me a hint?
package com.example;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
public class Example {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setReducerClass(Example.ReduceTask.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
public static class ReduceTask
extends Reducer<LongWritable, Text, Text, Text> {
public void reduce(LongWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value: values) {
String[] cols = value.toString().split(",");
context.write(new Text(cols[0]), value);
break;
}
}
}
}
java.lang.RuntimeException: java.lang.ClassNotFoundException: com.example.Example$ReduceTask
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:996)
at org.apache.hadoop.mapreduce.JobContext.getReducerClass(JobContext.java:236)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:556)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:414)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
I am running it via command:
hadoop jar PracticeHadoop.jar com.example.Example workspace/input workspace/op

how to access the texts from a text file present in hadoop cache

I have 2 files which need to be accessed by the Hadoop cluster. Those two files are good.txt and bad.txt respectively.
Since both these files need to be accessed from different nodes, I first place them in the distributed cache in the driver class as follows
// Register both word lists in the distributed cache; the Job is created from
// this configuration afterwards.
Configuration conf = new Configuration();
URI goodList = new URI("/user/training/Rakshith/good.txt");
URI badList = new URI("/user/training/Rakshith/bad.txt");
DistributedCache.addCacheFile(goodList, conf);
DistributedCache.addCacheFile(badList, conf);
Job job = new Job(conf);
Now both the good and bad files are placed in the distributed cache. I access the distributed cache in the mapper class as follows
public class LetterMapper extends Mapper<LongWritable,Text,LongWritable,Text> {
private Path[]files;
#Override
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException {
files=DistributedCache.getLocalCacheFiles(new Configuration(context.getConfiguration()));
}
I need to check if a word is present in good.txt or bad.txt, so I use something like this
File file = new File(files[0].toString()); // to access good.txt
BufferedReader br = new BufferedReader(new FileReader(file));
StringBuilder sb = new StringBuilder();
String input = null;
try {
    // readLine() strips line terminators, so the lines are joined back to back.
    while ((input = br.readLine()) != null) {
        sb.append(input);
    }
} finally {
    // Release the file handle even if reading fails.
    br.close();
}
input = sb.toString();
I am supposed to get the content of the good file in my input variable, but I don't get it. Have I missed anything?
Does the job finish successfully? The map task may fail because you are using JobConf in this line
files=DistributedCache.getLocalCacheFiles(new JobConf(context.getConfiguration()));
If you change it like this it should work, I don't see any problem with remaining code you posted in question.
files=DistributedCache.getLocalCacheFiles(context.getConfiguration());
or
files=DistributedCache.getLocalCacheFiles(new Configuration(context.getConfiguration()));
@rVr this is my driver class
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the "Average Word Length" job: registers good.txt and bad.txt in
 * the distributed cache and runs {@link LetterMapper} over the input directory.
 */
public class AvgWordLength {

    /**
     * Configures the job, runs it, and exits with 0 on success or 1 on failure.
     *
     * @param args exactly two entries: the input directory and the output directory
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.printf("Usage: AvgWordLength <input dir> <output dir>\n");
            System.exit(-1);
        }

        final Path inputDir = new Path(args[0]);
        final Path outputDir = new Path(args[1]);

        // The cache files are added before the Job is created from the configuration.
        Configuration configuration = new Configuration();
        String[] cachedLists = {
            "/user/training/Rakshith/good.txt",
            "/user/training/Rakshith/bad.txt"
        };
        for (String cachedList : cachedLists) {
            DistributedCache.addCacheFile(new URI(cachedList), configuration);
        }

        Job wordJob = new Job(configuration);
        wordJob.setJobName("Average Word Length");
        wordJob.setJarByClass(AvgWordLength.class);
        wordJob.setMapperClass(LetterMapper.class);
        wordJob.setMapOutputKeyClass(LongWritable.class);
        wordJob.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(wordJob, inputDir);
        FileOutputFormat.setOutputPath(wordJob, outputDir);

        if (wordJob.waitForCompletion(true)) {
            System.exit(0);
        } else {
            System.exit(1);
        }
    }
}
And my mapper class is
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
public class LetterMapper extends Mapper<LongWritable,Text,LongWritable,Text> {
private Path[]files;
#Override
protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
throws IOException, InterruptedException {
files=DistributedCache.getLocalCacheFiles(new Configuration(context.getConfiguration()));
System.out.println("in setup()"+files.toString());
}
#Override
public void map(LongWritable key, Text value, Context context)throws IOException,InterruptedException{
int i=0;
System.out.println("in map----->>"+files.toString());//added just to view logs
HashMap<String,String> h=new HashMap<String,String>();
String negword=null;
String input=value.toString();
if(isPresent(input,files[0].toString()){
h.put(input,"good");
}
else
if(isPresent(input,files[1].toString()){
h.put(input,"bad");
}
}
public static boolean isPresent(String n,Path files2) throws IOException{
File file=new File(files2.toString());
BufferedReader br=new BufferedReader(new FileReader(file));
StringBuilder sb=new StringBuilder();
String input=null;
while((input=br.readLine().toString())!=null){
sb.append(input.toString());
}
input=sb.toString();
//System.out.println(input);
Pattern pattern=Pattern.compile(n);
Matcher matcher=pattern.matcher(input);
if(matcher.find()){
return true;
}
else
return false;
}
}

Categories

Resources