Wordcount on (Key,Value) outputs from a Map Reduce - java

I have several (title, text) ordered pairs obtained as output from a MapReduce application in Hadoop written in Java.
Now I would like to implement word count on the text field of these ordered pairs.
So my final output should look like:
(title-a , word-a-1 , count-a-1 , word-a-2 , count-a-2 ....)
(title-b , word-b-1, count-b-1 , word-b-2 , count-b-2 ....)
.
.
.
.
(title-x , word-x-1, count-x-1 , word-x-2 , count-x-2 ....)
To summarize, I want to implement word count separately on the output records from the first MapReduce job. Can someone suggest a good way to do it, or how I can chain a second MapReduce job to create the above output or format it better?
The following is the code; I borrowed it from GitHub and made some changes:
package com.org;
import javax.xml.stream.XMLStreamConstants;//XMLInputFactory;
import java.io.*;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import javax.xml.stream.*;
public class XmlParser11
{
public static class XmlInputFormat1 extends TextInputFormat {
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
/**
* XMLRecordReader class to read through a given xml document to output
* xml blocks as records as specified by the start tag and end tag
*
*/
// @Override
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
FileSplit fileSplit = (FileSplit) split;
// open the file and seek to the start of the split
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws IOException,
InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0,
buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}
public static class Map extends Mapper<LongWritable, Text,Text, Text> {
@Override
protected void map(LongWritable key, Text value,
Mapper.Context context)
throws
IOException, InterruptedException {
String document = value.toString();
System.out.println("'" + document + "'");
try {
XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(new
ByteArrayInputStream(document.getBytes()));
String propertyName = "";
String propertyValue = "";
String currentElement = "";
while (reader.hasNext()) {
int code = reader.next();
switch (code) {
case XMLStreamConstants.START_ELEMENT: //START_ELEMENT:
currentElement = reader.getLocalName();
break;
case XMLStreamConstants.CHARACTERS: //CHARACTERS:
if (currentElement.equalsIgnoreCase("title")) {
propertyName += reader.getText();
//System.out.println(propertyName);
} else if (currentElement.equalsIgnoreCase("text")) {
propertyValue += reader.getText();
//System.out.println(propertyValue);
}
break;
}
}
reader.close();
context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
}
catch(Exception e){
throw new IOException(e);
}
}
}
public static class Reduce
extends Reducer<Text, Text, Text, Text> {
@Override
protected void setup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("<Start>"), null);
}
@Override
protected void cleanup(
Context context)
throws IOException, InterruptedException {
context.write(new Text("</Start>"), null);
}
private Text outputKey = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
for (Text value : values) {
outputKey.set(constructPropertyXml(key, value));
context.write(outputKey, null);
}
}
public static String constructPropertyXml(Text name, Text value) {
StringBuilder sb = new StringBuilder();
sb.append("<property><name>").append(name)
.append("</name><value>").append(value)
.append("</value></property>");
return sb.toString();
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<page>");
conf.set("xmlinput.end", "</page>");
Job job = new Job(conf);
job.setJarByClass(XmlParser11.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(XmlParser11.Map.class);
job.setReducerClass(XmlParser11.Reduce.class);
job.setInputFormatClass(XmlInputFormat1.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
The word count code found online counts words across all input files and gives a single output. I want to do the word count for each text field separately. The mapper above pulls the title and text out of an XML document. Is there any way I can do the word count in the same mapper? If I do that, my next question is how I pass it along with the already existing (title, text) key-value pairs to the reducer. Sorry, I am not able to phrase my question well, but I hope the idea is clear.

I'm not sure if I have understood it properly, so I have many questions along with my answer.
First of all, whoever wrote this code was probably trying to show how to write a custom InputFormat to process XML data with MapReduce. I don't know how it is related to your problem.
To summarize, I want to implement word count separately on the output records from the first MapReduce job. Can someone suggest a good way to do it
Read the output file generated by the first MR job and run the word count on it.
or how I can chain a second MapReduce job to create the above output or format it better?
You can definitely chain jobs together in this fashion by writing multiple driver methods, one for each job. See this for more details and this for an example.
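As a rough illustration (not a drop-in solution; the job names and the argument indices, e.g. args[2] for the second output directory, are placeholders), a single driver can run both jobs back to back, feeding the first job's output directory into the second job:

Configuration conf = new Configuration();

Job first = Job.getInstance(conf, "xml-extract");
first.setJarByClass(XmlParser11.class);
// ... mapper, reducer and input format for the first job, exactly as in the code above ...
FileInputFormat.addInputPath(first, new Path(args[0]));
FileOutputFormat.setOutputPath(first, new Path(args[1]));
if (!first.waitForCompletion(true)) {
    System.exit(1); // stop the chain if the first job fails
}

Job second = Job.getInstance(conf, "per-title-wordcount");
second.setJarByClass(XmlParser11.class);
// ... mapper and reducer for the word-count job (see the sketch further down) ...
FileInputFormat.addInputPath(second, new Path(args[1])); // the first job's output is the second job's input
FileOutputFormat.setOutputPath(second, new Path(args[2]));
System.exit(second.waitForCompletion(true) ? 0 : 1);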
I want to do the word count for each text field separately.
What do you mean by separately? In the traditional word count program, the count of each word is calculated independently of the others.
Is there any way I can do the word count in the same mapper?
I hope you have understood the word count program properly. In the traditional word count program you read the input file one line at a time, split the line into words, and then emit each word as the key with 1 as the value. All of this happens inside the Mapper, and it is all the same Mapper. The total count for each word is then determined in the Reducer part of your job. If you wish to emit the words with their total counts from the mapper itself, you have to read the whole file in the Mapper and do the counting there. For that you need to override isSplitable() in your InputFormat to return false, so that your input file is read as a whole and goes to just one Mapper, e.g. as in the sketch below.
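For reference, overriding isSplitable() (note the single 't' in the Hadoop API) looks roughly like this; WholeFileTextInputFormat is just an illustrative name, not something from your code:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class WholeFileTextInputFormat extends TextInputFormat {
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // never split the file, so the whole file is consumed by a single mapper
        return false;
    }
}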
When you emit something from the Mapper, and it is not a map-only job, the output of your Mapper automatically goes to the Reducer. Do you need something else? If what you want is a second pass over the first job's (title, text) records, see the sketch below.
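To answer the "how do I pass it to the reducer" part concretely, here is a sketch (my own, untested against your data) of what the second job's mapper and reducer could look like if the first job's output is read with KeyValueTextInputFormat, which hands the part of each line before the first tab (the title) to the mapper as the key and the rest (the text) as the value:

public static class PerTitleWordCountMapper extends Mapper<Text, Text, Text, Text> {
    @Override
    protected void map(Text title, Text text, Context context)
            throws IOException, InterruptedException {
        // emit (title, word) once per word occurrence in the text field
        for (String word : text.toString().split("\\s+")) {
            if (!word.isEmpty()) {
                context.write(title, new Text(word));
            }
        }
    }
}

public static class PerTitleWordCountReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text title, Iterable<Text> words, Context context)
            throws IOException, InterruptedException {
        // count all words of this title in memory, then emit a single line per title
        java.util.Map<String, Integer> counts = new java.util.HashMap<String, Integer>();
        for (Text w : words) {
            Integer old = counts.get(w.toString());
            counts.put(w.toString(), old == null ? 1 : old + 1);
        }
        StringBuilder sb = new StringBuilder();
        for (java.util.Map.Entry<String, Integer> e : counts.entrySet()) {
            sb.append(e.getKey()).append(',').append(e.getValue()).append(' ');
        }
        context.write(title, new Text(sb.toString().trim()));
    }
}

The second job's driver would set job.setInputFormatClass(KeyValueTextInputFormat.class) and use Text for all key and value classes. Note that this buffers one title's word counts in memory, which is fine as long as a single text field is not huge.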

I suggest you go with a regular expression and perform the mapping and grouping that way.
The Hadoop examples JAR provides a Grep class; with it you can map your HDFS data using a regular expression and then group the mapped data.

Related

Hadoop reducer ArrayIndexOutOfBoundsException when passing values from mapper

I'm trying to output two values from the mapper to the reducer by passing them as a single string, but when I parse the string I get an out-of-bounds error. However, I built the string in the Mapper, so I'm sure it has two values. What am I doing wrong? How can I pass two values from the mapper to the reducer? (Eventually I need to pass more variables to the reducer, but this makes the problem a bit simpler.)
This is the error:
Error: java.lang.ArrayIndexOutOfBoundsException: 1
at TotalTime$TimeReducer.reduce(TotalTime.java:57)
at TotalTime$TimeReducer.reduce(TotalTime.java:1)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:628)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:390)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:174)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:168)
and this is my code
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TotalTime {
public static class TimeMapper extends Mapper<Object, Text, Text, Text> {
Text textKey = new Text();
Text textValue = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String data = value.toString();
String[] field = data.split(",");
if (null != field && field.length == 4) {
String strTimeIn[] = field[1].split(":");
String strTimeOout[] = field[2].split(":");
int timeOn = Integer.parseInt(strTimeIn[0]) * 3600 + Integer.parseInt(strTimeIn[1]) * 60 + Integer.parseInt(strTimeIn[2]);
int timeOff = Integer.parseInt(strTimeOout[0]) * 3600 + Integer.parseInt(strTimeOout[1]) * 60 + Integer.parseInt(strTimeOout[2]);
String v = String.valueOf(timeOn) + "," + String.valueOf(timeOff);
textKey.set(field[0]);
textValue.set(v);
context.write(textKey, textValue);
}
}
}
public static class TimeReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
Text textValue = new Text();
int sumTime = 0;
for (Text val : values) {
String line = val.toString();
// Split the string by commas
String[] field = line.split(",");
int timeOn = Integer.parseInt(field[0]);
int timeOff = Integer.parseInt(field[1]);
int time = timeOff - timeOn;
sumTime += time;
}
String v = String.valueOf(sumTime);
textValue.set(v);
context.write(key, textValue);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "User Score");
job.setJarByClass(TotalTime.class);
job.setMapperClass(TimeMapper.class);
job.setCombinerClass(TimeReducer.class);
job.setReducerClass(TimeReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The input file looks like this:
ID2347,15:40:51,16:21:44,20
ID4568,14:27:57,14:58:04,72
ID8755,13:40:49,13:42:31,99
ID3258,13:12:48,13:37:11,73
ID9666,13:44:34,15:53:36,114
ID8755,09:43:59,10:47:52,123
ID3258,10:25:22,10:41:12,14
ID9666,09:40:10,11:44:01,15
It seems it is the combiner that causes your code to fail. Remember, the combiner is a piece of code that runs before the reducer. Now imagine this scenario:
your mapper process this line:
ID2347,15:40:51,16:21:44,20
and writes the following output to the context:
[ID2347, (56451,58904)]
Now the combiner comes into play, processes the output of your mapper before the reducer, and produces this:
[ID2347, 2453]
Now the above line goes to the reducer, and it fails because your code assumes the value looks like val1,val2.
If you want your code to work, just remove the combiner [or change your logic]; one possible reshaping is sketched below.
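If you do want to keep a combiner, one option (a sketch, assuming only the total time per ID matters) is to let the mapper emit the difference as an IntWritable, so that the combiner and the reducer perform the same simple sum and the value format never changes shape:

public static class TimeMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final Text textKey = new Text();
    private final IntWritable duration = new IntWritable();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String[] field = value.toString().split(",");
        if (field.length == 4) {
            String[] in = field[1].split(":");
            String[] out = field[2].split(":");
            int timeOn = Integer.parseInt(in[0]) * 3600 + Integer.parseInt(in[1]) * 60 + Integer.parseInt(in[2]);
            int timeOff = Integer.parseInt(out[0]) * 3600 + Integer.parseInt(out[1]) * 60 + Integer.parseInt(out[2]);
            textKey.set(field[0]);
            duration.set(timeOff - timeOn);
            context.write(textKey, duration); // emit the difference directly
        }
    }
}

public static class TimeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable total = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get(); // summing is associative, so the same class is safe to use as a combiner
        }
        total.set(sum);
        context.write(key, total);
    }
}

With this shape the driver would also need job.setOutputValueClass(IntWritable.class) and an import for org.apache.hadoop.io.IntWritable.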

Hadoop - MapReduce not reducing

I'm trying to reduce a map like this:
01 true
01 true
01 false
02 false
02 false
where the first column is Text and the second is BooleanWritable. The aim is to keep only those keys that have nothing but false next to them, and then write the first column's digits as pairs (so the output for the above input would be 0 2). For this, I wrote the following reducer:
import java.io.IOException;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class BeadReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text _key, Iterable<BooleanWritable> values, Context context) throws IOException, InterruptedException {
// process values
boolean dontwrite= false;
for (BooleanWritable val : values) {
dontwrite = (dontwrite || val.get());
}
if (!dontwrite) {
context.write(new Text(_key.toString().substring(0,1)), new Text(_key.toString().substring(1,2)));
}
else {
context.write(new Text("not"), new Text("good"));
}
}
}
This, however, does nothing: it writes neither the pairs nor "not good", as if it never even enters the if-else branch. All I get are the mapped values (the mapping works as intended).
The driver:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class BeadDriver {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "task2");
job.setJarByClass(hu.pack.task2.BeadDriver.class);
// TODO: specify a mapper
job.setMapperClass(hu.pack.task2.BeadMapper.class);
// TODO: specify a reducer
job.setReducerClass(hu.pack.task2.BeadReducer.class);
// TODO: specify output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BooleanWritable.class);
// TODO: specify input and output DIRECTORIES (not files)
FileInputFormat.setInputPaths(job, new Path("local"));
FileOutputFormat.setOutputPath(job, new Path("outfiles"));
FileSystem fs;
try {
fs = FileSystem.get(conf);
if (fs.exists(new Path("outfiles")))
fs.delete(new Path("outfiles"),true);
} catch (IOException e1) {
e1.printStackTrace();
}
if (!job.waitForCompletion(true))
return;
}
}
The mapper:
import java.io.IOException;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class BeadMapper extends Mapper<LongWritable, Text, Text, BooleanWritable > {
private final Text wordKey = new Text("");
public void map(LongWritable ikey, Text value, Context context) throws IOException, InterruptedException {
String[] friend = value.toString().split(";");
String[] friendswith = friend[1].split(",");
for (String s : friendswith) {
wordKey.set(friend[0] + s);
context.write(wordKey, new BooleanWritable(true));
wordKey.set(s + friend[0]);
context.write(wordKey, new BooleanWritable(true));
}
if (friendswith.length > 0) {
for(int i = 0; i < friendswith.length-1; ++i) {
for(int j = i+1; j < friendswith.length; ++j) {
wordKey.set(friendswith[i] + friendswith[j]);
context.write(wordKey, new BooleanWritable(false));
}
}
}
}
}
I wonder what the problem is, what am I missing?
The output key and value types of the mapper must be the input types of the reducer, and therefore in your case the reducer must inherit from
Reducer<Text, BooleanWritable, Text, BooleanWritable>
The setOutputKeyClass and setOutputValueClass methods set the types for the job output, i.e. for both map and reduce. If you want to specify different types for the mapper, you should use the methods setMapOutputKeyClass and setMapOutputValueClass.
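Concretely, one consistent combination (a sketch, assuming you want the reducer to keep writing Text keys and Text values as in your code) would be:

// reducer input types = mapper output types; output types = what the reducer writes
public class BeadReducer extends Reducer<Text, BooleanWritable, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<BooleanWritable> values, Context context)
            throws IOException, InterruptedException {
        // ... same body as in the question ...
    }
}

// driver: declare the map output types separately from the job (reduce) output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BooleanWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);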
As a side note, if you don't want the true values in the output, why emit them from the mapper at all? Also, with the code below in the reducer,
for (BooleanWritable val : values) {
dontwrite = (dontwrite || val.get());
}
once dontwrite becomes true it stays true until the end of the loop, so you may want to change your logic for optimization.
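For example, you could break out of the loop as soon as a true value is seen:

boolean dontwrite = false;
for (BooleanWritable val : values) {
    if (val.get()) {
        dontwrite = true;
        break; // one true is enough, no need to scan the remaining values
    }
}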

How to format the output being written by Mapreduce in Hadoop

I am trying to reverse the contents of the file word by word. I have the program running fine, but the output I am getting looks like this:
1 dwp
2 seviG
3 eht
4 tnerruc
5 gnikdrow
6 yrotcerid
7 ridkm
8 desU
9 ot
10 etaerc
I want the output to be something like this
dwp seviG eht tnerruc gnikdrow yrotcerid ridkm desU
ot etaerc
The code I am working with:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class Reproduce {
public static int temp =0;
public static class ReproduceMap extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, Text>{
private Text word = new Text();
@Override
public void map(LongWritable arg0, Text value,
OutputCollector<IntWritable, Text> output, Reporter reporter)
throws IOException {
String line = value.toString().concat("\n");
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(new StringBuffer(tokenizer.nextToken()).reverse().toString());
temp++;
output.collect(new IntWritable(temp),word);
}
}
}
public static class ReproduceReduce extends MapReduceBase implements Reducer<IntWritable, Text, IntWritable, Text>{
@Override
public void reduce(IntWritable arg0, Iterator<Text> arg1,
OutputCollector<IntWritable, Text> arg2, Reporter arg3)
throws IOException {
String word = arg1.next().toString();
Text word1 = new Text();
word1.set(word);
arg2.collect(arg0, word1);
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(WordCount.class);
conf.setJobName("wordcount");
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(ReproduceMap.class);
conf.setReducerClass(ReproduceReduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
How do I modify my output to look like this, instead of writing another Java program to do it?
Thanks in advance.
Here is some simple code demonstrating the use of a custom FileOutputFormat:
public class MyTextOutputFormat extends FileOutputFormat<Text, List<IntWritable>> {
@Override
public org.apache.hadoop.mapreduce.RecordWriter<Text, List<IntWritable>> getRecordWriter(TaskAttemptContext arg0) throws IOException, InterruptedException {
//get the current path
Path path = FileOutputFormat.getOutputPath(arg0);
//create the full path with the output directory plus our filename
Path fullPath = new Path(path, "result.txt");
//create the file in the file system
FileSystem fs = path.getFileSystem(arg0.getConfiguration());
FSDataOutputStream fileOut = fs.create(fullPath, arg0);
//create our record writer with the new file
return new MyCustomRecordWriter(fileOut);
}
}
public class MyCustomRecordWriter extends RecordWriter<Text, List<IntWritable>> {
private DataOutputStream out;
public MyCustomRecordWriter(DataOutputStream stream) {
out = stream;
try {
out.writeBytes("results:\r\n");
}
catch (Exception ex) {
}
}
@Override
public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
//close our file
out.close();
}
@Override
public void write(Text arg0, List arg1) throws IOException, InterruptedException {
//write out our key
out.writeBytes(arg0.toString() + ": ");
//loop through all values associated with our key and write them with commas between
for (int i=0; i<arg1.size(); i++) {
if (i>0)
out.writeBytes(",");
out.writeBytes(String.valueOf(arg1.get(i)));
}
out.writeBytes("\r\n");
}
}
Finally, we need to tell our job about our output format and the path before running it:
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayList.class);
job.setOutputFormatClass(MyTextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path("/home/hadoop/out"));
We can customize the output by writing a custom FileOutputFormat class.
You can use NullWritable as the output value. NullWritable is just a placeholder, since you don't want the number displayed as part of your output. I have given a modified reducer class below. Note: you need to add an import statement for NullWritable.
public static class ReproduceReduce extends MapReduceBase implements Reducer<IntWritable, Text, Text, NullWritable>{
@Override
public void reduce(IntWritable arg0, Iterator<Text> arg1,
OutputCollector<Text, NullWritable> arg2, Reporter arg3)
throws IOException {
String word = arg1.next().toString();
Text word1 = new Text();
word1.set(word);
arg2.collect(word1, NullWritable.get());
}
}
and change the driver class or main method
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(NullWritable.class);
In the Mapper, the key temp is incremented for each word, so each word is processed as a separate key-value pair.
The steps below should solve the problem:
1) In the Mapper, just remove the temp++, so that all the reversed words have the key 0 (temp = 0).
2) The Reducer then receives the key 0 and the list of reversed strings.
In the reducer, set the key to NullWritable and write the output.
What you can try is to take one constant key (or simply a NullWritable) and pass it as the key, with your complete line as the value (you can reverse it in the mapper class, or in the reducer class as well). Your reducer will then receive a constant key (or a placeholder, if you used NullWritable as the key) and the complete line. Now you can simply reverse the line and write it to the output file. By not using temp as a key, you avoid writing unwanted numbers into your output file. A rough sketch of this idea follows.
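The sketch uses the same old mapred API as the question; the class names mirror the question's, but this is only an illustration of the constant-key approach, not tested code:

public static class ReproduceMap extends MapReduceBase implements Mapper<LongWritable, Text, NullWritable, Text> {
    private final Text reversedLine = new Text();

    @Override
    public void map(LongWritable offset, Text value,
            OutputCollector<NullWritable, Text> output, Reporter reporter) throws IOException {
        // reverse every word of the line, keeping the words in their original order
        StringBuilder sb = new StringBuilder();
        for (String token : value.toString().split("\\s+")) {
            sb.append(new StringBuffer(token).reverse()).append(' ');
        }
        reversedLine.set(sb.toString().trim());
        output.collect(NullWritable.get(), reversedLine); // constant placeholder key
    }
}

public static class ReproduceReduce extends MapReduceBase implements Reducer<NullWritable, Text, Text, NullWritable> {
    @Override
    public void reduce(NullWritable key, Iterator<Text> lines,
            OutputCollector<Text, NullWritable> output, Reporter reporter) throws IOException {
        while (lines.hasNext()) {
            // only the line is written; no key and no counter appears in the output
            output.collect(new Text(lines.next().toString()), NullWritable.get());
        }
    }
}

The driver then needs conf.setMapOutputKeyClass(NullWritable.class), conf.setMapOutputValueClass(Text.class), conf.setOutputKeyClass(Text.class), conf.setOutputValueClass(NullWritable.class), and an import for org.apache.hadoop.io.NullWritable. Keep in mind that a single constant key sends all lines to one reducer.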

Hadoop Map Reduce program for hashing

I have written a MapReduce program in Hadoop that hashes all the records of a file, appends the hashed value as an additional attribute to each record, and then writes the output to the Hadoop file system.
This is the code I have written:
public class HashByMapReduce
{
public static class LineMapper extends Mapper<Text, Text, Text, Text>
{
private Text word = new Text();
public void map(Text key, Text value, Context context) throws IOException, InterruptedException
{
key.set("single")
String line = value.toString();
word.set(line);
context.write(key, word);
}
}
public static class LineReducer
extends Reducer<Text,Text,Text,Text>
{
private Text result = new Text();
public void reduce(Text key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException
{
String translations = "";
for (Text val : values)
{
translations = val.toString()+","+String.valueOf(hash64(val.toString())); //Point of Error
result.set(translations);
context.write(key, result);
}
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "Hashing");
job.setJarByClass(HashByMapReduce.class);
job.setMapperClass(LineMapper.class);
job.setReducerClass(LineReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I wrote this code with the logic that each line is read by the map method, which assigns every value to a single key; that key then goes to the same reduce method, which in turn passes each value to the hash64() function.
But I see it is passing a null (empty) value to the hash function. I am unable to figure out why. Thanks in advance.
The cause of the problem is most probably the use of KeyValueTextInputFormat. From the Yahoo tutorial:
InputFormat          | Description                                | Key                                       | Value
TextInputFormat      | Default format; reads lines of text files  | The byte offset of the line               | The line contents
KeyValueInputFormat  | Parses lines into key, val pairs           | Everything up to the first tab character  | The remainder of the line
It breaks your input lines on the tab character, and I suppose there is no tab in your lines. As a result, the key in the LineMapper is the whole line while nothing is passed as the value (not sure whether it is null or empty).
From your code I think you should rather use TextInputFormat as your input format, which produces the line offset as the key and the complete line as the value. This should solve your problem.
EDIT: I ran your code with the following changes, and it seems to work fine:
Changed the input format to TextInputFormat and changed the declaration of the Mapper accordingly.
Added the proper setMapOutputKeyClass & setMapOutputValueClass calls to the job. These are not mandatory but often cause problems when running.
Removed your key.set("single") and added a private outKey to the Mapper.
Since you provided no details of the hash64 method, I used String.toUpperCase for testing.
If the issue persists, then I'm sure your hash method doesn't handle nulls well.
Full code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class HashByMapReduce {
public static class LineMapper extends
Mapper<LongWritable, Text, Text, Text> {
private Text word = new Text();
private Text outKey = new Text("single");
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
word.set(line);
context.write(outKey, word);
}
}
public static class LineReducer extends Reducer<Text, Text, Text, Text> {
private Text result = new Text();
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String translations = "";
for (Text val : values) {
translations = val.toString() + ","
+ val.toString().toUpperCase(); // Point of Error
result.set(translations);
context.write(key, result);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "Hashing");
job.setJarByClass(HashByMapReduce.class);
job.setMapperClass(LineMapper.class);
job.setReducerClass(LineReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

Hadoop Custom InputFileFormat producing empty results

So I'm trying to import baseball event files from Retrosheet.org into Hadoop. Each game entry has the following format, with each file containing a season's worth of game entries (this is an incomplete record, shown just as an example; some items were removed for redundancy and to save space):
id,BOS192704230
version,1
info,inputprogvers,"version 7RS(19) of 07/07/92"
info,visteam,WS1
start,judgj101,"Joe Judge",0,5,3
start,myerb103,"Buddy Myer",0,6,6
start,blueo102,"Ossie Bluege",0,7,5
play,4,0,myerb103,??,,S/BG
play,4,0,blueo102,??,,CS2(2E4)
play,4,0,blueo102,??,,7/FL
play,4,0,ruelm101,??,,63
play,4,0,crowg102,??,,NP
sub,wests101,"Sam West",0,9,11
play,4,0,wests101,??,,K/C
play,4,1,wannp101,??,,NP
sub,liseh101,"Hod Lisenbee",0,9,1
play,4,1,wannp101,??,,W
play,4,1,rothj101,??,,CS2(26)
play,4,1,rothj101,??,,7/F
play,4,1,tobij101,??,,5/P
play,5,0,rices101,??,,6/P
data,er,crowg102,4
data,er,liseh101,0
data,er,braxg101,1
data,er,marbf101,0
data,er,harrs101,3
I'm making my first pass at importing this into Hadoop, and am having trouble implementing a proper custom InputFormat to read such a record. I've been attempting to split the files on the first line of each game record (indicated by "id," followed by a team, season, date, and game code) using the regex "id,[A-Z]{3}[0-9]{9}". When I write the output (I'm using SequenceFile output, but both the SequenceFile and regular Text outputs return the same result), I get an empty result file. Any pointer in the right direction would be extremely helpful. The code I've got so far is based on a template found here: http://dronamk.blogspot.com/2013/03/regex-custom-input-format-for-hadoop.html. I'm using essentially the same code posted there, just compiling the above-mentioned regex instead of the included expression.
The class in question:
package project.baseball;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class RegexInputFormat extends
InputFormat<LongWritable, TextArrayWritable> {
public Pattern pattern = Pattern.compile("id,[A-Z]{3}[0-9]{9}");
private TextInputFormat textIF = new TextInputFormat();
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
InterruptedException {
return textIF.getSplits(context);
}
@Override
public RecordReader<LongWritable, TextArrayWritable> createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
RegexRecordReader reader = new RegexRecordReader();
if (pattern == null) {
throw new IllegalStateException(
"No pattern specified - unable to create record reader");
}
reader.setPattern(pattern);
return reader;
}
public static class RegexRecordReader extends
RecordReader<LongWritable, TextArrayWritable> {
private LineRecordReader lineRecordReader = new LineRecordReader();
private Pattern pattern;
TextArrayWritable value = new TextArrayWritable();
public void setPattern(Pattern pattern2) {
pattern = pattern2;
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
lineRecordReader.initialize(split, context);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
while (lineRecordReader.nextKeyValue()) {
Matcher matcher;
matcher = pattern.matcher(lineRecordReader.getCurrentValue()
.toString());
if (matcher.find()) {
int fieldCount;
Text[] fields;
fieldCount = matcher.groupCount();
fields = new Text[fieldCount];
for (int i = 0; i < fieldCount; i++) {
fields[i] = new Text(matcher.group(i + 1));
}
value.setFields(fields);
return true;
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return lineRecordReader.getCurrentKey();
}
@Override
public TextArrayWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return lineRecordReader.getProgress();
}
@Override
public void close() throws IOException {
lineRecordReader.close();
}
}
}
Your regex may be missing the context, i.e. what surrounds the part of the line you want to split on.
Try this instead:
(.*)(id,([A-Z]{3}[0-9]{9}))(.*)
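For what it's worth, one difference worth checking: the original pattern has no capturing groups, so matcher.groupCount() is 0 and the fields array in nextKeyValue() stays empty, which by itself yields empty records; the grouped pattern above does capture the game id. A quick standalone check (plain Java, outside MapReduce):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexCheck {
    public static void main(String[] args) {
        String line = "id,BOS192704230";

        Matcher original = Pattern.compile("id,[A-Z]{3}[0-9]{9}").matcher(line);
        System.out.println(original.find());       // true
        System.out.println(original.groupCount()); // 0 -> no fields are captured

        Matcher grouped = Pattern.compile("(.*)(id,([A-Z]{3}[0-9]{9}))(.*)").matcher(line);
        if (grouped.find()) {
            System.out.println(grouped.groupCount()); // 4
            System.out.println(grouped.group(2));     // id,BOS192704230
            System.out.println(grouped.group(3));     // BOS192704230
        }
    }
}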
