Hadoop - Reducer class in Java

I am developing a Hadoop project in Java. I want to find the customers with the maximum consumption on a certain day. I have managed to find the customers on the date I want, but I am facing a problem in my Reducer class. Here is the code:
Mapper Class
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class alicanteMapperC extends
Mapper<LongWritable, Text, Text, IntWritable> {
String Customer = new String();
SimpleDateFormat ft = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date t = new Date();
IntWritable Consumption = new IntWritable();
int counter = 0;
//new vars
int max=0;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Date d2 = null;
try {
d2 = ft.parse("2013-07-01 01:00:00");
} catch (ParseException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
if (counter > 0) {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line, ",");
while (itr.hasMoreTokens()) {
Customer = itr.nextToken();
try {
t = ft.parse(itr.nextToken());
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Consumption.set(Integer.parseInt(itr.nextToken()));
}
if (t.compareTo(d2) == 0) {
context.write(new Text(Customer), Consumption);
}
}
counter++;
}
}
Reducer class
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class alicanteReducerC extends
Reducer<Text, IntWritable, Text, IntWritable> {
IntWritable maximum = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int max = 0;
for (IntWritable val : values) {
if (val.get() > max) {
max = val.get();
}
}
for (IntWritable val : values) {
if (val.get() == max) {
context.write(key, val);
}
}
}
}
Do you have any idea why the reducer won't write to the output file? In other words, why doesn't the second for loop work?
EDIT
In my mapper class I find the customers on a specific date, and thus their consumption, and I pass these values to the reducer class.
In the reducer class I want to find the max consumption and the customer associated with that consumption.
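For context, the usual explanation for this symptom is that the Iterable Hadoop passes to reduce() is backed by a one-shot stream: after the first for loop the iterator is exhausted, so the second loop never executes. A minimal sketch (class name illustrative, not from the original post) that computes each customer's maximum in a single pass and writes it directly:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class AlicanteMaxReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable maximum = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        // Single pass: the streamed values can only be traversed once.
        for (IntWritable val : values) {
            if (val.get() > max) {
                max = val.get();
            }
        }
        maximum.set(max);
        // One record per customer: that customer's maximum consumption.
        context.write(key, maximum);
    }
}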

Related

Duplicate "values" for some key in map-reduce java program

I am new to MapReduce and Hadoop (Hadoop 3.2.3 and Java 8).
I am trying to separate some lines based on a symbol in each line.
Example: "q1,a,q0," should return ('a', "q1,a,q0,") as (key, value).
My dataset contains ten (10) lines, five (5) for key 'a' and five for key 'b'.
I expect to get 5 lines for each key, but I always get five for 'a' and 10 for 'b'.
Data
A,q0,a,q1;A,q0,b,q0;A,q1,a,q1;A,q1,b,q2;A,q2,a,q1;A,q2,b,q0;B,s0,a,s0;B,s0,b,s1;B,s1,a,s1;B,s1,b,s0
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<LongWritable, Text, ByteWritable ,Text>{
private ByteWritable key1 = new ByteWritable();
//private int n ;
private int count =0 ;
private Text wordObject = new Text();
@Override
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String ftext = value.toString();
for (String line: ftext.split(";")) {
wordObject = new Text();
if (line.split(",")[2].equals("b")) {
key1.set((byte) 'b');
wordObject.set(line) ;
context.write(key1,wordObject);
continue ;
}
key1.set((byte) 'a');
wordObject.set(line) ;
context.write(key1,wordObject);
}
}
}
Reducer class:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
public class MyReducer extends Reducer<ByteWritable, Text, ByteWritable ,Text>{
private Integer count=0 ;
@Override
public void reduce(ByteWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for(Text val : values ) {
count++ ;
}
Text symb = new Text(count.toString()) ;
context.write(key , symb);
}
}
Driver class:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out.printf("Usage: %s [generic options] <inputdir> <outputdir>\n", getClass().getSimpleName());
return -1;
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf());
job.setJarByClass(MyDriver.class);
job.setJobName("separation ");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(ByteWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(ByteWritable.class);
job.setOutputValueClass(Text.class);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new MyDriver(), args);
System.exit(exitCode);
}
}
The problem was solved by moving the variable "count" inside the reduce() function.
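A minimal sketch of that fix (same reducer, with the counter declared locally so it resets for every key instead of accumulating across reduce() calls):
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<ByteWritable, Text, ByteWritable, Text> {
    @Override
    public void reduce(ByteWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int count = 0; // local to reduce(): starts at zero for each key
        for (Text val : values) {
            count++;
        }
        context.write(key, new Text(Integer.toString(count)));
    }
}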
Does your input contain more than one line, with 5 more b's in it? I cannot reproduce the problem with just the one line shown, but your code can be cleaned up.
For the following code, I get output as
a 5
b 5
static class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, ByteWritable, Text> {
final ByteWritable keyOut = new ByteWritable();
final Text valueOut = new Text();
@Override
protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, ByteWritable, Text>.Context context) throws IOException, InterruptedException {
String line = value.toString();
if (line.isEmpty()) {
return;
}
StringTokenizer tokenizer = new StringTokenizer(line, ";");
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
String[] parts = token.split(",");
String keyStr = parts[2];
if (keyStr.matches("[ab]")) {
keyOut.set((byte) keyStr.charAt(0));
valueOut.set(token);
context.write(keyOut, valueOut);
}
}
}
}
static class Reducer extends org.apache.hadoop.mapreduce.Reducer<ByteWritable, Text, Text, LongWritable> {
static final Text keyOut = new Text();
static final LongWritable valueOut = new LongWritable();
@Override
protected void reduce(ByteWritable key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<ByteWritable, Text, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
keyOut.set(new String(new byte[]{key.get()}, StandardCharsets.UTF_8));
valueOut.set(StreamSupport.stream(values.spliterator(), true)
.mapToLong(v -> 1).sum());
context.write(keyOut, valueOut);
}
}

java.lang.Object cannot be converted to own class

I am new to java/hadoop and am currently trying to recreate the results of a code I found here:
https://sunilmistri.wordpress.com/2015/02/13/mapreduce-example-for-minimum-and-maximum-value-by-group-key/
In contrast to the example, I only have 3 "columns", each with an integer number, delimited by a tab (/t), e.g. 100 115 3
The first two numbers are nodes, the third the weight between the nodes. Now I try to find the min and max weights for the first node.
The only things I have changed in the code are the delimiter (/t) and that I put both classes in one file (which, from what I read, should be OK).
Now I get an error "java.lang.Object cannot be converted to MinMaxDuration" in the line indicated below.
I found similar questions like "incompatible types: java.lang.Object cannot be converted to T",
but they didn't really help me. How can I fix this?
Thanks in advance.
import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Q1{
public static class DurationMapper
extends Mapper<Object, Text, Text, MinMaxDuration>{
private Text month= new Text();
private Integer minduration;
private Integer maxduration;
private MinMaxDuration outPut= new MinMaxDuration();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
String[] campaignFields= value.toString().split("/t");
//10001,'telephone','may','mon',100,1,999,0,'nonexistent','no',0
month.set(campaignFields[0]);
minduration=Integer.parseInt(campaignFields[2]);
maxduration=Integer.parseInt(campaignFields[2]);
if (month == null || minduration == null || maxduration== null) {
return;
}
try {
outPut.setMinDuration(minduration);
outPut.setMaxDuration(maxduration);
context.write(month,outPut);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
public static class DurationReducer
extends Reducer<Text,MinMaxDuration,Text,MinMaxDuration> {
private MinMaxDuration resultRow = new MinMaxDuration();
public void reduce(Text key, Iterable values,
Context context
) throws IOException, InterruptedException {
Integer minduration = 0;
Integer maxduration = 0;
resultRow.setMinDuration(null);
resultRow.setMaxDuration(null);
for (MinMaxDuration val : values) { //ERROR HERE
minduration = val.getMinDuration();
maxduration = val.getMaxDuration();
// get min score
if (resultRow.getMinDuration()==null || minduration.compareTo(resultRow.getMinDuration())<0) {
resultRow.setMinDuration(minduration);
}
// get min bonus
if (resultRow.getMaxDuration()==null || maxduration.compareTo(resultRow.getMaxDuration())>0) {
resultRow.setMaxDuration(maxduration);
}
} // end of for loop
context.write(key, resultRow);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Campaign Duration Count");
job.setJarByClass(CampaignMinMax.class);
job.setMapperClass(DurationMapper.class);
job.setCombinerClass(DurationReducer.class);
job.setReducerClass(DurationReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(MinMaxDuration.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
class MinMaxDuration implements Writable {
// declare variables
Integer minDuration;
Integer maxDuration;
// constructor
public MinMaxDuration() {
minDuration=0;
maxDuration=0;
}
//set method
void setMinDuration(Integer duration){
this.minDuration=duration;
}
void setMaxDuration(Integer duration){
this.maxDuration=duration;
}
//get method
Integer getMinDuration() {
return minDuration;
}
Integer getMaxDuration(){
return maxDuration;
}
// write method
public void write(DataOutput out) throws IOException {
// what order we want to write !
out.writeInt(minDuration);
out.writeInt(maxDuration);
}
// readFields Method
public void readFields(DataInput in) throws IOException {
minDuration=new Integer(in.readInt());
maxDuration=new Integer(in.readInt());
}
public String toString() {
return minDuration + "\t" + maxDuration;
}
}
You are missing the type parameter in the method signature, so the values are treated as plain Object rather than your concrete type. Change the method signature from
public void reduce(Text key, Iterable values,Context context) throws IOException, InterruptedException
to
public void reduce(Text key, Iterable<MinMaxDuration> values,
Context context
) throws IOException, InterruptedException
This way you specify the proper type.
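With the type parameter in place, val in the enhanced for loop is a MinMaxDuration rather than an Object, so the body compiles unchanged. A sketch of the corrected method (same logic as the original, only the signature differs; @Override is optional but lets the compiler confirm the method really overrides Reducer.reduce):
@Override
public void reduce(Text key, Iterable<MinMaxDuration> values, Context context)
        throws IOException, InterruptedException {
    resultRow.setMinDuration(null);
    resultRow.setMaxDuration(null);
    for (MinMaxDuration val : values) { // val is now MinMaxDuration, not Object
        Integer minduration = val.getMinDuration();
        Integer maxduration = val.getMaxDuration();
        if (resultRow.getMinDuration() == null
                || minduration.compareTo(resultRow.getMinDuration()) < 0) {
            resultRow.setMinDuration(minduration);
        }
        if (resultRow.getMaxDuration() == null
                || maxduration.compareTo(resultRow.getMaxDuration()) > 0) {
            resultRow.setMaxDuration(maxduration);
        }
    }
    context.write(key, resultRow);
}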

MapReduce Hadoop 2.4.1 Reducer not running

For some reason my Reducer doesn't seem to be running.
My Driver is
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class PageRank {
public static void main(String[] args) throws Exception {
PageRank pageRanking = new PageRank();
//In and Out dirs in HDFS
pageRanking.runXmlParsing(args[0], args[1]);
System.out.println("finished");
}
public void runXmlParsing(String inputPath, String outputPath) throws IOException {
Configuration conf = new Configuration();
conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
Job job1 = Job.getInstance(conf);
job1.setJarByClass(PageRank.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
// Our class to parse links from content.
job1.setMapperClass(WikiPageXMLMapper.class);
job1.setReducerClass(WikiLinksReducer.class);
job1.setInputFormatClass(XmlInputFormat.class);
job1.setOutputFormatClass(TextOutputFormat.class);
// Remove output if already exists
FileSystem.getLocal(conf).delete(new Path(outputPath), true);
FileInputFormat.setInputPaths(job1, new Path(inputPath));
FileOutputFormat.setOutputPath(job1, new Path(outputPath));
System.out.println("BEFORE RUN");
try {
job1.waitForCompletion(true);
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void deleteDir(File dir) {
File[] files = dir.listFiles();
for (File myFile: files) {
if (myFile.isDirectory()) {
deleteDir(myFile);
}
myFile.delete();
}
}
}
My Mapper is
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
public class WikiPageXMLMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
public void map(LongWritable key, Text value, Context output) throws IOException {
String[] titleAndText = parseTitleAndText(value.toString());
String pageString = titleAndText[0];
Text page = new Text(pageString.replace(' ', '_'));
String[] parts = titleAndText[1].split("\\[\\[");
String pages = "!##$ ";
for (int i = 1; i < parts.length; i++) {
int lastIndexBrackets = parts[i].lastIndexOf("]]");
// This checks and skips the first part of the outer link
if (lastIndexBrackets == -1)
continue;
String insideLinkPlusExtra = parts[i].substring(0, lastIndexBrackets);
int multipleClosingBrackets = insideLinkPlusExtra.indexOf("]]");
String otherPage = insideLinkPlusExtra;
if (multipleClosingBrackets != -1) {
otherPage = insideLinkPlusExtra.substring(0, multipleClosingBrackets);
}
otherPage = otherPage.split("\\|")[0];
otherPage = checkForDuplicates(otherPage, pages);
otherPage = (otherPage.indexOf(":") == -1) ? otherPage : "";
otherPage = (otherPage.indexOf("#") == -1) ? otherPage : "";
otherPage = checkForSubpageLinks(otherPage);
otherPage = checkForRedLink(otherPage);
if (otherPage == "")
continue;
Text oP = new Text(otherPage.replace(' ', '_'));
pages += oP + " ";
// taking each outlink and making it its own key (ingraph)
try {
output.write(new Text(oP), new Text(page));
} catch (InterruptedException e) {
e.printStackTrace();
}
}
// Designate this page as not a redlink
try {
output.write(new Text(page), new Text("!##$"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return;
}
}
And my Reducer is:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException {
System.out.println("REDUCER");
String links = "";
boolean isNotRedLink = false;
System.out.println("Starting reduce");
// Brett concern (and zach's): if n pages link to a redlink
// we will iterate n times and it could be wasteful
while(values.hasNext()){
String v = values.next().toString();
// Check first outlink is not #, if so, it is a redlink
if (v.equals("!##$")) {
isNotRedLink = true;
continue;
} else {
links += v;
continue;
}
}
// If the key is not a redlink, send it to the output
if (isNotRedLink) {
try {
output.write(key, new Text(links));
output.write(key, new Text("TESTING!"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(links);
} else {
System.out.println(output);
try {
output.write(key, new Text("BLEG"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(key + " IS A RED LINK");
return;
}
}
}
None of the System.out.println(...)s in my Reducer output to the console, and my output from the program (the file it leaves on my HDD) only has the results from the Mapper.
I feel silly. I tried Iterable instead of Iterator in the line public void reduce(Text key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException { and my issue was solved.
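For anyone else hitting this: the new-API Reducer only dispatches to reduce(KEYIN key, Iterable<VALUEIN> values, Context context). A method that takes an Iterator does not override it, so the default identity reduce() runs and the output is just the mapper's output. A sketch of the corrected reducer (the essential logic of the original, with Iterable and @Override so the compiler flags any signature mismatch):
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder links = new StringBuilder();
        boolean isNotRedLink = false;
        for (Text v : values) { // Iterable, not Iterator
            if (v.toString().equals("!##$")) { // marker written by the mapper
                isNotRedLink = true;
            } else {
                links.append(v.toString()).append(' ');
            }
        }
        if (isNotRedLink) {
            context.write(key, new Text(links.toString()));
        } else {
            context.write(key, new Text("BLEG")); // red link, as in the original
        }
    }
}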

MapReduce Job hangs

I am new to Hadoop's MapReduce. I have written a MapReduce job and I am trying to run it on my local machine, but the job hangs after map 100%.
Below is the code; I don't understand what I am missing.
I have a custom key class
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class AirlineMonthKey implements WritableComparable<AirlineMonthKey>{
Text airlineName;
Text month;
public AirlineMonthKey(){
super();
}
public AirlineMonthKey(Text airlineName, Text month) {
super();
this.airlineName = airlineName;
this.month = month;
}
public Text getAirlineName() {
return airlineName;
}
public void setAirlineName(Text airlineName) {
this.airlineName = airlineName;
}
public Text getMonth() {
return month;
}
public void setMonth(Text month) {
this.month = month;
}
@Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
this.airlineName.readFields(in);
this.month.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
this.airlineName.write(out);
this.month.write(out);
}
@Override
public int compareTo(AirlineMonthKey airlineMonthKey) {
// TODO Auto-generated method stub
int diff = getAirlineName().compareTo(airlineMonthKey.getAirlineName());
if(diff != 0){
return diff;
}
int m1 = Integer.parseInt(getMonth().toString());
int m2 = Integer.parseInt(airlineMonthKey.getMonth().toString());
if(m1>m2){
return -1;
}
else
return 1;
}
}
and the mapper and reducer classes that use the custom key are below.
package com.mapresuce.secondarysort;
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.opencsv.CSVReader;
public class FlightDelayByMonth {
public static class FlightDelayByMonthMapper extends
Mapper<Object, Text, AirlineMonthKey, Text> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str = value.toString();
// Reading Line one by one from the input CSV.
CSVReader reader = new CSVReader(new StringReader(str));
String[] split = reader.readNext();
reader.close();
String airlineName = split[6];
String month = split[2];
String year = split[0];
String delayMinutes = split[37];
String cancelled = split[41];
if (!(airlineName.equals("") || month.equals("") || delayMinutes
.equals(""))) {
if (year.equals("2008") && cancelled.equals("0.00")) {
AirlineMonthKey airlineMonthKey = new AirlineMonthKey(
new Text(airlineName), new Text(month));
Text delay = new Text(delayMinutes);
context.write(airlineMonthKey, delay);
System.out.println("1");
}
}
}
}
public static class FlightDelayByMonthReducer extends
Reducer<AirlineMonthKey, Text, Text, Text> {
public void reduce(AirlineMonthKey key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
for(Text val : values){
context.write(new Text(key.getAirlineName().toString()+" "+key.getMonth().toString()), val);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage:<in> <out>");
System.exit(2);
}
Job job = new Job(conf, "Average monthly flight dealy");
job.setJarByClass(FlightDelayByMonth.class);
job.setMapperClass(FlightDelayByMonthMapper.class);
job.setReducerClass(FlightDelayByMonthReducer.class);
job.setOutputKeyClass(AirlineMonthKey.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I have also created a job and configuration in main. I don't know what I am missing. I am running all of this in a local environment.
Try writing custom implementations of toString, equals and hashCode in your AirlineMonthKey class.
Read the link below:
http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/io/WritableComparable.html
It is important for key types to implement hashCode().
Hope this helps.
The issue was that I had to provide a default constructor in AirlineMonthKey (which I did) and initialize the instance variables inside it (which I didn't).
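A sketch of what that resolution looks like in AirlineMonthKey: Hadoop creates the key through the no-argument constructor and then calls readFields(), so the Text fields must already be instantiated there, otherwise readFields() throws a NullPointerException. Adding hashCode()/equals(), as the answer above suggests, is also worthwhile, since the default HashPartitioner routes keys to reducers via hashCode():
public AirlineMonthKey() {
    // Hadoop instantiates the key reflectively and then calls readFields(),
    // so the Text fields must exist before deserialization.
    this.airlineName = new Text();
    this.month = new Text();
}

@Override
public int hashCode() {
    return 31 * airlineName.hashCode() + month.hashCode();
}

@Override
public boolean equals(Object obj) {
    if (!(obj instanceof AirlineMonthKey)) {
        return false;
    }
    AirlineMonthKey other = (AirlineMonthKey) obj;
    return airlineName.equals(other.airlineName) && month.equals(other.month);
}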

Parsing of Stackoverflow's posts.xml on Hadoop

I am following this article by Anoop Madhusudanan on CodeProject to build a recommendation engine, not on a cluster but on my own system.
The problem is when I try to parse posts.xml, whose structure is as follows:
<row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="<blockquote>
<p>The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. </p>
</blockquote>
<p>I obtained this answer from <a href="http://www.informit.com/guides/content.aspx?g=cplusplus&amp;seqNum=272" rel="nofollow">High Resolution Time Measurement and Timers, Part I</a></p>" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />
Now I need to parse this file (size 1.4 GB) on Hadoop, for which I have written code in Java and created its jar.
The Java class is as follows:
import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
public class Recommend {
static class Map extends Mapper<Text, Text, Text, Text> {
Path path;
String fXmlFile;
DocumentBuilderFactory dbFactory;
DocumentBuilder dBuilder;
Document doc;
/**
* Given an output filename, write a bunch of random records to it.
*/
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
try{
fXmlFile=value.toString();
dbFactory = DocumentBuilderFactory.newInstance();
dBuilder= dbFactory.newDocumentBuilder();
doc= dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList nList = doc.getElementsByTagName("row");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
Element eElement = (Element) nNode;
Text keyWords =new Text(eElement.getAttribute("OwnerUserId"));
Text valueWords = new Text(eElement.getAttribute("ParentId"));
String val=keyWords.toString()+" "+valueWords.toString();
// Write the sentence
if(keyWords != null && valueWords != null){
output.collect(keyWords, new Text(val));
}
}
}catch (Exception e) {
e.printStackTrace();
}
}
}
/**
*
* @throws IOException
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (args.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}*/
// FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Recommend");
job.setJarByClass(Recommend.class);
// the keys are words (strings)
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
// the values are counts (ints)
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
//conf.setReducerClass(Reduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
Path outPath = new Path(args[1]);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}
}
}
I expect the output to be a file in Hadoop containing pairs of OwnerUserId ParentId,
but instead I get output as:
1599788 <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="<p>The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.</p>
<p>Simply writing your improved code and then giving it to them may result in your code being rejected.</p>" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />
I don't know where the 1599788 appearing as a key from the mapper comes from.
I don't know much about writing mapper classes for Hadoop; I need help modifying my code to get the desired output.
Thanks in advance.
After a lot of research and experiments, I finally learnt how to write a map function for parsing XML files with syntax like the one I provided. I changed my approach and this is my new mapper code... It's working for my use case.
Hope it helps someone and saves them some time :)
import java.io.IOException;
import java.util.StringTokenizer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.xml.sax.SAXException;
public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
NullWritable obj;
@Override
public void map(LongWritable key, Text value, Context context) throws InterruptedException {
StringTokenizer tok= new StringTokenizer(value.toString());
String pa=null,ow=null,pi=null,v;
while (tok.hasMoreTokens()) {
String[] arr;
String val = (String) tok.nextToken();
if(val.contains("PostTypeId")){
arr= val.split("[\"]");
pi=arr[arr.length-1];
if(pi.equals("2")){
continue;
}
else break;
}
if(val.contains("ParentId")){
arr= val.split("[\"]");
pa=arr[arr.length-1];
}
else if(val.contains("OwnerUserId") ){
arr= val.split("[\"]");
ow=arr[arr.length-1];
try {
if(pa!=null && ow != null){
v = String.format("%s,%s", ow, pa); // %s placeholders; {0},{1} is MessageFormat syntax
context.write(obj,new Text(v));
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
}
Here is the mapper I've written to parse the SO posts XML and create tab-separated file(s) on Hadoop to be used by other MapReduce jobs, or by Hive or Pig.
Mapper
package com.aravind.learning.hadoop.mapred.techtalks;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
public class StackoverflowDataWranglerMapper extends Mapper<LongWritable, Text, Text, Text>
{
static enum BadRecordCounters
{
NO_CREATION_DATE, UNKNOWN_USER_ID, UNPARSEABLE_RECORD, UNTAGGED_POSTS
}
private final Text outputKey = new Text();
private final Text outputValue = new Text();
private final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder;
private static final Joiner TAG_JOINER = Joiner.on(",").skipNulls();
// 2008-07-31T21:42:52.667
private static final DateFormat DATE_PARSER = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
private static final SimpleDateFormat DATE_BUILDER = new SimpleDateFormat("yyyy-MM-dd");
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
try
{
builder = factory.newDocumentBuilder();
}
catch (ParserConfigurationException e)
{
throw new IOException(e);
}
}
@Override
protected void map(LongWritable inputKey, Text inputValue, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException
{
try
{
String entry = inputValue.toString();
if (entry.contains("<row "))
{
Document doc = builder.parse(new InputSource(new StringReader(entry)));
Element rootElem = doc.getDocumentElement();
String id = rootElem.getAttribute("Id");
String postedBy = rootElem.getAttribute("OwnerUserId").trim();
String viewCount = rootElem.getAttribute("ViewCount");
String postTypeId = rootElem.getAttribute("PostTypeId");
String score = rootElem.getAttribute("Score");
String title = rootElem.getAttribute("Title");
String tags = rootElem.getAttribute("Tags");
String answerCount = rootElem.getAttribute("AnswerCount");
String commentCount = rootElem.getAttribute("CommentCount");
String favoriteCount = rootElem.getAttribute("FavoriteCount");
String creationDate = rootElem.getAttribute("CreationDate");
Date parsedDate = null;
if (creationDate != null && creationDate.trim().length() > 0)
{
try
{
parsedDate = DATE_PARSER.parse(creationDate);
}
catch (ParseException e)
{
context.getCounter("Bad Record Counters", "Posts missing CreationDate").increment(1);
}
}
if (postedBy.length() == 0 || postedBy.trim().equals("-1"))
{
context.getCounter("Bad Record Counters", "Posts with either empty UserId or UserId contains '-1'")
.increment(1);
try
{
parsedDate = DATE_BUILDER.parse("2100-00-01");
}
catch (ParseException e)
{
// ignore
}
}
tags = tags.trim();
String tagTokens[] = null;
if (tags.length() > 1)
{
tagTokens = tags.substring(1, tags.length() - 1).split("><");
}
else
{
context.getCounter("Bad Record Counters", "Untagged Posts").increment(1);
}
outputKey.clear();
outputKey.set(id);
StringBuilder sb = new StringBuilder(postedBy).append("\t").append(parsedDate.getTime()).append("\t")
.append(postTypeId).append("\t").append(title).append("\t").append(viewCount).append("\t").append(score)
.append("\t");
if (tagTokens != null)
{
sb.append(TAG_JOINER.join(tagTokens)).append("\t");
}
else
{
sb.append("").append("\t");
}
sb.append(answerCount).append("\t").append(commentCount).append("\t").append(favoriteCount).toString();
outputValue.set(sb.toString());
context.write(outputKey, outputValue);
}
}
catch (SAXException e)
{
context.getCounter("Bad Record Counters", "Unparsable records").increment(1);
}
finally
{
builder.reset();
}
}
}
Driver
public class StackoverflowDataWranglerDriver extends Configured implements Tool
{
@Override
public int run(String[] args) throws Exception
{
if (args.length != 2)
{
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf());
job.setJobName("Tech Talks - Stackoverflow Forum Posts - Data Wrangler");
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setJarByClass(StackoverflowDataWranglerMapper.class);// required for mr1
job.setMapperClass(StackoverflowDataWranglerMapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String args[]) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new StackoverflowDataWranglerDriver(), args);
System.exit(exitCode);
}
}
Job submit command
hadoop jar ./hadoop-examples-0.0.1-SNAPSHOT.jar com.aravind.learning.hadoop.mapred.techtalks.StackoverflowDataWranglerDriver data/stackoverflow-posts.xml data/so-posts-tsv
