HBase mapreduce reduce part does not run - java

I am running HBase 1.2.4 in standalone mode on an Ubuntu 17.04 machine.
I am trying to write a MapReduce job in Java that extracts metadata (i.e. column family identifiers followed by column identifiers) and sums up the number of records sharing the same schema.
I found a lot of examples and copied some code from
http://www.informit.com/articles/article.aspx?p=2262143&seqNum=2
which treats a similar issue.
The article
http://sujee.net/2011/04/10/hbase-map-reduce-example/
also seemed helpful.
My customized code compiles and runs, but my tests show that the reducer never runs, and in the end I get no results.
I attach the code, which contains comments indicating the critical places as well as some earlier trials.
I hope somebody can give me a hint about the necessary corrections.
Code
import java.io.*;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.fs.Path;
//
import java.util.ArrayList;
import java.util.List;
//
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
* determines the number of identically structured records in a table
*
**/
public class MetaSummary {
static class Mapper1 extends TableMapper<Text, IntWritable> {
// if possible, later pass the table name as a parameter (String tableName)
private int numRecords = 0;
private static final IntWritable one = new IntWritable(1);
// added by MU
private String l_row ="";
private String l_family;
private String l_qualifier;
private String l_out ="";
private byte[] l_bytearray;
private Text l_text;
private String l_mapout ="";
@Override
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
// for each record, build the structure columnFamily:column as a concatenated string;
// this string becomes the output key of the mapper.
// The associated value is always 1.
// The values for identical keys are to be summed later in the reduce phase.
// Task: assign to the userKey the string that describes the structure
//
// the user key is composed of the column family identifiers along with the respective column names
l_out="";
for(KeyValue kv : values.raw()){
l_family = new String(kv.getFamily());
l_qualifier = new String(kv.getQualifier());
l_out = l_out+l_family+":";
if (l_qualifier == null){ l_qualifier = "<null>"; }
if (l_qualifier.equals("")){ l_qualifier = "<leer>"; }
l_out = l_out +l_qualifier + " ";
}
l_out = l_out.trim();
l_mapout = l_mapout+ l_out + " ";
l_text = new Text(l_mapout);
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("mapout.txt"))) {
out.print(l_mapout); }
try {
//context.write(l_userkey, one); // former trials
// context.write(l_out, one);
context.write(l_text, one);
}
catch (InterruptedException e) {
throw new IOException(e);
}
}
}
static class Reducer1 extends TableReducer<ImmutableBytesWritable, IntWritable, ImmutableBytesWritable>
//public static class Reducer1 extends TableReducer<Text, IntWritable, ImmutableBytesWritable>
{
//public void reduce(String key, Iterable<IntWritable> values, Context context)
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("red1Anfang.txt"))) {
out.print("in Reducer1.reduce before for ..."); }
for (IntWritable val : values) {
sum += val.get();
System.out.println(sum);
Put put = new Put(key.getBytes());
// Put put = new Put(Bytes.toBytes(key.toString())); // former trials
// addColumn(byte[] family, byte[] qualifier, byte[] value)
put.addColumn(Bytes.toBytes("details"), Bytes.toBytes("total"), Bytes.toBytes(sum));
context.write(new ImmutableBytesWritable(key.getBytes()), put);
}
}
}
// Reducer1 did not yield any results, so the next trial was to output to the file system,
// which should be done by Reducer2,
// which does not yield any results either
static class Reducer2 extends Reducer<Text, IntWritable, Text, IntWritable>
{
/*public Reducer2() {
}*/
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("red2Anfang.txt"))) {
out.print("in Reducer2.reduce Anfang"); }
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("redlaeuft.txt"))) {
out.print("reduce läuft"); }
String sumstr="";
int sum = 0;
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("redoutvorfor.txt"))) {
out.print("in Reducer2.reduce vor Schleife"); }
for (IntWritable val : values) {
sum += val.get();
// the following lines for test reasons only
sumstr = new Integer(sum).toString();
try (PrintStream out = new PrintStream(new FileOutputStream("redout.txt"))) {
out.print(key.getBytes() + " " + sumstr); }
// Write out the key and the sum --- which of the following should work?
// context.write( new ImmutableBytesWritable(key.getBytes()), new IntWritable( sum ) );
//context.write( key, new IntWritable( sum ) );
// Even the simplest output does not work
context.write (new Text("abc"), new IntWritable(1));
}
}
}
public static void main(String[] args) throws Exception {
// HBaseConfiguration conf = new HBaseConfiguration(); // trial 1
Configuration conf = HBaseConfiguration.create();
Path output=new Path("Output");
// Job job = new Job(conf, "HBase_MetaSummary"); // trial 1
Job job = Job.getInstance(conf, "HBase_MetaSummary");
job.setJarByClass(MetaSummary.class);
Scan scan = new Scan();
TableMapReduceUtil.initTableMapperJob("videodaten", scan, Mapper1.class, ImmutableBytesWritable.class,
IntWritable.class, job);
// job.setMapperClass(Mapper1.class); // does not change anything
job.setReducerClass(Reducer2.class);
// the following commented-out lines should have caused the reduce results to be written to an HBase table
// precondition: a table was created before :: create 'meta_summary', {NAME=>'details', VERSIONS=>1}
//TableMapReduceUtil.initTableReducerJob("meta_summary", Reducer1.class, job);
// instead I try to write into a text file, which should do as well
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass( TextOutputFormat.class );
job.setNumReduceTasks( 1 );
FileOutputFormat.setOutputPath(job, output);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
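For reference, the most suspicious spot is the driver: initTableMapperJob declares the map output key class as ImmutableBytesWritable, while Mapper1 actually writes Text keys, so the declared and emitted shuffle types never match. Below is a minimal sketch of a driver whose types line up with the mapper and with Reducer2 above; it is an illustration under that assumption, not a tested drop-in.
// Sketch only: the output key/value classes passed to initTableMapperJob must
// match what Mapper1 actually emits (Text keys, IntWritable values); Reducer2
// is already declared as Reducer<Text, IntWritable, Text, IntWritable>, which fits.
Configuration conf = HBaseConfiguration.create();
Job job = Job.getInstance(conf, "HBase_MetaSummary");
job.setJarByClass(MetaSummary.class);
Scan scan = new Scan();
scan.setCacheBlocks(false); // recommended for MapReduce scans
TableMapReduceUtil.initTableMapperJob("videodaten", scan,
        Mapper1.class, Text.class, IntWritable.class, job);
job.setReducerClass(Reducer2.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(1);
FileOutputFormat.setOutputPath(job, new Path("Output"));
System.exit(job.waitForCompletion(true) ? 0 : 1);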

Related

Duplicate "values" for some key in map-reduce java program

I am new to MapReduce and Hadoop (Hadoop 3.2.3 and Java 8).
I am trying to separate some lines based on a symbol in a line.
Example: "q1,a,q0," should return ('a',"q1,a,q0,") as (key, value).
My dataset contains ten (10) lines, five (5) for key 'a' and five for key 'b'.
I expect to get 5 lines for each key, but I always get five for 'a' and 10 for 'b'.
Data
A,q0,a,q1;A,q0,b,q0;A,q1,a,q1;A,q1,b,q2;A,q2,a,q1;A,q2,b,q0;B,s0,a,s0;B,s0,b,s1;B,s1,a,s1;B,s1,b,s0
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<LongWritable, Text, ByteWritable ,Text>{
private ByteWritable key1 = new ByteWritable();
//private int n ;
private int count =0 ;
private Text wordObject = new Text();
@Override
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String ftext = value.toString();
for (String line: ftext.split(";")) {
wordObject = new Text();
if (line.split(",")[2].equals("b")) {
key1.set((byte) 'b');
wordObject.set(line) ;
context.write(key1,wordObject);
continue ;
}
key1.set((byte) 'a');
wordObject.set(line) ;
context.write(key1,wordObject);
}
}
}
Reducer class:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
public class MyReducer extends Reducer<ByteWritable, Text, ByteWritable ,Text>{
private Integer count=0 ;
@Override
public void reduce(ByteWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for(Text val : values ) {
count++ ;
}
Text symb = new Text(count.toString()) ;
context.write(key , symb);
}
}
Driver class:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out.printf("Usage: %s [generic options] <inputdir> <outputdir>\n", getClass().getSimpleName());
return -1;
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf());
job.setJarByClass(MyDriver.class);
job.setJobName("separation ");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(ByteWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(ByteWritable.class);
job.setOutputValueClass(Text.class);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new MyDriver(), args);
System.exit(exitCode);
}
}
The problem was solved by declaring the variable "count" inside the reduce() method.
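For reference, a minimal sketch of that fix, using the same ByteWritable/Text types as in the code above (illustrative only):
public static class MyReducer extends Reducer<ByteWritable, Text, ByteWritable, Text> {
    @Override
    public void reduce(ByteWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int count = 0; // local to reduce(), so it resets for every key
        for (Text val : values) {
            count++;
        }
        context.write(key, new Text(Integer.toString(count)));
    }
}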
Does your input contain more than one line with 5 more b's? I cannot reproduce this for that one line, but your code can be cleaned up.
For the following code, I get output as
a 5
b 5
static class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, ByteWritable, Text> {
final ByteWritable keyOut = new ByteWritable();
final Text valueOut = new Text();
@Override
protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, ByteWritable, Text>.Context context) throws IOException, InterruptedException {
String line = value.toString();
if (line.isEmpty()) {
return;
}
StringTokenizer tokenizer = new StringTokenizer(line, ";");
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
String[] parts = token.split(",");
String keyStr = parts[2];
if (keyStr.matches("[ab]")) {
keyOut.set((byte) keyStr.charAt(0));
valueOut.set(token);
context.write(keyOut, valueOut);
}
}
}
}
static class Reducer extends org.apache.hadoop.mapreduce.Reducer<ByteWritable, Text, Text, LongWritable> {
static final Text keyOut = new Text();
static final LongWritable valueOut = new LongWritable();
@Override
protected void reduce(ByteWritable key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<ByteWritable, Text, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
keyOut.set(new String(new byte[]{key.get()}, StandardCharsets.UTF_8));
valueOut.set(StreamSupport.stream(values.spliterator(), true)
.mapToLong(v -> 1).sum());
context.write(keyOut, valueOut);
}
}

Pass variables to mapper and reducer in Hadoop (old api)

I am not an expert in Hadoop and I have the following problem. I have a job that has to run on a cluster with Hadoop version 0.20.2.
When I start the job I specify some parameters. Two of them I want to pass to the mapper and reducer classes because I need them there.
I tried different solutions and now my code looks like this:
package bigdata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeMap;
import org.apache.commons.math3.stat.regression.SimpleRegression;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.vividsolutions.jts.geom.Geometry;
import com.vividsolutions.jts.io.ParseException;
import com.vividsolutions.jts.io.WKTReader;
public class BoxCount extends Configured implements Tool{
private static String mbr;
private static double cs;
public static class Map extends Mapper<LongWritable, Text, IntWritable, Text> implements JobConfigurable
{
public void configure(JobConf job) {
mbr = job.get(mbr);
cs = job.getDouble("cellSide", 0.1);
}
protected void setup(Context context)
throws IOException, InterruptedException {
// method in which to read the MBR passed as a parameter
System.out.println("mbr: " + mbr + "\ncs: " + cs);
// ...
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// some code here
}
protected void cleanup(Context context) throws IOException, InterruptedException
{
// other code
}
}
public static class Reduce extends Reducer<IntWritable,Text,IntWritable,IntWritable>implements JobConfigurable
{
private static String mbr;
private static double cs;
public void configure(JobConf job) {
mbr = job.get(mbr);
cs = job.getDouble("cellSide", 0.1);
}
protected void setup(Context context) throws IOException, InterruptedException
{
System.out.println("mbr: " + mbr + " cs: " + cs);
}
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
//the reduce code
}
@SuppressWarnings("unused")
protected void cleanup(Context context)
throws IOException, InterruptedException {
// cleanup code
}
public BoxCount (String[] args) {
if (args.length != 4) {
// 0 1 2 3
System.out.println("Usage: OneGrid <mbr (Rectangle: (xmin,ymin)-(xmax,ymax))> <cell_Side> <input_path> <output_path>");
System.out.println("args.length = "+args.length);
for(int i = 0; i< args.length;i++)
System.out.println("args["+i+"]"+" = "+args[i]);
System.exit(0);
}
this.numReducers = 1;
//this.mbr = new String(args[0]);
// this.mbr = "Rectangle: (0.01,0.01)-(99.99,99.99)";
// for sierpinski_jts
this.mbr = "Rectangle: (0.0,0.0)-(100.01,86.6125)";
// for diagonal
//this.mbr = "Rectangle: (1.5104351688932738,1.0787616413335854)-(99999.3453727045,99999.98043392139)";
// for uniform
// this.mbr = "Rectangle: (0.3020720559407146,0.2163091760095974)-(99999.68881210628,99999.46079314972)";
this.cellSide = Double.parseDouble(args[1]);
this.inputPath = new Path(args[2]);
this.outputDir = new Path(args[3]);
// Recalculate the cellSide so as to obtain
// at least minNumGriglie (10) grids!
Grid g = new Grid(mbr, cellSide);
if ((this.cellSide*(Math.pow(2,minNumGriglie))) > g.width)
this.cellSide = g.width/(Math.pow(2,minNumGriglie));
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new BoxCount(args), args);
System.exit(res);
}
public int run(String[] args) throws Exception
{
// define new job instead of null using conf
Configuration conf = getConf();
@SuppressWarnings("deprecation")
Job job = new Job(conf, "BoxCount");
// conf.set("mapreduce.framework.name", "local");
// conf.set("mapreduce.jobtracker.address", "local");
// conf.set("fs.defaultFS","file:///");
// pass the mbr value to create the grid
conf.set("mbr", mbr);
// pass the cell side
conf.setDouble("cellSide", cellSide);
job.setJarByClass(BoxCount.class);
// set job input format
job.setInputFormatClass(TextInputFormat.class);
// set map class and the map output key and value classes
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(Map.class);
// set reduce class and the reduce output key and value classes
job.setReducerClass(Reduce.class);
// set job output format
job.setOutputFormatClass(TextOutputFormat.class);
// add the input file as job input (from HDFS) to the variable
// inputFile
TextInputFormat.setInputPaths(job, inputPath);
// set the output path for the job results (to HDFS) to the variable
// outputPath
TextOutputFormat.setOutputPath(job, outputDir);
// set the number of reducers using variable numberReducers
job.setNumReduceTasks(numReducers);
// set the jar class
job.setJarByClass(BoxCount.class);
return job.waitForCompletion(true) ? 0 : 1; // this will execute the job
}
}
But the job does not run. What is the correct solution?
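One likely cause, offered as a guess rather than a verified diagnosis: configure()/JobConfigurable belong to the old mapred API and are never called on an org.apache.hadoop.mapreduce Mapper or Reducer, and the Job constructor takes a copy of the Configuration, so values set on conf after the Job is created never reach the tasks. A minimal sketch of the usual new-API pattern follows (parameter names taken from the code above; plain string values are used to stay version-safe):
// In run(): set the parameters on the Configuration BEFORE creating the Job,
// because the Job constructor copies the Configuration.
Configuration conf = getConf();
conf.set("mbr", mbr);
conf.set("cellSide", Double.toString(cellSide));
Job job = new Job(conf, "BoxCount");
// ... remaining job setup as above ...

// In the Mapper (and analogously in the Reducer): read the values back in setup().
public static class Map extends Mapper<LongWritable, Text, IntWritable, Text> {
    private String mbr;
    private double cs;

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        mbr = conf.get("mbr");
        cs = Double.parseDouble(conf.get("cellSide", "0.1"));
    }
    // map() and cleanup() stay as before
}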

Output file contains Mapper Output instead of Reducer output

Hi, I am trying to find the average of a few numbers using the MapReduce technique in standalone mode. I have two input files. They contain the values file1: 25 25 25 25 25 and file2: 15 15 15 15 15.
My program runs fine, but the output file contains the output of the mapper instead of the reducer output.
Here is my code:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.Writable;
import java.io.*;
public class Average {
public static class SumCount implements Writable {
public int sum;
public int count;
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(sum);
out.writeInt(count);
}
@Override
public void readFields(DataInput in) throws IOException {
sum = in.readInt();
count =in.readInt();
}
}
public static class TokenizerMapper extends Mapper<Object, Text, Text, Object>{
private final static IntWritable valueofkey = new IntWritable();
private Text word = new Text();
SumCount sc=new SumCount();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
int sum=0;
int count=0;
int v;
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
v=Integer.parseInt(word.toString());
count=count+1;
sum=sum+v;
}
word.set("average");
sc.sum=sum;
sc.count=count;
context.write(word,sc);
}
}
public static class IntSumReducer extends Reducer<Text,Object,Text,IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<SumCount> values,Context context) throws IOException, InterruptedException {
int sum = 0;
int count=0;
int wholesum=0;
int wholecount=0;
for (SumCount val : values) {
wholesum=wholesum+val.sum;
wholecount=wholecount+val.count;
}
int res=wholesum/wholecount;
result.set(res);
context.write(key, result );
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "");
job.setJarByClass(Average.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(SumCount.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
After I run the program, my output file looks like this:
average Average$SumCount@434ba039
average Average$SumCount@434ba039
You can't use your Reducer class IntSumReducer as a combiner. A combiner must receive and emit the same key/value types.
So I would remove job.setCombinerClass(IntSumReducer.class);.
Remember the output from the combiner is the input to the reducer, so writing out Text and IntWritable isn't going to work.
If your output files looked like part-m-xxxxx then the above issue could mean it only ran the map phase and stopped. Your counters would confirm this.
You also have Reducer<Text,Object,Text,IntWritable> which should be Reducer<Text,SumCount,Text,IntWritable>.
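A minimal sketch of the signature fix mentioned in the last sentence, shown only for illustration; the combiner registration is simply dropped:
// The reducer's second type parameter is the map output value type (SumCount).
public static class IntSumReducer extends Reducer<Text, SumCount, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<SumCount> values, Context context)
            throws IOException, InterruptedException {
        int wholeSum = 0;
        int wholeCount = 0;
        for (SumCount val : values) {
            wholeSum += val.sum;
            wholeCount += val.count;
        }
        result.set(wholeSum / wholeCount);
        context.write(key, result);
    }
}
// ... and in main(): drop job.setCombinerClass(IntSumReducer.class); everything else stays.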

Hadoop jar command error for multiple mapper inputs and 1 reducer output (Join 2 values from 2 files)

Here is my sample program joining 2 datasets.
The program has 2 mappers and 1 reducer joining the values obtained from 2 different mappers having 2 different files as input.
I am getting an error in the hadoop jar command.
command:
hadoop jar /home/rahul/Downloads/testjars/datajoin.jar DataJoin
/user/rahul/cust.txt /user/rahul/delivery.txt /user/rahul/output
Error: Invalid number of arguments Datajoin
It is actually expecting only 1 input path and 1 output path whereas in my command I have 2 inputs for 2 different mappers and 1 output.
Can anyone help me out ?
Code:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DataJoin {
public static class TokenizerMapper1 extends Mapper {
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String itr[] = value.toString().split("::");
word.set(itr[0].trim());
context.write(word, new Text("CD~" + itr[1]));
}
}
public static class TokenizerMapper2 extends Mapper {
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String itr[] = value.toString().split("::");
word.set(itr[0].trim());
context.write(word, new Text("DD~" + itr[1]));
}
}
public static class IntSumReducer extends Reducer {
private Text result = new Text();
public void reduce(Text key, Iterable values, Context context)
throws IOException, InterruptedException {
String sum = "";
for (Text val : values) {
sum += val.toString();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: DataJoin ");
System.exit(2);
}
Job job = new Job(conf, "Data Join");
job.setJarByClass(DataJoin.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[0]),
TextInputFormat.class, TokenizerMapper1.class);
MultipleInputs.addInputPath(job, new Path(otherArgs[1]),
TextInputFormat.class, TokenizerMapper2.class);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
You have error in this portion
if (otherArgs.length != 2) {
System.err.println("Usage: DataJoin ");
System.exit(2);
}
Your argument array has length 3: 2 inputs and 1 output.
otherArgs.length counts the arguments themselves (three paths here), so the check must compare against 3, not 2.
Change to
if (otherArgs.length != 3) {
System.err.println("Usage: DataJoin ");
System.exit(0);
}
This solves your issue.
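To make the argument mapping explicit, here is a small sketch using the exact paths from the command above (the placeholder names in the usage string are only illustrative):
// otherArgs[0] = /user/rahul/cust.txt      -> input for TokenizerMapper1
// otherArgs[1] = /user/rahul/delivery.txt  -> input for TokenizerMapper2
// otherArgs[2] = /user/rahul/output        -> output directory
if (otherArgs.length != 3) {
    System.err.println("Usage: DataJoin <customer input> <delivery input> <output>");
    System.exit(2);
}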

Parsing of Stackoverflow's posts.xml on hadoop

I am following this article by Anoop Madhusudanan on CodeProject to build a recommendation engine, not on a cluster but on my own system.
The problem is when I try to parse posts.xml, whose structure is as follows:
<row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="<blockquote>
<p>The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. </p>
</blockquote>
<p>I obtained this answer from <a href="http://www.informit.com/guides/content.aspx?g=cplusplus&amp;seqNum=272" rel="nofollow">High Resolution Time Measurement and Timers, Part I</a></p>" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />
Now I need to parse this file (size 1.4 GB) on Hadoop, for which I have written code in Java and created its jar.
The Java class is as follows:
import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
public class Recommend {
static class Map extends Mapper<Text, Text, Text, Text> {
Path path;
String fXmlFile;
DocumentBuilderFactory dbFactory;
DocumentBuilder dBuilder;
Document doc;
/**
* Given an output filename, write a bunch of random records to it.
*/
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
try{
fXmlFile=value.toString();
dbFactory = DocumentBuilderFactory.newInstance();
dBuilder= dbFactory.newDocumentBuilder();
doc= dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList nList = doc.getElementsByTagName("row");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
Element eElement = (Element) nNode;
Text keyWords =new Text(eElement.getAttribute("OwnerUserId"));
Text valueWords = new Text(eElement.getAttribute("ParentId"));
String val=keyWords.toString()+" "+valueWords.toString();
// Write the sentence
if(keyWords != null && valueWords != null){
output.collect(keyWords, new Text(val));
}
}
}catch (Exception e) {
e.printStackTrace();
}
}
}
/**
*
* @throws IOException
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (args.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}*/
// FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Recommend");
job.setJarByClass(Recommend.class);
// the keys are words (strings)
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
// the values are counts (ints)
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
//conf.setReducerClass(Reduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
Path outPath = new Path(args[1]);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}
}
}
I expect the output to be a file in Hadoop containing the output as OwnerUserId ParentId,
but instead I get output like:
1599788 <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="<p>The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.</p>
<p>Simply writing your improved code and then giving it to them may result in your code being rejected.</p>" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />
I don't know where the 1599788 that appears as a key value from the mapper comes from.
I don't know much about writing mapper classes for Hadoop; I need help to modify my code to get the desired output.
Thanks in advance.
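A likely explanation for the 1599788 key, judging from the code as posted: the map method takes OutputCollector and Reporter, which is the old mapred signature, so it never overrides the new API's Mapper.map(). Hadoop then runs the default identity mapper, which emits each line's byte offset as the key and the whole line as the value, hence the large number. A minimal sketch of the signature the new API actually calls (the write in the comment is only illustrative):
static class Map extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // parse the <row .../> attributes from 'value' here, then emit, e.g.:
        // context.write(new Text(ownerUserId), new Text(ownerUserId + " " + parentId));
    }
}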
After a lot of research and experiments, I finally learnt how to write a mapper for parsing XML files that have a syntax like the one I provided. I changed my approach and this is my new mapper code... It is working for my use case.
I hope it helps someone and saves them some time :)
import java.io.IOException;
import java.util.StringTokenizer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.xml.sax.SAXException;
public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
NullWritable obj;
@Override
public void map(LongWritable key, Text value, Context context) throws InterruptedException {
StringTokenizer tok= new StringTokenizer(value.toString());
String pa=null,ow=null,pi=null,v;
while (tok.hasMoreTokens()) {
String[] arr;
String val = (String) tok.nextToken();
if(val.contains("PostTypeId")){
arr= val.split("[\"]");
pi=arr[arr.length-1];
if(pi.equals("2")){
continue;
}
else break;
}
if(val.contains("ParentId")){
arr= val.split("[\"]");
pa=arr[arr.length-1];
}
else if(val.contains("OwnerUserId") ){
arr= val.split("[\"]");
ow=arr[arr.length-1];
try {
if(pa!=null && ow != null){
v=String.format("{0},{1}", ow,pa);
context.write(obj,new Text(v));
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
}
Here is the mapper I've written to parse the SO posts XML and create tab-separated file(s) on Hadoop, to be used by other MapReduce jobs, Hive, or Pig.
Mapper
package com.aravind.learning.hadoop.mapred.techtalks;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
public class StackoverflowDataWranglerMapper extends Mapper<LongWritable, Text, Text, Text>
{
static enum BadRecordCounters
{
NO_CREATION_DATE, UNKNOWN_USER_ID, UNPARSEABLE_RECORD, UNTAGGED_POSTS
}
private final Text outputKey = new Text();
private final Text outputValue = new Text();
private final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder;
private static final Joiner TAG_JOINER = Joiner.on(",").skipNulls();
// 2008-07-31T21:42:52.667
private static final DateFormat DATE_PARSER = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
private static final SimpleDateFormat DATE_BUILDER = new SimpleDateFormat("yyyy-MM-dd");
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
try
{
builder = factory.newDocumentBuilder();
}
catch (ParserConfigurationException e)
{
throw new IOException(e);
}
}
@Override
protected void map(LongWritable inputKey, Text inputValue, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException
{
try
{
String entry = inputValue.toString();
if (entry.contains("<row "))
{
Document doc = builder.parse(new InputSource(new StringReader(entry)));
Element rootElem = doc.getDocumentElement();
String id = rootElem.getAttribute("Id");
String postedBy = rootElem.getAttribute("OwnerUserId").trim();
String viewCount = rootElem.getAttribute("ViewCount");
String postTypeId = rootElem.getAttribute("PostTypeId");
String score = rootElem.getAttribute("Score");
String title = rootElem.getAttribute("Title");
String tags = rootElem.getAttribute("Tags");
String answerCount = rootElem.getAttribute("AnswerCount");
String commentCount = rootElem.getAttribute("CommentCount");
String favoriteCount = rootElem.getAttribute("FavoriteCount");
String creationDate = rootElem.getAttribute("CreationDate");
Date parsedDate = null;
if (creationDate != null && creationDate.trim().length() > 0)
{
try
{
parsedDate = DATE_PARSER.parse(creationDate);
}
catch (ParseException e)
{
context.getCounter("Bad Record Counters", "Posts missing CreationDate").increment(1);
}
}
if (postedBy.length() == 0 || postedBy.trim().equals("-1"))
{
context.getCounter("Bad Record Counters", "Posts with either empty UserId or UserId contains '-1'")
.increment(1);
try
{
parsedDate = DATE_BUILDER.parse("2100-00-01");
}
catch (ParseException e)
{
// ignore
}
}
tags = tags.trim();
String tagTokens[] = null;
if (tags.length() > 1)
{
tagTokens = tags.substring(1, tags.length() - 1).split("><");
}
else
{
context.getCounter("Bad Record Counters", "Untagged Posts").increment(1);
}
outputKey.clear();
outputKey.set(id);
StringBuilder sb = new StringBuilder(postedBy).append("\t").append(parsedDate.getTime()).append("\t")
.append(postTypeId).append("\t").append(title).append("\t").append(viewCount).append("\t").append(score)
.append("\t");
if (tagTokens != null)
{
sb.append(TAG_JOINER.join(tagTokens)).append("\t");
}
else
{
sb.append("").append("\t");
}
sb.append(answerCount).append("\t").append(commentCount).append("\t").append(favoriteCount).toString();
outputValue.set(sb.toString());
context.write(outputKey, outputValue);
}
}
catch (SAXException e)
{
context.getCounter("Bad Record Counters", "Unparsable records").increment(1);
}
finally
{
builder.reset();
}
}
}
Driver
public class StackoverflowDataWranglerDriver extends Configured implements Tool
{
@Override
public int run(String[] args) throws Exception
{
if (args.length != 2)
{
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf());
job.setJobName("Tech Talks - Stackoverflow Forum Posts - Data Wrangler");
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setJarByClass(StackoverflowDataWranglerMapper.class);// required for mr1
job.setMapperClass(StackoverflowDataWranglerMapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String args[]) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new StackoverflowDataWranglerDriver(), args);
System.exit(exitCode);
}
}
Job submit command
hadoop jar ./hadoop-examples-0.0.1-SNAPSHOT.jar com.aravind.learning.hadoop.mapred.techtalks.StackoverflowDataWranglerDriver data/stackoverflow-posts.xml data/so-posts-tsv
