I am not an expert in Hadoop and I have the following problem: I have a job that has to run on a cluster with Hadoop version 0.20.2.
When I start the job I specify some parameters. Two of them I want to pass to the mapper and reducer classes because I need them there.
I have tried different solutions and now my code looks like this:
package bigdata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeMap;
import org.apache.commons.math3.stat.regression.SimpleRegression;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.vividsolutions.jts.geom.Geometry;
import com.vividsolutions.jts.io.ParseException;
import com.vividsolutions.jts.io.WKTReader;
public class BoxCount extends Configured implements Tool{
private static String mbr;
private static double cs;
public static class Map extends Mapper<LongWritable, Text, IntWritable, Text> implements JobConfigurable
{
public void configure(JobConf job) {
mbr = job.get("mbr");
cs = job.getDouble("cellSide", 0.1);
}
protected void setup(Context context)
throws IOException, InterruptedException {
// method in which the MBR passed as a parameter should be read
System.out.println("mbr: " + mbr + "\ncs: " + cs);
// ...
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// some code here
}
protected void cleanup(Context context) throws IOException, InterruptedException
{
// other code
}
}
public static class Reduce extends Reducer<IntWritable, Text, IntWritable, IntWritable> implements JobConfigurable
{
private static String mbr;
private static double cs;
public void configure(JobConf job) {
mbr = job.get("mbr");
cs = job.getDouble("cellSide", 0.1);
}
protected void setup(Context context) throws IOException, InterruptedException
{
System.out.println("mbr: " + mbr + " cs: " + cs);
}
public void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
//the reduce code
}
@SuppressWarnings("unused")
protected void cleanup(Context context)
throws IOException, InterruptedException {
// cleanup code
}
}
public BoxCount (String[] args) {
if (args.length != 4) {
// 0 1 2 3
System.out.println("Usage: OneGrid <mbr (Rectangle: (xmin,ymin)-(xmax,ymax))> <cell_Side> <input_path> <output_path>");
System.out.println("args.length = "+args.length);
for(int i = 0; i< args.length;i++)
System.out.println("args["+i+"]"+" = "+args[i]);
System.exit(0);
}
this.numReducers = 1;
//this.mbr = new String(args[0]);
// this.mbr = "Rectangle: (0.01,0.01)-(99.99,99.99)";
// for sierpinski_jts
this.mbr = "Rectangle: (0.0,0.0)-(100.01,86.6125)";
// for the diagonal dataset
//this.mbr = "Rectangle: (1.5104351688932738,1.0787616413335854)-(99999.3453727045,99999.98043392139)";
// for the uniform dataset
// this.mbr = "Rectangle: (0.3020720559407146,0.2163091760095974)-(99999.68881210628,99999.46079314972)";
this.cellSide = Double.parseDouble(args[1]);
this.inputPath = new Path(args[2]);
this.outputDir = new Path(args[3]);
// Recompute the cell size so that we obtain
// at least minNumGriglie (10) grids!
Grid g = new Grid(mbr, cellSide);
if ((this.cellSide*(Math.pow(2,minNumGriglie))) > g.width)
this.cellSide = g.width/(Math.pow(2,minNumGriglie));
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new BoxCount(args), args);
System.exit(res);
}
public int run(String[] args) throws Exception
{
// define new job instead of null using conf
Configuration conf = getConf();
@SuppressWarnings("deprecation")
Job job = new Job(conf, "BoxCount");
// conf.set("mapreduce.framework.name", "local");
// conf.set("mapreduce.jobtracker.address", "local");
// conf.set("fs.defaultFS","file:///");
// pass the mbr value used to build the grid
conf.set("mbr", mbr);
// pass the cell side
conf.setDouble("cellSide", cellSide);
job.setJarByClass(BoxCount.class);
// set job input format
job.setInputFormatClass(TextInputFormat.class);
// set map class and the map output key and value classes
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(Map.class);
// set reduce class and the reduce output key and value classes
job.setReducerClass(Reduce.class);
// set job output format
job.setOutputFormatClass(TextOutputFormat.class);
// add the input file as job input (from HDFS) to the variable
// inputFile
TextInputFormat.setInputPaths(job, inputPath);
// set the output path for the job results (to HDFS) to the variable
// outputPath
TextOutputFormat.setOutputPath(job, outputDir);
// set the number of reducers using variable numberReducers
job.setNumReduceTasks(numReducers);
// set the jar class
job.setJarByClass(BoxCount.class);
return job.waitForCompletion(true) ? 0 : 1; // this will execute the job
}
}
But the job does not run. What is the correct solution?
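For reference, a common way to pass such parameters with the new org.apache.hadoop.mapreduce API is to set them on the Configuration before the Job is created (the Job copies the configuration at construction time) and to read them back in setup() through context.getConfiguration(); configure(JobConf) belongs to the old mapred API and is never invoked for a mapreduce.Mapper. A minimal sketch, not the original author's code:
// Driver side: set the values *before* creating the Job.
Configuration conf = getConf();
conf.set("mbr", mbr);
conf.set("cellSide", Double.toString(cellSide)); // stored as a string; getDouble may not exist in 0.20.2
Job job = new Job(conf, "BoxCount");

// Mapper side: read the values in setup() from the task's configuration.
public static class Map extends Mapper<LongWritable, Text, IntWritable, Text> {
    private String mbr;
    private double cs;

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        mbr = conf.get("mbr");
        cs = Double.parseDouble(conf.get("cellSide", "0.1"));
    }

    // map() and cleanup() as before
}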
Related
I am new to MapReduce and Hadoop (Hadoop 3.2.3 and Java 8).
I am trying to separate some lines based on a symbol in the line.
Example: "q1,a,q0," should return ('a', "q1,a,q0,") as (key, value).
My dataset contains ten (10) lines, five (5) for key 'a' and five for key 'b'.
I expect to get 5 lines for each key, but I always get five for 'a' and 10 for 'b'.
Data
A,q0,a,q1;A,q0,b,q0;A,q1,a,q1;A,q1,b,q2;A,q2,a,q1;A,q2,b,q0;B,s0,a,s0;B,s0,b,s1;B,s1,a,s1;B,s1,b,s0
Mapper class:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<LongWritable, Text, ByteWritable ,Text>{
private ByteWritable key1 = new ByteWritable();
//private int n ;
private int count =0 ;
private Text wordObject = new Text();
@Override
public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
String ftext = value.toString();
for (String line: ftext.split(";")) {
wordObject = new Text();
if (line.split(",")[2].equals("b")) {
key1.set((byte) 'b');
wordObject.set(line) ;
context.write(key1,wordObject);
continue ;
}
key1.set((byte) 'a');
wordObject.set(line) ;
context.write(key1,wordObject);
}
}
}
Reducer class:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
public class MyReducer extends Reducer<ByteWritable, Text, ByteWritable ,Text>{
private Integer count=0 ;
@Override
public void reduce(ByteWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for(Text val : values ) {
count++ ;
}
Text symb = new Text(count.toString()) ;
context.write(key , symb);
}
}
Driver class:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.out.printf("Usage: %s [generic options] <inputdir> <outputdir>\n", getClass().getSimpleName());
return -1;
}
@SuppressWarnings("deprecation")
Job job = new Job(getConf());
job.setJarByClass(MyDriver.class);
job.setJobName("separation ");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(ByteWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(ByteWritable.class);
job.setOutputValueClass(Text.class);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new Configuration(), new MyDriver(), args);
System.exit(exitCode);
}
}
The problem was solved by moving the variable "count" inside the reduce() method, so it is reset for every key.
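For reference, a minimal sketch of the reducer with the counter as a local variable, per the fix described above:
import java.io.IOException;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<ByteWritable, Text, ByteWritable, Text> {
    @Override
    public void reduce(ByteWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int count = 0; // local variable, so it starts at zero for every key
        for (Text val : values) {
            count++;
        }
        context.write(key, new Text(Integer.toString(count)));
    }
}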
Does your input contain more than one line, with 5 extra 'b' records elsewhere? I cannot reproduce the problem with that single line, but your code can be cleaned up.
For the following code, I get output as
a 5
b 5
static class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, ByteWritable, Text> {
final ByteWritable keyOut = new ByteWritable();
final Text valueOut = new Text();
@Override
protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, ByteWritable, Text>.Context context) throws IOException, InterruptedException {
String line = value.toString();
if (line.isEmpty()) {
return;
}
StringTokenizer tokenizer = new StringTokenizer(line, ";");
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
String[] parts = token.split(",");
String keyStr = parts[2];
if (keyStr.matches("[ab]")) {
keyOut.set((byte) keyStr.charAt(0));
valueOut.set(token);
context.write(keyOut, valueOut);
}
}
}
}
static class Reducer extends org.apache.hadoop.mapreduce.Reducer<ByteWritable, Text, Text, LongWritable> {
static final Text keyOut = new Text();
static final LongWritable valueOut = new LongWritable();
@Override
protected void reduce(ByteWritable key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<ByteWritable, Text, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
keyOut.set(new String(new byte[]{key.get()}, StandardCharsets.UTF_8));
valueOut.set(StreamSupport.stream(values.spliterator(), true)
.mapToLong(v -> 1).sum());
context.write(keyOut, valueOut);
}
}
I am running HBase 1.2.4 standalone on an Ubuntu 17.04 machine.
I am trying to write a MapReduce job in Java that extracts metadata (i.e. column family identifiers followed by column identifiers) and sums up the number of records sharing the same schema.
I found a lot of examples and copied some code from
http://www.informit.com/articles/article.aspx?p=2262143&seqNum=2
which treats a similar issue.
The article
http://sujee.net/2011/04/10/hbase-map-reduce-example/
also seemed helpful.
My customized code compiles and runs, but my tests show that the reducer does not run, and in the end I get no results.
I attach the code, which contains some comments indicating the critical places, as well as some earlier attempts.
I hope somebody can give me a hint about the necessary corrections.
Code
import java.io.*;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.fs.Path;
//
import java.util.ArrayList;
import java.util.List;
//
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
* determines the number of identically structured records in a table
*
**/
public class MetaSummary {
static class Mapper1 extends TableMapper<Text, IntWritable> {
// if possible, pass the table name as a parameter later (String tableName)
private int numRecords = 0;
private static final IntWritable one = new IntWritable(1);
// added MU
private String l_row ="";
private String l_family;
private String l_qualifier;
private String l_out ="";
private byte[] l_bytearray;
private Text l_text;
private String l_mapout ="";
@Override
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
// for each record, build the string columnFamily:column; the composed
// string becomes the output key of the mapper
// the corresponding value is always 1
// the values for identical keys are summed up later in the reduce phase
// task: assign to the user key the string that describes the structure
//
// the user key is composed of the column family identifiers along with the respective column names
l_out="";
for(KeyValue kv : values.raw()){
l_family = new String(kv.getFamily());
l_qualifier = new String(kv.getQualifier());
l_out = l_out+l_family+":";
if (l_qualifier == null){ l_qualifier = "<null>"; }
if (l_qualifier.equals("")){ l_qualifier = "<leer>"; }
l_out = l_out +l_qualifier + " ";
}
l_out = l_out.trim();
l_mapout = l_mapout+ l_out + " ";
l_text = new Text(l_mapout);
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("mapout.txt"))) {
out.print(l_mapout); }
try {
//context.write(l_userkey, one); // former trials
// context.write(l_out, one);
context.write(l_text, one);
}
catch (InterruptedException e) {
throw new IOException(e);
}
}
}
static class Reducer1 extends TableReducer<ImmutableBytesWritable, IntWritable, ImmutableBytesWritable>
//public static class Reducer1 extends TableReducer<Text, IntWritable, ImmutableBytesWritable>
{
//public void reduce(String key, Iterable<IntWritable> values, Context context)
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("red1Anfang.txt"))) {
out.print("in Reducer1.reduce before for ..."); }
for (IntWritable val : values) {
sum += val.get();
System.out.println(sum);
Put put = new Put(key.getBytes());
// Put put = new Put(Bytes.toBytes(key.toString())); // former trials
// addColumn(byte[] family, byte[] qualifier, byte[] value)
put.addColumn(Bytes.toBytes("details"), Bytes.toBytes("total"), Bytes.toBytes(sum));
context.write(new ImmutableBytesWritable(key.getBytes()), put);
}
}
}
// the Reducer1 did not yield any results, so the next trial was to output into the file system
// which should be done by Reducer2
// which anyway does not yield any results
static class Reducer2 extends Reducer<Text, IntWritable, Text, IntWritable>
{
/*public Reducer2() {
}*/
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("red2Anfang.txt"))) {
out.print("in Reducer2.reduce Anfang"); }
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("redlaeuft.txt"))) {
out.print("reduce läuft"); }
String sumstr="";
int sum = 0;
// following code for test reasons only, to check if this part was running
try (PrintStream out = new PrintStream(new FileOutputStream("redoutvorfor.txt"))) {
out.print("in Reducer2.reduce vor Schleife"); }
for (IntWritable val : values) {
sum += val.get();
// the following lines for test reasons only
sumstr = new Integer(sum).toString();
try (PrintStream out = new PrintStream(new FileOutputStream("redout.txt"))) {
out.print(key.getBytes() + " " + sumstr); }
// Write out the key and the sum --- which of the following should do?
// context.write( new ImmutableBytesWritable(key.getBytes()), new IntWritable( sum ) );
//context.write( key, new IntWritable( sum ) );
// Even the simplest output does not work
context.write (new Text("abc"), new IntWritable(1));
}
}
}
public static void main(String[] args) throws Exception {
// HBaseConfiguration conf = new HBaseConfiguration(); // trial 1
Configuration conf = HBaseConfiguration.create();
Path output=new Path("Output");
// Job job = new Job(conf, "HBase_MetaSummary"); // trial 1
Job job = Job.getInstance(conf, "HBase_MetaSummary");
job.setJarByClass(MetaSummary.class);
Scan scan = new Scan();
TableMapReduceUtil.initTableMapperJob("videodaten", scan, Mapper1.class, ImmutableBytesWritable.class,
IntWritable.class, job);
// job.setMapperClass(Mapper1.class); // does not change anything
job.setReducerClass(Reducer2.class);
// the following commented-out lines should have caused the reduce results to be written to an HBase table
// precondition: a table was created before :: create values'meta_summary', {NAME=>'details',VERSIONS=>1)
//TableMapReduceUtil.initTableReducerJob("meta_summary", Reducer1.class, job);
// instead I try to write into a text file which should do as well
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass( TextOutputFormat.class );
job.setNumReduceTasks( 1 );
FileOutputFormat.setOutputPath(job, output);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
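One detail worth checking, though it is not confirmed anywhere in the post: initTableMapperJob above declares the map output key class as ImmutableBytesWritable while Mapper1 actually writes Text keys, and the registered reducer must be declared with the same map output key/value types. A rough sketch of a driver where the declared types line up with what the mapper emits, assuming Reducer2 stays declared as Reducer<Text, IntWritable, Text, IntWritable>:
Configuration conf = HBaseConfiguration.create();
Job job = Job.getInstance(conf, "HBase_MetaSummary");
job.setJarByClass(MetaSummary.class);

Scan scan = new Scan();
// declare the map output types as the ones Mapper1 really emits: Text / IntWritable
TableMapReduceUtil.initTableMapperJob("videodaten", scan, Mapper1.class,
        Text.class, IntWritable.class, job);

job.setReducerClass(Reducer2.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setNumReduceTasks(1);
FileOutputFormat.setOutputPath(job, new Path("Output"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
Marking reduce() with @Override in Reducer1/Reducer2 would also make the compiler flag signature mismatches like these.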
I'd like to replace values of the input data in my mapper, using dictionaries (CSV) defined in another file. So I tried to load the CSV data into a HashMap and refer to it in the mapper.
The Java code and CSV below are a simplified version of my program. This code works in my local environment (Mac OS X, pseudo-distributed mode), but doesn't in my EC2 instance (Ubuntu, pseudo-distributed mode).
In detail, I got this stdout during the process:
cat:4
human:2
flamingo:1
This means the file reader successfully put the CSV data into the HashMap.
However, the mapper mapped nothing and therefore I got empty output in the EC2 environment, although locally it mapped 3 * (the number of lines of the input file) elements and generated the following:
test,cat
test,flamingo
test,human
Does anyone have answers or hints?
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void get_list(String list_path){
try {
FileReader fr = new FileReader(list_path);
BufferedReader br = new BufferedReader(fr);
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
get_list(args[2]);
Job job = Job.getInstance(conf, "test");
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
list.csv (args[2])
name,legs
cat,4
human,2
flamingo,1
=================================
I referred to @Rahul Sharma's answer and modified my code as below. Now my code works in both environments.
Thank you very much @Rahul Sharma and @Serhiy for your precise answer and useful comments.
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.net.URI;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] files = context.getCacheFiles();
Path list_path = new Path(files[0]);
try {
FileSystem fs = list_path.getFileSystem(context.getConfiguration());
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(list_path)));
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
// Writer
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
Job job = Job.getInstance(conf, "test");
job.addCacheFile(new Path(args[2]).toUri());
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
First you need to learn more about the MapReduce framework.
Your program behaves as expected in local mode because the Mapper, Reducer and Job are launched in the same JVM. In pseudo-distributed or fully distributed mode, separate JVMs are allocated for each component, so the values you put into the HashMap in get_list are not visible to the mapper and reducer, which run in their own JVMs.
Use the distributed cache to make it work in cluster mode.
In the job main class, add the file to the distributed cache:
JobConf job = new JobConf();
DistributedCache.addCacheArchive(new URI(args[2]), job);
Access the file in the mapper or reducer:
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.getLocal(conf);
Path[] dataFile = DistributedCache.getLocalCacheFiles(conf);
BufferedReader cacheReader = new BufferedReader(new InputStreamReader(fs.open(dataFile[0])));
// Implement here get_list method functionality
}
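Note that JobConf and DistributedCache belong to the older mapred-style API. With the newer mapreduce API the equivalent calls, as used in the updated Test.java above, are roughly:
// Driver
job.addCacheFile(new Path(args[2]).toUri());

// Mapper setup()
URI[] files = context.getCacheFiles();
Path listPath = new Path(files[0]);
FileSystem fs = listPath.getFileSystem(context.getConfiguration());
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(listPath)));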
I have to process 250 XML files, each of which is 25 MB in size. For processing the XML files I am using XMLInputFormat from Apache Mahout and generating a sequence file, where the key is the filename and the value is the entire file content. But the problem with this approach is that 250 mappers are launched, which makes the MapReduce job slower.
I have come across CombineFileInputFormat (while going through Tom White's book), with which 250 mappers would not be launched for 250 files. But CombineFileInputFormat is an abstract class and I am facing difficulty implementing it for XML files, as I am new to Java as well as Hadoop.
So, can someone please provide an implementation of CombineFileInputFormat for XML files?
Driver Code:
package com.ericsson.sequencefile;
//A MapReduce program for packaging a collection of small files as a single SequenceFile.
//hadoop jar sequencefiles.jar com.ericsson.sequencefile.SmallFilesToSequenceFileConverter -D xmlinput.start="<XMLstart>" -D xmlinput.end="</XMLstart>" /IRIS_NG/pfinder2/ccn/archive /IRIS_NG/pfinder2/output
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {
public static class SequenceFileMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text filenameKey;
@Override
public void setup(Context context) throws IOException, InterruptedException {
InputSplit split = context.getInputSplit();
Path path = ((FileSplit) split).getPath();
filenameKey = new Text(path.toString() + "\n");
}
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String document = value.toString();
context.write(filenameKey, new Text(document));
}
}
@Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Configuration conf = getConf();
Job job = Job.getInstance(conf,"SmallFilesToSequenceFile");
job.setJarByClass(getClass());
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setInputFormatClass(XmlInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(SequenceFileMapper.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
System.exit(exitCode);
}
}
XmlInputFormat.java
package com.ericsson.sequencefile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.slf4j.*;
import java.io.IOException;
/**
* Reads records that are delimited by a specific begin/end tag.
*/
public class XmlInputFormat extends TextInputFormat {
private static final Logger log =
LoggerFactory.getLogger(XmlInputFormat.class);
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
@Override
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
try {
return new XmlRecordReader((FileSplit) split,
context.getConfiguration());
} catch (IOException ioe) {
log.warn("Error while creating XmlRecordReader", ioe);
return null;
}
}
/**
* XMLRecordReader class to read through a given xml document to
* output xml blocks as records as specified
* by the start tag and end tag
*/
public static class XmlRecordReader
extends RecordReader<LongWritable, Text> {
private final byte[] startTag;
private final byte[] endTag;
private final long start;
private final long end;
private final FSDataInputStream fsin;
private final DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable currentKey;
private Text currentValue;
public XmlRecordReader(FileSplit split, Configuration conf)
throws IOException {
startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");
// open the file and seek to the start of the split
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
FileSystem fs = file.getFileSystem(conf);
fsin = fs.open(split.getPath());
fsin.seek(start);
}
private boolean next(LongWritable key, Text value)
throws IOException {
if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
key.set(fsin.getPos());
value.set(buffer.getData(), 0, buffer.getLength());
return true;
}
} finally {
buffer.reset();
}
}
return false;
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return (fsin.getPos() - start) / (float) (end - start);
}
private boolean readUntilMatch(byte[] match, boolean withinBlock)
throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
// end of file:
if (b == -1) {
return false;
}
// save to buffer:
if (withinBlock) {
buffer.write(b);
}
// check if we're matching:
if (b == match[i]) {
i++;
if (i >= match.length) {
return true;
}
} else {
i = 0;
}
// see if we've passed the stop point:
if (!withinBlock && i == 0 && fsin.getPos() >= end) {
return false;
}
}
}
@Override
public LongWritable getCurrentKey()
throws IOException, InterruptedException {
return currentKey;
}
@Override
public Text getCurrentValue()
throws IOException, InterruptedException {
return currentValue;
}
@Override
public void initialize(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
}
@Override
public boolean nextKeyValue()
throws IOException, InterruptedException {
currentKey = new LongWritable();
currentValue = new Text();
return next(currentKey, currentValue);
}
}
}
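No CombineFileInputFormat implementation is given in this thread, but as a rough, untested sketch of the usual pattern: subclass CombineFileInputFormat and return a CombineFileRecordReader that delegates to a per-file reader constructed with (CombineFileSplit, TaskAttemptContext, Integer). The class names below are illustrative; this per-file reader simply loads each XML file whole (path as key, full content as value), which matches the sequence file described above, and it could instead delegate to the existing XmlRecordReader if per-tag records are needed.
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

// Packs many small XML files into fewer splits; one mapper then handles several files.
public class CombineXmlInputFormat extends CombineFileInputFormat<Text, Text> {

    public CombineXmlInputFormat() {
        setMaxSplitSize(128 * 1024 * 1024); // ~128 MB per combined split (tune as needed)
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false; // keep each XML file in one piece
    }

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        return new CombineFileRecordReader<Text, Text>(
                (CombineFileSplit) split, context, WholeFileRecordReader.class);
    }

    // Reads one file of the combined split entirely; key = file path, value = file contents.
    public static class WholeFileRecordReader extends RecordReader<Text, Text> {
        private final Path path;
        private final long length;
        private final FileSystem fs;
        private final Text key = new Text();
        private final Text value = new Text();
        private boolean processed = false;

        public WholeFileRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index)
                throws IOException {
            path = split.getPath(index);
            length = split.getLength(index);
            fs = path.getFileSystem(context.getConfiguration());
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) { }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (processed) {
                return false;
            }
            byte[] contents = new byte[(int) length];
            FSDataInputStream in = fs.open(path);
            try {
                IOUtils.readFully(in, contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            key.set(path.toString());
            value.set(contents, 0, contents.length);
            processed = true;
            return true;
        }

        @Override
        public Text getCurrentKey() { return key; }

        @Override
        public Text getCurrentValue() { return value; }

        @Override
        public float getProgress() { return processed ? 1.0f : 0.0f; }

        @Override
        public void close() { }
    }
}
With an input format along these lines, SequenceFileMapper would receive Text/Text pairs and would no longer need to read the FileSplit in setup(); the split size cap in the constructor is what lets several 25 MB files share one mapper.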
I am new to Hadoop's MapReduce. I have written a map reduce task and I am trying to run it on my local machine. But the job hangs after map 100%.
Below is the code; I don't understand what I am missing.
I have a custom key class:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class AirlineMonthKey implements WritableComparable<AirlineMonthKey>{
Text airlineName;
Text month;
public AirlineMonthKey(){
super();
}
public AirlineMonthKey(Text airlineName, Text month) {
super();
this.airlineName = airlineName;
this.month = month;
}
public Text getAirlineName() {
return airlineName;
}
public void setAirlineName(Text airlineName) {
this.airlineName = airlineName;
}
public Text getMonth() {
return month;
}
public void setMonth(Text month) {
this.month = month;
}
@Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
this.airlineName.readFields(in);
this.month.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
this.airlineName.write(out);
this.month.write(out);
}
@Override
public int compareTo(AirlineMonthKey airlineMonthKey) {
// TODO Auto-generated method stub
int diff = getAirlineName().compareTo(airlineMonthKey.getAirlineName());
if(diff != 0){
return diff;
}
int m1 = Integer.parseInt(getMonth().toString());
int m2 = Integer.parseInt(airlineMonthKey.getMonth().toString());
if(m1>m2){
return -1;
}
else
return 1;
}
}
The mapper and reducer classes that use the custom key are below.
package com.mapresuce.secondarysort;
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.opencsv.CSVReader;
public class FlightDelayByMonth {
public static class FlightDelayByMonthMapper extends
Mapper<Object, Text, AirlineMonthKey, Text> {
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str = value.toString();
// Reading Line one by one from the input CSV.
CSVReader reader = new CSVReader(new StringReader(str));
String[] split = reader.readNext();
reader.close();
String airlineName = split[6];
String month = split[2];
String year = split[0];
String delayMinutes = split[37];
String cancelled = split[41];
if (!(airlineName.equals("") || month.equals("") || delayMinutes
.equals(""))) {
if (year.equals("2008") && cancelled.equals("0.00")) {
AirlineMonthKey airlineMonthKey = new AirlineMonthKey(
new Text(airlineName), new Text(month));
Text delay = new Text(delayMinutes);
context.write(airlineMonthKey, delay);
System.out.println("1");
}
}
}
}
public static class FlightDelayByMonthReducer extends
Reducer<AirlineMonthKey, Text, Text, Text> {
public void reduce(AirlineMonthKey key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
for(Text val : values){
context.write(new Text(key.getAirlineName().toString()+" "+key.getMonth().toString()), val);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage:<in> <out>");
System.exit(2);
}
Job job = new Job(conf, "Average monthly flight dealy");
job.setJarByClass(FlightDelayByMonth.class);
job.setMapperClass(FlightDelayByMonthMapper.class);
job.setReducerClass(FlightDelayByMonthReducer.class);
job.setOutputKeyClass(AirlineMonthKey.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Also, I have created a job and configuration in main. I don't know what I am missing. I am running all of this in a local environment.
Try writing custom implementations of toString, equals and hashCode in your AirlineMonthKey class.
Read the link below:
http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/io/WritableComparable.html
It is important for key types to implement hashCode().
Hope this helps.
The issue was that the default constructor of AirlineMonthKey (which I did provide) also has to initialize the instance variables of the custom key class (which I didn't do).
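For reference, a minimal sketch of the key class with the fields initialized in the no-arg constructor (Hadoop creates keys through that constructor and then calls readFields(), so the Text objects must already exist), together with the hashCode/equals suggested above. One further point, not stated in the thread: compareTo() should return 0 when two keys are equal, otherwise values for the same airline and month are never grouped in the reducer, so the sketch uses Integer.compare while keeping the original descending order by month.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class AirlineMonthKey implements WritableComparable<AirlineMonthKey> {

    private Text airlineName;
    private Text month;

    // Hadoop instantiates keys via the no-arg constructor and then calls
    // readFields(), so the Text fields must already be initialized here.
    public AirlineMonthKey() {
        this.airlineName = new Text();
        this.month = new Text();
    }

    public AirlineMonthKey(Text airlineName, Text month) {
        this.airlineName = airlineName;
        this.month = month;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        airlineName.readFields(in);
        month.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        airlineName.write(out);
        month.write(out);
    }

    @Override
    public int compareTo(AirlineMonthKey other) {
        int diff = airlineName.compareTo(other.airlineName);
        if (diff != 0) {
            return diff;
        }
        int m1 = Integer.parseInt(month.toString());
        int m2 = Integer.parseInt(other.month.toString());
        return Integer.compare(m2, m1); // descending by month, 0 when equal
    }

    @Override
    public boolean equals(Object o) {
        if (!(o instanceof AirlineMonthKey)) {
            return false;
        }
        AirlineMonthKey other = (AirlineMonthKey) o;
        return airlineName.equals(other.airlineName) && month.equals(other.month);
    }

    @Override
    public int hashCode() {
        return 31 * airlineName.hashCode() + month.hashCode();
    }

    public Text getAirlineName() { return airlineName; }

    public Text getMonth() { return month; }
}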