java-hadoop using virtualbox -- no output - java

This is my code:
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class solution1 {
    public static void main(String[] args) throws IOException {
        String localStr = args[0];  // local input directory
        String hdfsStr = args[1];   // HDFS destination
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create(hdfsStr), conf);
        FileSystem local = FileSystem.getLocal(conf);
        Path inputDir = new Path(localStr);
        String folderName = inputDir.getName();
        Path hdfsFile = new Path(hdfsStr, folderName);
        try {
            // list every file in the local directory and merge them into a single HDFS file
            FileStatus[] inputFiles = local.listStatus(inputDir);
            FSDataOutputStream out = hdfs.create(hdfsFile);
            for (int i = 0; i < inputFiles.length; i++) {
                System.out.println(inputFiles[i].getPath().getName());
                FSDataInputStream in = local.open(inputFiles[i].getPath());
                byte[] buffer = new byte[256];
                int bytesRead = 0;
                while ((bytesRead = in.read(buffer)) > 0) {
                    out.write(buffer, 0, bytesRead);
                }
                in.close();
            }
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
I am trying to use this code to loop through hundreds of .txt files; there are 500 of these files, all with similar content.
Update: this is what I typed in the VirtualBox terminal and the result I get -- the output is not the expected output and there is no successful result. It is not reading the content inside the individual .txt files. But if I use the same command to run another Java file (the WordCount example), it runs successfully.
The code for the WordCount Java class is here:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
I am stuck and cannot really figure out how to proceed. What should I type in the VirtualBox terminal for my code to read all 500 .txt files?
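For reference, running a standalone HDFS client class like this normally means compiling it against the Hadoop libraries, packaging it into a jar, and passing the local directory and the HDFS destination as the two arguments. The commands below are only a rough sketch; the jar name, local path, and namenode URI are placeholders that have to match the actual VM setup:

javac -classpath $(hadoop classpath) solution1.java
jar cf solution1.jar solution1*.class
# args[0] = local directory holding the 500 .txt files (placeholder path)
# args[1] = HDFS destination (placeholder namenode URI)
hadoop jar solution1.jar solution1 /home/hduser/txtfiles hdfs://localhost:9000/user/hduser

One thing worth checking is that args[0] really points at the directory holding the .txt files: the loop prints every file name it copies, so an empty or wrong local path produces no output at all.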

Related

How do I import and use a class in Mapper in Hadoop?

I have a class PorterStemmer which I would like to use in my Mapper. My Driver class contains the Mapper and Reducer too. I tried putting the PorterStemmer class in the Driver class, but Hadoop threw a ClassNotFoundException at runtime. I also tried putting PorterStemmer in a JAR and adding it to the distributed cache, but then I obviously got a compile-time error since PorterStemmer wasn't present inside the Driver class. Is there any way I can get around this problem?
Here is my Driver class
public class InvertedIndex {
public static class IndexMapper extends Mapper<Object, Text, Text, Text>{
private Text word = new Text();
private Text filename = new Text();
private boolean caseSensitive = false;
public static PorterStemmer stemmer = new PorterStemmer();
String token;
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String filenameStr = ((FileSplit) context.getInputSplit()).getPath().getName();
filename = new Text(filenameStr);
String line = value.toString();
if (!caseSensitive) {
line = line.toLowerCase();
}
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
token = tokenizer.nextToken();
stemmer.add(token.toCharArray(), token.length());
stemmer.stem();
token =stemmer.toString();
word.set(token);
context.write(word, filename);
}
}
}
public static class IndexReducer extends Reducer<Text,Text,Text,Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuilder stringBuilder = new StringBuilder();
for (Text value : values) {
stringBuilder.append(value.toString());
if (values.iterator().hasNext()) {
stringBuilder.append(" -> ");
}
}
context.write(key, new Text(stringBuilder.toString()));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "inverted index");
job.addCacheFile(new Path("/invertedindex/lib/stemmer.jar").toUri());
job.setJarByClass(InvertedIndex.class);
/* Field separator for reducer output*/
job.getConfiguration().set("mapreduce.output.textoutputformat.separator", " | ");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(IndexMapper.class);
job.setCombinerClass(IndexReducer.class);
job.setReducerClass(IndexReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path inputFilePath = new Path(args[0]);
Path outputFilePath = new Path(args[1]);
FileInputFormat.addInputPath(job, inputFilePath);
FileOutputFormat.setOutputPath(job, outputFilePath);
/* Delete output filepath if already exists */
FileSystem fs = FileSystem.newInstance(conf);
if (fs.exists(outputFilePath)) {
fs.delete(outputFilePath, true);
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Either build a fat jar with all the dependencies, or ship the jar to the nodes using the process below.
You need to use -libjars to distribute the jar you are using to all nodes. The jar is then added to the classpath of each task node and picked up by the mapper or reducer:
hadoop jar yourJar.jar com.JobClass -libjars /path/of/stemmer.jar
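Note that -libjars is one of Hadoop's generic options, so it is only parsed when the driver goes through GenericOptionsParser, which is what ToolRunner does for you. A minimal sketch of a driver arranged that way (InvertedIndexDriver is a made-up name here; the mapper/reducer wiring stays exactly as in the question):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class InvertedIndexDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already carries whatever -libjars / -D options were passed on the command line
        Job job = Job.getInstance(getConf(), "inverted index");
        job.setJarByClass(InvertedIndexDriver.class);
        // ... same mapper/reducer/input/output configuration as in the question ...
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner invokes GenericOptionsParser, which strips and applies -libjars
        System.exit(ToolRunner.run(new Configuration(), new InvertedIndexDriver(), args));
    }
}

With that in place, the hadoop jar ... -libjars /path/of/stemmer.jar invocation above ships the extra jar to the task nodes and puts it on their classpath.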

Why is the job chaining not working in MapReduce?

I created two jobs and want to chain them so that one job is executed right after the previous job completes, so I wrote the following code. But as far as I have observed, job1 finishes correctly and job2 never seems to execute.
public class Simpletask extends Configured implements Tool {
public static enum FileCounters {
COUNT;
}
public static class TokenizerMapper extends Mapper<Object, Text, IntWritable, Text>{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
String line = itr.nextToken();
String part[] = line.split(",");
int id = Integer.valueOf(part[0]);
int x1 = Integer.valueOf(part[1]);
int y1 = Integer.valueOf(part[2]);
int z1 = Integer.valueOf(part[3]);
int x2 = Integer.valueOf(part[4]);
int y2 = Integer.valueOf(part[5]);
int z2 = Integer.valueOf(part[6]);
int h_v = Hilbert(x1,y1,z1);
int parti = h_v/10;
IntWritable partition = new IntWritable(parti);
Text neuron = new Text();
neuron.set(line);
context.write(partition,neuron);
}
}
public int Hilbert(int x,int y,int z){
return (int) (Math.random()*20);
}
}
public static class IntSumReducer extends Reducer<IntWritable,Text,IntWritable,Text> {
private Text result = new Text();
private MultipleOutputs<IntWritable, Text> mos;
public void setup(Context context) {
mos = new MultipleOutputs<IntWritable, Text>(context);
}
<K, V> String generateFileName(K k) {
return "p"+k.toString();
}
public void reduce(IntWritable key,Iterable<Text> values, Context context) throws IOException, InterruptedException {
String accu = "";
for (Text val : values) {
String[] entry=val.toString().split(",");
String MBR = entry[1];
accu+=entry[0]+",MBR"+MBR+" ";
}
result.set(accu);
context.getCounter(FileCounters.COUNT).increment(1);
mos.write(key, result, generateFileName(key));
}
}
public static class RTreeMapper extends Mapper<Object, Text, IntWritable, Text>{
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
System.out.println("WOWOWOWOW RUNNING");// NOTHING SHOWS UP!
}
}
public static class RTreeReducer extends Reducer<IntWritable,Text,IntWritable,Text> {
private MultipleOutputs<IntWritable, Text> mos;
Text t = new Text();
public void setup(Context context) {
mos = new MultipleOutputs<IntWritable, Text>(context);
}
public void reduce(IntWritable key,Iterable<Text> values, Context context) throws IOException, InterruptedException {
t.set("dsfs");
mos.write(key, t, "WOWOWOWOWOW"+key.get());
//ALSO, NOTHING IS WRITTEN TO THE FILE!!!!!
}
}
public static class RTreeInputFormat extends TextInputFormat{
protected boolean isSplitable(FileSystem fs, Path file) {
return false;
}
}
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Enter valid number of arguments <Inputdirectory> <Outputlocation>");
System.exit(0);
}
ToolRunner.run(new Configuration(), new Simpletask(), args);
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Job1");
job.setJarByClass(Simpletask.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
boolean complete = job.waitForCompletion(true);
//================RTree Loop============
int capacity = 3;
Configuration rconf = new Configuration();
Job rtreejob = Job.getInstance(rconf, "rtree");
if(complete){
int count = (int) job.getCounters().findCounter(FileCounters.COUNT).getValue();
System.out.println("File count: "+count);
String path = null;
for(int i=0;i<count;i++){
path = "/Worker/p"+i+"-m-00000";
System.out.println("Add input path: "+path);
FileInputFormat.addInputPath(rtreejob, new Path(path));
}
System.out.println("Input path done.");
FileOutputFormat.setOutputPath(rtreejob, new Path("/RTree"));
rtreejob.setJarByClass(Simpletask.class);
rtreejob.setMapperClass(RTreeMapper.class);
rtreejob.setCombinerClass(RTreeReducer.class);
rtreejob.setReducerClass(RTreeReducer.class);
rtreejob.setOutputKeyClass(IntWritable.class);
rtreejob.setOutputValueClass(Text.class);
rtreejob.setInputFormatClass(RTreeInputFormat.class);
complete = rtreejob.waitForCompletion(true);
}
return 0;
}
}
For a MapReduce job, the output directory should not already exist. Hadoop checks for the output directory first; if it exists, the job will fail. In your case, you specified the same output directory for both jobs. I modified your code and changed args[1] to args[2] for job2, so the third argument is now the output directory of the second job. Pass a third argument as well.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Job1");
job.setJarByClass(Simpletask.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//AND THEN I WAIT THIS JOB TO COMPLETE.
boolean complete = job.waitForCompletion(true);
//I START A NEW JOB, BUT WHY IS IT NOT RUNNING?
Configuration conf = new Configuration();
Job job2 = Job.getInstance(conf, "Job2");
job2.setJarByClass(Simpletask.class);
job2.setMapperClass(TokenizerMapper.class);
job2.setCombinerClass(IntSumReducer.class);
job2.setReducerClass(IntSumReducer.class);
job2.setOutputKeyClass(IntWritable.class);
job2.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[2]));
A few possible causes of errors:
- conf is declared twice (no compile error there?)
- The output path of job2 already exists, as it was created by job1 (+1 to Amal G Jose's answer)
- I think you should also use job.setMapOutputKeyClass(Text.class); and job.setMapOutputValueClass(IntWritable.class); for both jobs.
- Do you also have a command to execute job2 after the code snippet that you posted? I mean, do you actually run job2.waitForCompletion(true); or something similar?
Overall: check the logs for error messages, which should clearly explain what went wrong.
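Pulling those points together, a bare-bones sketch of the chaining itself (JobChainSketch is a made-up name, the mapper/reducer wiring is elided, and feeding job1's output directory to job2 is just one common arrangement, not necessarily what Simpletask needs):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobChainSketch {
    public static void main(String[] args) throws Exception {
        // Job 1: reads args[0], writes args[1]
        Job job1 = Job.getInstance(new Configuration(), "Job1");
        // ... setJarByClass / mapper / reducer / key-value classes for job1 ...
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));

        // Only continue when job1 has finished successfully
        if (!job1.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2 gets its own, not-yet-existing output directory (args[2])
        Job job2 = Job.getInstance(new Configuration(), "Job2");
        // ... setJarByClass / mapper / reducer / key-value classes for job2 ...
        FileInputFormat.addInputPath(job2, new Path(args[1]));
        FileOutputFormat.setOutputPath(job2, new Path(args[2]));

        // Without this call job2 is configured but never actually submitted
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}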

Hadoop mapper is never called, custom input format might be the issue

So I am writing a little test program just to get the hang of Hadoop's InputFormat classes. I had a word search already built which took in lines as values and searched for the word line by line. I wanted to see if I could get Hadoop to take in values word by word, but Hadoop doesn't seem to like that and keeps giving me results from the default mapper. My mapper's initialize function is never even called.
I do know my record reader is called and that it is doing more or less what it is supposed to, and I'm pretty sure the output of the record reader is what my mapper is searching for, so why does Hadoop decide not to call it?
Here is the relevant code
Input Format Class
public class WordReader extends FileInputFormat<Text, Text> {
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) {
return new MyWholeFileReader();
}
}
Record Reader
public class MyWholeFileReader extends RecordReader<Text, Text> {
private long start;
private LineReader in;
private Text key = null;
private Text value = null;
private ArrayList<String> outputvalues;
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
outputvalues = new ArrayList<String>();
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
start = split.getStart();
final Path file = split.getPath();
// open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
in = new LineReader(fileIn, job);
if (key == null) {
key = new Text();
}
key.set(split.getPath().getName());
if (value == null) {
value = new Text();
}
}
public boolean nextKeyValue() throws IOException {
if (outputvalues.size() == 0) {
Text buffer = new Text();
int i = in.readLine(buffer);
String str = buffer.toString();
for (String vals : str.split(" ")) {
outputvalues.add(vals);
}
if (i == 0 || outputvalues.size() == 0) {
key = null;
value = null;
return false;
}
}
value.set(outputvalues.remove(0));
System.out.println(value.toString());
return true;
}
@Override
public Text getCurrentKey() {
return key;
}
@Override
public Text getCurrentValue() {
return value;
}
/**
*
* Get the progress within the split
*/
public float getProgress() {
return 0.0f;
}
public synchronized void close() throws IOException {
if (in != null) {
in.close();
}
}
}
Mapper
public class WordSearchMapper extends Mapper<Text, Text, OutputCollector<Text,IntWritable>, Reporter> {
static String keyword;
BloomFilter<String> b;
public void configure(JobContext jobConf) {
keyword = jobConf.getConfiguration().get("keyword");
System.out.println("keyword>> " + keyword);
b = new BloomFilter<String>(.01,10000);
b.add(keyword);
System.out.println(b.getExpectedBitsPerElement());
}
public void map(Text key, Text value, OutputCollector<Text,IntWritable> output,
Reporter reporter) throws IOException {
int wordPos;
System.out.println("value.toString()>> " + value.toString());
System.out.println(((FileSplit) reporter.getInputSplit()).getPath()
.getName());
String[] tokens = value.toString().split("[\\p{P} \\t\\n\\r]");
for (String st :tokens) {
if (b.contains(st)) {
if (value.toString().contains(keyword)) {
System.out.println("Found one");
wordPos = ((Text) value).find(keyword);
output.collect(value, new IntWritable(wordPos));
}
}
}
}
}
Driver:
public class WordSearch {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf,"WordSearch");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(WordSearchMapper.class);
job.setInputFormatClass( WordReader.class);
job.setOutputFormatClass(TextOutputFormat.class);
conf.set("keyword", "the");
FileInputFormat.setInputPaths(job, new Path("search.txt"));
FileOutputFormat.setOutputPath(job, new Path("outputs"+System.currentTimeMillis()));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
And I figured it out... This is why Hadoop needs to stop supporting multiple versions of itself, or why I should stop jamming multiple tutorials together. It turns out my mapper needs to be set up like this for the way my mapper and record reader interact:
public class WordSearchMapper extends Mapper<Text, Text, Text, IntWritable> { static String keyword;
I only realized this after looking at my imports and seeing that Reporter was from the package org.apache.hadoop.mapred as opposed to org.apache.hadoop.mapreduce.
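Spelled out, the fix is to use the new org.apache.hadoop.mapreduce Mapper signature, whose input types have to match what the custom RecordReader emits (a Text key and a Text value). A rough sketch of the corrected mapper, with the body boiled down to a plain substring check (the original Bloom-filter logic would slot in the same way):

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// New-API mapper: the input key/value types must match what WordReader / MyWholeFileReader produce.
public class WordSearchMapper extends Mapper<Text, Text, Text, IntWritable> {
    private String keyword;

    @Override
    protected void setup(Context context) {
        // setup() replaces the old configure(JobConf) hook
        keyword = context.getConfiguration().get("keyword");
    }

    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // write through the Context instead of an OutputCollector/Reporter pair
        if (keyword != null && value.toString().contains(keyword)) {
            context.write(value, new IntWritable(value.find(keyword)));
        }
    }
}

The OutputCollector/Reporter parameters in the original mapper belong to the old org.apache.hadoop.mapred API; because that map method does not override the new API's Mapper.map, the framework silently falls back to the default identity mapper.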

ClassCast Error while writing to Cassandra from hadoop job

I am running a Hadoop job and trying to write the output to Cassandra. I am getting the following exception:
java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to java.nio.ByteBuffer
at org.apache.cassandra.hadoop.ColumnFamilyRecordWriter.write(ColumnFamilyRecordWriter.java:60)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:514)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at org.apache.hadoop.mapreduce.Reducer.reduce(Reducer.java:156)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:572)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:414)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1127)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
I modeled my map reduce code on the WordCount example given at https://wso2.org/repos/wso2/trunk/carbon/dependencies/cassandra/contrib/word_count/src/WordCount.java
Here's my MR code:
public class SentimentAnalysis extends Configured implements Tool {
static final String KEYSPACE = "Travel";
static final String OUTPUT_COLUMN_FAMILY = "Keyword_PtitleId";
public static class Map extends Mapper<LongWritable, Text, Text, LongWritable> {
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
Sentiment sentiment = null;
try {
sentiment = (Sentiment) PojoMapper.fromJson(line, Sentiment.class);
} catch(Exception e) {
return;
}
if(sentiment != null && sentiment.isLike()) {
word.set(sentiment.getNormKeyword());
context.write(word, new LongWritable(sentiment.getPtitleId()));
}
}
}
public static class Reduce extends Reducer<Text, LongWritable, ByteBuffer, List<Mutation>> {
private ByteBuffer outputKey;
public void reduce(Text key, Iterator<LongWritable> values, Context context) throws IOException, InterruptedException {
List<Long> ptitles = new ArrayList<Long>();
java.util.Map<Long, Integer> ptitleToFrequency = new HashMap<Long, Integer>();
while (values.hasNext()) {
Long value = values.next().get();
ptitles.add(value);
}
for(Long ptitle : ptitles) {
if(ptitleToFrequency.containsKey(ptitle)) {
ptitleToFrequency.put(ptitle, ptitleToFrequency.get(ptitle) + 1);
}
else {
ptitleToFrequency.put(ptitle, 1);
}
}
byte[] keyBytes = key.getBytes();
outputKey = ByteBuffer.wrap(Arrays.copyOf(keyBytes, keyBytes.length));
for(Long ptitle : ptitleToFrequency.keySet()) {
context.write(outputKey, Collections.singletonList(getMutation(new Text(ptitle.toString()), ptitleToFrequency.get(ptitle))));
}
}
private static Mutation getMutation(Text word, int sum)
{
Column c = new Column();
byte[] wordBytes = word.getBytes();
c.name = ByteBuffer.wrap(Arrays.copyOf(wordBytes, wordBytes.length));
c.value = ByteBuffer.wrap(String.valueOf(sum).getBytes());
c.timestamp = System.currentTimeMillis() * 1000;
Mutation m = new Mutation();
m.column_or_supercolumn = new ColumnOrSuperColumn();
m.column_or_supercolumn.column = c;
return m;
}
}
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new SentimentAnalysis(), args);
System.exit(ret);
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "SentimentAnalysis");
job.setJarByClass(SentimentAnalysis.class);
String inputFile = args[0];
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(ByteBuffer.class);
job.setOutputValueClass(List.class);
job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
job.setInputFormatClass(TextInputFormat.class);
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
FileInputFormat.setInputPaths(job, inputFile);
ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
}
If you look at the Reduce class, I am converting the Text field (the key) to a ByteBuffer properly.
I would appreciate some pointers on how to fix this.
After some trial and error, I was able to figure out how to solve this particular issue. Basically, in my reduce method signature I was using Iterator instead of Iterable, so my reducer was never called. As a result, Hadoop was trying to write my Mapper output (Text, LongWritable) to Cassandra using the output key/value classes declared for the Reducer (ByteBuffer, List), which caused the ClassCastException.
Changing the reduce method signature to Iterable solved the issue.
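For completeness, a sketch of what the corrected inner class looks like; only the signature changes, and the frequency counting plus mutation building stay exactly as in the original, so they are left as comments here:

// Inside SentimentAnalysis: Iterable instead of Iterator makes this a real override,
// so this reduce() (and its ByteBuffer/List<Mutation> output types) is actually used.
public static class Reduce extends Reducer<Text, LongWritable, ByteBuffer, List<Mutation>> {

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        for (LongWritable value : values) {
            // ... count the ptitle frequencies exactly as in the original code ...
        }
        // ... wrap the key in a ByteBuffer and write the Mutation list as before ...
    }
}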

Type mismatch in key from map when replacing Mapper with MultithreadMapper

I'd like to implement a MultithreadMapper for my MapReduce job.
For this I replaced Mapper with MultithreadMapper in working code.
Here's the exception I'm getting:
java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.IntWritable, recieved org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:862)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:549)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper$SubMapRecordWriter.write(MultithreadedMapper.java:211)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:124)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper$MapRunner.run(MultithreadedMapper.java:264)
Here's the code setup:
public static void main(String[] args) {
try {
if (args.length != 2) {
System.err.println("Usage: MapReduceMain <input path> <output path>");
System.exit(123);
}
Job job = new Job();
job.setJarByClass(MapReduceMain.class);
job.setInputFormatClass(TextInputFormat.class);
FileSystem fs = FileSystem.get(URI.create(args[0]), job.getConfiguration());
FileStatus[] files = fs.listStatus(new Path(args[0]));
for(FileStatus sfs:files){
FileInputFormat.addInputPath(job, sfs.getPath());
}
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MyMultithreadMapper.class);
job.setReducerClass(MyReducer.class);
MultithreadedMapper.setNumberOfThreads(job, MyMultithreadMapper.nThreads);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(MyPage.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);//write the result as sequential file
System.exit(job.waitForCompletion(true) ? 0 : 1);
} catch (Exception e) {
e.printStackTrace();
}
}
And here's the mapper's code:
public class MyMultithreadMapper extends MultithreadedMapper<LongWritable, Text, IntWritable, MyPage> {
ConcurrentLinkedQueue<MyScraper> scrapers = new ConcurrentLinkedQueue<MyScraper>();
public static final int nThreads = 5;
public MyMultithreadMapper() {
for (int i = 0; i < nThreads; i++) {
scrapers.add(new MyScraper());
}
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
MyScraper scraper = scrapers.poll();
MyPage result = null;
for (int i = 0; i < 10; i++) {
try {
result = scraper.scrapPage(value.toString(), true);
break;
} catch (Exception e) {
e.printStackTrace();
}
}
if (result == null) {
result = new MyPage();
result.setUrl(key.toString());
}
context.write(new IntWritable(result.getUrl().hashCode()), result);
scrapers.add(scraper);
}
}
Why the hell am I getting this?
Here's what has to be done (see the sketch below):
- MultithreadedMapper.setMapperClass(job, MyMapper.class);
- MyMapper must implement the map logic
- MultithreadMapper must be empty
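A minimal sketch of that wiring (MultithreadedSetupSketch and MyScraperMapper are made-up names; MyPage is the custom value class from the question and is assumed to be on the classpath):

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;

public class MultithreadedSetupSketch {

    // The real map logic lives in an ordinary Mapper subclass ...
    public static class MyScraperMapper extends Mapper<LongWritable, Text, IntWritable, MyPage> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // ... scrape the page and context.write(new IntWritable(...), result) as in the question ...
        }
    }

    public static void configure(Job job) {
        // ... while the job's mapper is MultithreadedMapper itself, which only
        // spawns the threads and delegates each record to MyScraperMapper.
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, MyScraperMapper.class);
        MultithreadedMapper.setNumberOfThreads(job, 5);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(MyPage.class);
    }
}

This also matches the type-mismatch exception: when the map logic sits directly in the MultithreadedMapper subclass and no delegate is registered, the inner mapper defaults to the identity Mapper, which passes the LongWritable input key straight through while the job expects IntWritable keys.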
