MapReduce Hadoop 2.4.1 Reducer not running - java

For some reason my Reducer doesn't seem to be running.
My Driver is:
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class PageRank {
public static void main(String[] args) throws Exception {
PageRank pageRanking = new PageRank();
//In and Out dirs in HDFS
pageRanking.runXmlParsing(args[0], args[1]);
System.out.println("finished");
}
public void runXmlParsing(String inputPath, String outputPath) throws IOException {
Configuration conf = new Configuration();
conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
Job job1 = Job.getInstance(conf);
job1.setJarByClass(PageRank.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
// Our class to parse links from content.
job1.setMapperClass(WikiPageXMLMapper.class);
job1.setReducerClass(WikiLinksReducer.class);
job1.setInputFormatClass(XmlInputFormat.class);
job1.setOutputFormatClass(TextOutputFormat.class);
// Remove output if already exists
FileSystem.getLocal(conf).delete(new Path(outputPath), true);
FileInputFormat.setInputPaths(job1, new Path(inputPath));
FileOutputFormat.setOutputPath(job1, new Path(outputPath));
System.out.println("BEFORE RUN");
try {
job1.waitForCompletion(true);
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void deleteDir(File dir) {
File[] files = dir.listFiles();
for (File myFile: files) {
if (myFile.isDirectory()) {
deleteDir(myFile);
}
myFile.delete();
}
}
}
My Mapper is:
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
public class WikiPageXMLMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
public void map(LongWritable key, Text value, Context output) throws IOException {
String[] titleAndText = parseTitleAndText(value.toString());
String pageString = titleAndText[0];
Text page = new Text(pageString.replace(' ', '_'));
String[] parts = titleAndText[1].split("\\[\\[");
String pages = "!##$ ";
for (int i = 1; i < parts.length; i++) {
int lastIndexBrackets = parts[i].lastIndexOf("]]");
// This checks and skips the first part of the outer link
if (lastIndexBrackets == -1)
continue;
String insideLinkPlusExtra = parts[i].substring(0, lastIndexBrackets);
int multipleClosingBrackets = insideLinkPlusExtra.indexOf("]]");
String otherPage = insideLinkPlusExtra;
if (multipleClosingBrackets != -1) {
otherPage = insideLinkPlusExtra.substring(0, multipleClosingBrackets);
}
otherPage = otherPage.split("\\|")[0];
otherPage = checkForDuplicates(otherPage, pages);
otherPage = (otherPage.indexOf(":") == -1) ? otherPage : "";
otherPage = (otherPage.indexOf("#") == -1) ? otherPage : "";
otherPage = checkForSubpageLinks(otherPage);
otherPage = checkForRedLink(otherPage);
if (otherPage == "")
continue;
Text oP = new Text(otherPage.replace(' ', '_'));
pages += oP + " ";
// taking each outlink and making it its own key (ingraph)
try {
output.write(new Text(oP), new Text(page));
} catch (InterruptedException e) {
e.printStackTrace();
}
}
// Designate this page as not a redlink
try {
output.write(new Text(page), new Text("!##$"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return;
}
}
And my Reducer is:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterator<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException {
System.out.println("REDUCER");
String links = "";
boolean isNotRedLink = false;
System.out.println("Starting reduce");
// Brett concern (and zach's): if n pages link to a redlink
// we will iterate n times and it could be wasteful
while(values.hasNext()){
String v = values.next().toString();
// Check first outlink is not #, if so, it is a redlink
if (v.equals("!##$")) {
isNotRedLink = true;
continue;
} else {
links += v;
continue;
}
}
// If the key is not a redlink, send it to the output
if (isNotRedLink) {
try {
output.write(key, new Text(links));
output.write(key, new Text("TESTING!"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(links);
} else {
System.out.println(output);
try {
output.write(key, new Text("BLEG"));
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(key + " IS A RED LINK");
return;
}
}
}
None of the System.out.println(...)s in my Reducer output to the console, and my output from the program (the file it leaves on my HDD) only has the results from the Mapper.

I feel silly. I tried Iterable instead of Iterator in the line public void reduce(Text key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException { and my issue is solved.
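For anyone who hits the same thing: with an Iterator parameter the method no longer matches the signature Hadoop actually calls, reduce(KEYIN key, Iterable<VALUEIN> values, Context context), so it never overrides the base class method and the default identity reduce runs instead, which is why the output file contained only the mapper's records. A minimal sketch of the corrected reducer, based on the logic above:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {
    @Override // compiles only if this signature really overrides reduce()
    public void reduce(Text key, Iterable<Text> values, Context output)
            throws IOException, InterruptedException {
        String links = "";
        boolean isNotRedLink = false;
        for (Text value : values) {
            String v = value.toString();
            if (v.equals("!##$")) {
                isNotRedLink = true; // marker the mapper emits for pages that exist
            } else {
                links += v;
            }
        }
        if (isNotRedLink) {
            output.write(key, new Text(links));
        }
    }
}
Adding @Override is cheap insurance here: with the wrong parameter type, the annotation turns the silent mismatch into a compile error.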

Related

Hadoop - Reducer class in java

I am developing a Hadoop project in Java. I want to find the customers with the max consumption on a certain day. I have managed to find the customers on the date I want, but I am facing a problem in my Reducer class. Here is the code:
Mapper Class
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class alicanteMapperC extends
Mapper<LongWritable, Text, Text, IntWritable> {
String Customer = new String();
SimpleDateFormat ft = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date t = new Date();
IntWritable Consumption = new IntWritable();
int counter = 0;
//new vars
int max=0;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
Date d2 = null;
try {
d2 = ft.parse("2013-07-01 01:00:00");
} catch (ParseException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
if (counter > 0) {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line, ",");
while (itr.hasMoreTokens()) {
Customer = itr.nextToken();
try {
t = ft.parse(itr.nextToken());
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Consumption.set(Integer.parseInt(itr.nextToken()));
}
if (t.compareTo(d2) == 0) {
context.write(new Text(Customer), Consumption);
}
}
counter++;
}
}
Reducer class
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class alicanteReducerC extends
Reducer<Text, IntWritable, Text, IntWritable> {
IntWritable maximum = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int max = 0;
for (IntWritable val : values) {
if (val.get() > max) {
max = val.get();
}
}
for (IntWritable val : values) {
if (val.get() == max) {
context.write(key, val);
}
}
}
}
Do you have any idea why the reducer won't write to the output file? In other words, why doesn't the second for loop work?
EDIT
In my mapper class I find the customers on a specific date, and thus their consumption, and I pass these values to the reducer class.
In the reducer class I want to find the max consumption and the customer associated with that consumption.
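For what it's worth, a likely culprit: in Hadoop's reducer, the values passed to reduce() are streamed from disk and can only be iterated once, so the second for loop starts from an already-exhausted iterator and its body never runs. A minimal sketch that caches the values on the first pass (the cached list is my addition; it copies the int out of each IntWritable because Hadoop reuses the writable instance between iterations):
public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    int max = 0;
    java.util.List<Integer> cached = new java.util.ArrayList<Integer>();
    for (IntWritable val : values) {
        cached.add(val.get()); // copy the primitive; the IntWritable object is reused
        if (val.get() > max) {
            max = val.get();
        }
    }
    for (int v : cached) { // iterate the cached copy, not the spent iterable
        if (v == max) {
            context.write(key, new IntWritable(v));
        }
    }
}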

Could not find or load main class when having multiple files

I have classes A, B, and Main.
Class A uses class B, and Main uses class A.
I want to run these files from the command line.
I did:
javac *.java
java Main
And then I get Error: Could not find or load main class Main.
Code:
Location.java
package hw3;
public class Location {
private int objectLength;
private long byteLocation;
public Location(int freshLength, long freshLocation)
{
objectLength = freshLength;
byteLocation = freshLocation;
}
public int getLength()
{
return objectLength;
}
public long getLocation()
{
return byteLocation;
}
}
FileMap.java
package hw3;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.Serializable;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
public class FileMap<K, V extends Serializable> implements Map<K, V>{
private HashMap<K, Location> database; // Used to hold a key and the value location and length
File fp;
RandomAccessFile s;
public FileMap(String filename) throws FileNotFoundException
{
database = new HashMap<K, Location>();
fp = new File(filename);
s = new RandomAccessFile(fp, "rws");
}
public void closeFile()
{
try {
s.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public byte[] serialize(V value) throws IOException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(baos);
oos.writeObject(value);
return baos.toByteArray();
}
@SuppressWarnings("unchecked")
public V deserialize(byte[] byteArray) throws IOException, ClassNotFoundException
{
ByteArrayInputStream bais = new ByteArrayInputStream(byteArray);
ObjectInputStream ois = new ObjectInputStream(bais);
return (V)ois.readObject();
}
@Override
public void clear() {
// TODO Auto-generated method stub
}
@Override
public boolean containsKey(Object key) {
if(database.containsKey(key)) return true;
return false;
}
@Override
public boolean containsValue(Object value) {
// TODO Auto-generated method stub
return false;
}
@Override
public Set<java.util.Map.Entry<K, V>> entrySet() {
// TODO Auto-generated method stub
return null;
}
@Override
public V get(Object key) {
if(database.containsKey(key))
{
try
{
byte[] bar = new byte[database.get(key).getLength()]; // Create a byteArray long enough to hold the object
s.seek((int)database.get(key).getLocation()); // Move file pointer to saved value location
s.read(bar, 0, database.get(key).getLength()); // Read object
try
{
return deserialize(bar); // "Un-flatten" object and return it
}
catch (ClassNotFoundException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
catch (IOException e)
{
System.err.println("Couldn't read file");
e.printStackTrace();
}
}
return null;
}
@Override
public boolean isEmpty() {
if(database.isEmpty()) return true;
return false;
}
@Override
public Set<K> keySet() {
// TODO Auto-generated method stub
return null;
}
@Override
public V put(K key, V value) {
if(!database.containsKey(key))
{
try
{
byte[] ba = serialize(value); // "flatten" object to byte array
s.seek(s.length()); // Go to end of file
s.write(ba); // Write byte array to end of file
database.put(key, new Location(ba.length,s.length()-ba.length)); // Save key in internal key-location map
} catch (IOException e) {
e.printStackTrace();
}
}
else
{
System.out.println("Key already exists");
}
return null;
}
@Override
public void putAll(Map<? extends K, ? extends V> m) {
// TODO Auto-generated method stub
}
@Override
public V remove(Object key) {
// TODO Auto-generated method stub
return null;
}
@Override
public int size() {
return database.size();
}
@Override
public Collection<V> values() {
// TODO Auto-generated method stub
return null;
}
}
Main.java
package hw3;
import java.io.FileNotFoundException;
public class Main {
public static void main(String[] args) {
String fileName = "/home/rippxe/Documents/S2/Java/hw3/randFile.bin";
Integer x1 = new Integer(152), x2 = new Integer(485), x3 = new Integer(825);
String str1 = "bob",str2 = "john",str3 = "steve";
FileMap<Integer, String> fm = null;
try
{
fm = new FileMap<Integer, String>(fileName);
}
catch(FileNotFoundException fnfe)
{
System.err.println("File not found");
}
fm.put(x1, str1);
fm.put(x2, str2);
fm.put(x3, str3);
String new1 = fm.get(x1);
String new2 = fm.get(x2);
String new3 = fm.get(x3);
System.out.println(new1 + " " + x1 + "\n"+ new2 + " " +x2 +"\n"+ new3 + " " +x3);
fm.closeFile();
}
}
I have added the code of my classes, which you can see above.
Thanks
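A note on the likely cause, in case it helps: every class here declares package hw3;, so after javac *.java the JVM needs to find hw3/Main.class relative to the classpath, and java Main (without the package prefix) cannot load it. A sketch of commands that should work from the source directory:
javac -d . *.java    # -d . places the .class files under ./hw3/
java hw3.Main        # run using the fully qualified class name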

How to access a variable from another class in the same package in Java

I want to access the class Totalnoofwords in the class newrepeatedcount.
I want to print the "a" of class Totalnoofwords, merging it with
System.out.println("( "+file1.getName() +" )-" +"Total words counted:"+total);
in class newrepeatedcount,
so I can run both pieces of code and get System.out.println("( "+file1.getName() +" )-" +" Total no of words=" + a +"Total repeated words counted:"+total);
Here is a snippet of one output I want:
( filenameBlog 39.txt )-Total no of words=83,total repeated words counted:4
Any suggestions welcomed.
I am a beginner at Java.
Here are my two classes below. :)
Totalnoofwords.java
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import org.apache.commons.io.FileUtils;
public class Totalnoofwords
{
public static void main(String[] args)
{
FilenameFilter filter = new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".txt");
}
};
File folder = new File("E:\\testfolder");
File[] listOfFiles = folder.listFiles(filter);
for (int i = 0; i < listOfFiles.length; i++) {
File file1 = listOfFiles[i];
try {
String content = FileUtils.readFileToString(file1);
} catch (IOException e) {
e.printStackTrace();
}
BufferedReader ins = null;
try {
ins = new BufferedReader (
new InputStreamReader(
new FileInputStream(file1)));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
String line = "", str = "";
int a = 0;
int b = 0;
try {
while ((line = ins.readLine()) != null) {
str += line + " ";
b++;
}
} catch (IOException e) {
e.printStackTrace();
}
StringTokenizer st = new StringTokenizer(str);
while (st.hasMoreTokens()) {
String s = st.nextToken();
a++;
}
System.out.println(" Total no of words=" + a );
}
}
}
newrepeatedcount.java
package ramki;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
public class newrepeatedcount {
public static void main(String[] args){
FilenameFilter filter = new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".txt");
}
};
File folder = new File("E:\\testfolder\\");
File[] listOfFiles = folder.listFiles(filter);
for (int i = 0; i < listOfFiles.length; i++) {
File file1 = listOfFiles[i];
try {
String content = FileUtils.readFileToString(file1);
} catch (IOException e) {
e.printStackTrace();
}
BufferedReader ins = null;
try {
ins = new BufferedReader ( new InputStreamReader(new FileInputStream(file1)));
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String st = null;
try {
st = IOUtils.toString(ins);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//split text to array of words
String[] words=st.split("\\s");
//frequency array
int[] fr=new int[words.length];
//init frequency array
for(int i1=0;i1<fr.length;i1++)
fr[i1]=-1;
//count words frequency
for(int i1=0;i1<words.length;i1++){
for(int j=0;j<words.length;j++){
if(words[i1].equals(words[j]))
{
fr[i1]++;
}
}
}
//clean duplicates
for(int i1=0;i1<words.length;i1++){
for(int j=0;j<words.length;j++){
if(words[i1].equals(words[j]))
{
if(i1!=j) words[i1]="";
}
}
}
//show the output
int total=0;
//System.out.println("Duplicate words:");
for(int i1=0;i1<words.length;i1++){
if(words[i1]!=""){
//System.out.println(words[i1]+"="+fr[i1]);
total+=fr[i1];
}
}
//System.out.println("Total words counted: "+total);
//System.out.println("Total no of repeated words : "+total+" ");
System.out.println("( "+file1.getName() +" )-" +"Total repeated words counted:"+total);
}
}}
I tried to put both pieces of code into a single class,
but only one of the variables works in
System.out.println("( "+file1.getName() +" )-" +" Total no of words=" + a +"Total repeated words counted:"+total);
When I run it, either "a" or "total" doesn't work, and vice versa if I change the order of the code (variables).
Can anyone tell me how I can get both variables in the output? :)
Here is my updated code below.
package ramki;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
public class newrepeatedcount {
public static void main(String[] args){
FilenameFilter filter = new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".txt");
}
};
File folder = new File("E:\\testfolder\\");
File[] listOfFiles = folder.listFiles(filter);
for (int i = 0; i < listOfFiles.length; i++) {
File file1 = listOfFiles[i];
try {
String content = FileUtils.readFileToString(file1);
} catch (IOException e) {
e.printStackTrace();
}
BufferedReader ins = null;
try {
ins = new BufferedReader ( new InputStreamReader(new FileInputStream(file1)));
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String line = "", str = "";
String st = null;
try {
st = IOUtils.toString(ins);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//split text to array of words
String[] words=st.split("\\s");
//frequency array
int[] fr=new int[words.length];
//init frequency array
for(int i1=0;i1<fr.length;i1++)
fr[i1]=-1;
//count words frequency
for(int i1=0;i1<words.length;i1++){
for(int j=0;j<words.length;j++){
if(words[i1].equals(words[j]))
{
fr[i1]++;
}
}
}
//clean duplicates
for(int i1=0;i1<words.length;i1++){
for(int j=0;j<words.length;j++){
if(words[i1].equals(words[j]))
{
if(i1!=j) words[i1]="";
}
}
}
int a = 0;
try {
while ((line = ins.readLine()) != null) {
str += line + " ";
}
} catch (IOException e) {
e.printStackTrace();
}
StringTokenizer st1 = new StringTokenizer(str);
while (st1.hasMoreTokens()) {
String s = st1.nextToken();
a++;
}
int total=0;
for(int i1=0;i1<words.length;i1++){
if(words[i1]!=""){
//System.out.println(words[i1]+"="+fr[i1]);
total+=fr[i1];
}
}
System.out.println("( "+file1.getName() +" )-" +"Total repeated words counted:"+total+","+"total no of words:"+a);
// System.out.println("total no of words:"+a);
}
}}
Local variables inside the main method cannot be accessed from another class.
So you can modify your Totalnoofwords.java to something like:
package Packagename;
public class Totalnoofwords
{
static int a = 1;
public void somename(){
Totalnoofwords A=new Totalnoofwords();
A.a+=5;
System.out.println("a"+A.a);
}
}
and your newrepeatedcount.java would look like:
package Packagename;
public class newrepeatedcount {
public static void main(String[] args){
Totalnoofwords B=new Totalnoofwords();
B.somename();
System.out.println("a:"+B.a);
}
}
It looks like you have two main methods in the same package. I'm not sure if you wanted it this way or not, but this won't work because you're not overloading the methods: currently you have public static void main(String[] args) in one class, and the other class would have to accept different arguments, e.g. public static void main(String[] args1, String[] args2).
Also, in order to access your second class, as stated above, you would use something like
Totalnoofwords totalNoofWords = new Totalnoofwords();
totalNoofWords.accessSomething();
But this won't work, because you don't have a constructor.
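Alternatively, a sketch of a cleaner route, assuming the goal is just the per-file word count (the method name countWords is my own invention): move the counting loop out of main into an instance method that returns the count, then call it from newrepeatedcount.
package ramki;
import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.io.FileUtils;
public class Totalnoofwords {
    // Returns the number of whitespace-separated words in one file.
    public int countWords(File file) throws IOException {
        StringTokenizer st = new StringTokenizer(FileUtils.readFileToString(file));
        int a = 0;
        while (st.hasMoreTokens()) {
            st.nextToken();
            a++;
        }
        return a;
    }
}
Then, inside the loop in newrepeatedcount:
int a = new Totalnoofwords().countWords(file1);
System.out.println("( " + file1.getName() + " )-" + " Total no of words=" + a
        + "Total repeated words counted:" + total);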

MapReduce: String index out of Range

I have a problem with my MapReduce code. It says my string index is out of range, but the strings are definitely long enough.
Does anyone have any suggestions? Thanks!
This is my code:
package Test;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class TestMapper extends Mapper<Text,Text,IntWritable,IntWritable> {
private IntWritable date_int = null;
private IntWritable amount_int = null;
public void map(Text key, Text value, Context context)
throws IOException, InterruptedException {
String date_str = value.toString().substring(4,5);
String amount_str = value.toString().substring(7,8);
date_int = new IntWritable(Integer.parseInt(date_str));
amount_int = new IntWritable(Integer.parseInt(amount_str));
// Collect the results
context.write(date_int, amount_int);
}
}
package Test;
import java.io.IOException;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
//import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
//import org.apache.hadoop.mapreduce.Reducer.Context;
public class TestReducer extends Reducer<IntWritable, IntWritable,
IntWritable, FloatWritable> {
public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
float sum = 0;
int count = 0;
for (IntWritable val : values) { sum +=val.get();
count +=1;
}
float result = sum / count;
context.write(key, new FloatWritable(result));
}
}
package Test;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class TestDriver extends Configured implements Tool {
private final static Logger log = Logger.getLogger(TestDriver.class.getName());
public static void main(String[] args) {
int res = 1; // if this stays 1, the job did not finish correctly
try {
res = ToolRunner.run(new Configuration(), new TestDriver(), args);
} catch (Exception e) {
log.log(Level.SEVERE, "Fehler beim Ausfuehren des Jobs!");
e.printStackTrace();
}
System.exit(res);
}
@Override
public int run(String[] args) {
log.log(Level.INFO, "Start Map-Reduce-Job 'TestDriver'... ");
Configuration conf = this.getConf();
Job job = null;
try {
job = Job.getInstance(conf);
} catch (IOException e1) {
log.log(Level.SEVERE, "Fehler bei Instanziierung des Jobs!");
e1.printStackTrace();
}
job.setJarByClass(TestDriver.class);
job.setMapperClass(TestMapper.class);
job.setReducerClass(TestReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(FloatWritable.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
try {
FileInputFormat.addInputPath(job, new Path(args[0]));
} catch (IllegalArgumentException e) {
log.log(Level.SEVERE, "Fehler (Argument) beim Setzen des Eingabepfades!");
e.printStackTrace();
} catch (IOException e) {
log.log(Level.SEVERE, "Fehler (IO) beim Setzen des Eingabepfades!");
e.printStackTrace();
}
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean result = false;
try {
result = job.waitForCompletion(true);
} catch (ClassNotFoundException e) {
log.log(Level.SEVERE, "Fehler (ClassNotFound) beim Ausfuehren des Jobs!");
e.printStackTrace();
} catch (IOException e) {
log.log(Level.SEVERE, "Fehler (IOException) beim Ausfuehren des Jobs!");
e.printStackTrace();
} catch (InterruptedException e) {
log.log(Level.SEVERE, "Fehler (Interrupted) beim Ausfuehren des Jobs!");
e.printStackTrace();
}
log.log(Level.INFO, "Fertig!");
return result ? 0 : 1;
}
}
And this is the Error Message:
java.lang.Exception: java.lang.StringIndexOutOfBoundsException: String index out of range: 5
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:522)
My input file is a text file, and it looks like this:
200912024
2009120420
2009120750
200912083
2009120912
2009121066
2009121170
2009121225
2009121430
2009121560
2009121621
2009121722
2009121818
2009122112
2009122213
Thanks!
It is because the input value to your mapper is empty.
You are using
job.setInputFormatClass( KeyValueTextInputFormat.class );
In KeyValueTextInputFormat, each line is divided into key and value parts by a separator byte (a tab by default). If no such byte exists, the key will be the entire line and the value will be empty. Your input lines contain no separator, so the whole line becomes the key, value.toString() is the empty string, and substring(4,5) on an empty string throws exactly "String index out of range: 5".
Please see the class documentation for KeyValueTextInputFormat.
So, if you change your input format to the default:
job.setInputFormatClass( TextInputFormat.class );
And your mapper to:
import java.io.IOException;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
public class TestMapper extends
Mapper<LongWritable,Text,IntWritable,IntWritable> {
private IntWritable date_int = new IntWritable();
private IntWritable amount_int = new IntWritable();
/**
* @param key - Line offset - ignored.
* @param value - Value to process.
* @param context - MapperContext object for accessing output, configuration information, etc.
* @throws IOException, InterruptedException.
*/
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String date_str = value.toString().substring(4,5);
String amount_str = value.toString().substring(7,8);
int date = Integer.parseInt(date_str);
date_int.set(date);
int amount = Integer.parseInt(amount_str);
amount_int.set(amount);
// Collect the results
context.write(date_int, amount_int);
}
}
It should work.
Good luck!

How do I periodically flush rows to Excel using jxl, while closing only at the end

Below is my code.
I first create the headers in the init method.
Then in pushDataRows I fill in the rows.
The problem is that once I write and flush in the init method, nothing else makes it into my Excel sheet.
The number of rows I will be writing to Excel could be huge; the idea of using flush is to free the memory periodically.
Please tell me what mistake I am making here. How do I achieve what I intend?
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import jxl.Workbook;
import jxl.WorkbookSettings;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
public class ExportTranscriptDetailsToExcel implements IExportToFormat {
private static final ILogger logger = ILabsPlatform.getLogger(ExportTranscriptDetailsToExcel.class.getName());
OutputStream outputStream;
List<String> labels;
WritableWorkbook workbook;
WritableSheet sheet0;
public ExportTranscriptDetailsToExcel() {
this.outputStream = null;
workbook = null;
sheet0 = null;
}
@Override
public void init(String sheetName, List<String> labels, OutputStream outputStream) throws IOException,
RowsExceededException, WriteException {
this.outputStream = outputStream;
this.labels = labels;
WorkbookSettings workbookSettings = new WorkbookSettings();
workbookSettings.setEncoding("Cp1252");
workbook = Workbook.createWorkbook(outputStream, workbookSettings);
sheet0 = workbook.createSheet(sheetName, 0);
for (int i = 0; i < labels.size(); ++i) {
Label label = new Label(i, 0, labels.get(i));
sheet0.addCell(label);
}
workbook.write();
outputStream.flush();
}
@Override
public void pushDataRows(Object listOfResultRow) {
if ((listOfResultRow == null)) {
return;
}
String fieldName = null;
String fieldValue = null;
@SuppressWarnings("unchecked")
ArrayList<Map<String, Object>> interactionMap = (ArrayList<Map<String, Object>>) listOfResultRow;
try {
int i = 1;// the data rows starts from row1
for (Map<String, Object> element : interactionMap) {
for (int j = 0; j < labels.size(); j++) {
fieldName = labels.get(j);
Object ob = element.get(fieldName);
if (ob != null) {
fieldValue = ob.toString();
}
if (fieldValue == null) {
fieldValue = "-";
}
System.out.println("***********************fieldName:" + fieldName);
System.out.println("***********************fieldValue:" + fieldValue);
Label label1 = new Label(j, i, fieldValue);
fieldValue = null;
sheet0.addCell(label1);
}
i++;
}
} catch (Exception e) {
System.out.println(e.getMessage());
}
try {
workbook.write();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
outputStream.flush();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public void done() {
try {
workbook.close();
} catch (WriteException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
As suggested here:
To get around the memory problem, you can signal jxl to use temporary files when writing. This will write data to a temporary file during execution rather than storing it in memory.
You need to adjust your WorkbookSettings:
workbookSettings.setUseTemporaryFileDuringWrite(true);
workbookSettings.setTemporaryFileDuringWriteDirectory(new File("your_temporary_directory"));
Replace your_temporary_directory above with a temporary directory you prefer.
Also note that this feature is available in jxl versions >= 2.6.9.
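For clarity, a sketch of how those two calls would slot into the init method from the question (the directory is a placeholder, and java.io.File must be imported):
WorkbookSettings workbookSettings = new WorkbookSettings();
workbookSettings.setEncoding("Cp1252");
// Spill sheet data to a temp file while writing instead of holding it all in memory (jxl >= 2.6.9).
workbookSettings.setUseTemporaryFileDuringWrite(true);
workbookSettings.setTemporaryFileDuringWriteDirectory(new File("/tmp")); // placeholder directory
workbook = Workbook.createWorkbook(outputStream, workbookSettings);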
I think I have found the problem with my code. This is just from trial and error, and I would certainly need someone to tell me if I am correct.
In the init method, if I only do outputStream.flush() then I don't see the problem. I think doing a workbook.write() effectively closes the workbook for any further writing.
So basically, do outputStream.flush() every time you want to flush memory, and do workbook.write() and workbook.close() only at the very end.
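Putting both observations together, a minimal self-contained sketch of the intended pattern (the class name JxlFlushDemo, the file name demo.xls, and the row count are all made up): add cells as data arrives, and call workbook.write() and workbook.close() exactly once, at the end.
import java.io.FileOutputStream;
import java.io.OutputStream;
import jxl.Workbook;
import jxl.WorkbookSettings;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
public class JxlFlushDemo {
    public static void main(String[] args) throws Exception {
        OutputStream out = new FileOutputStream("demo.xls"); // made-up file name
        WorkbookSettings settings = new WorkbookSettings();
        settings.setEncoding("Cp1252");
        WritableWorkbook workbook = Workbook.createWorkbook(out, settings);
        WritableSheet sheet = workbook.createSheet("data", 0);
        for (int row = 0; row < 10000; row++) { // made-up row count
            // Cells accumulate in the sheet; no workbook.write() inside the loop.
            sheet.addCell(new Label(0, row, "row " + row));
        }
        workbook.write(); // single write at the very end...
        workbook.close(); // ...then close, which also flushes the stream
        out.close();
    }
}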
