Hadoop MapReduce - Access Local File System - Java

I have a requirement wherein the MapReduce code should read the local file system on each node. The program will run on HDFS, and I cannot change the file system property for Hadoop in the XML configuration files.
I have tried the following approaches, but none of them worked.
Approach 1
Configuration config = new Configuration();
config.set("fs.defaultFS", "file:///");
FileSystem localFileSystem = FileSystem.get(config);
BufferedReader bufferRedaer = new BufferedReader(new InputStreamReader(localFileSystem.open(new Path("/user/input/localFile"))));
Approach 2
Configuration config = new Configuration();
LocalFileSystem localFileSystem = FileSystem.getLocal(config);
BufferedReader bufferRedaer = new BufferedReader(new InputStreamReader(localFileSystem.open(new Path("/user/input/localFile"))));
Approach 3
Configuration config = new Configuration();
config.set("fs.defaultFS", "file:///");
LocalFileSystem localFileSystem = FileSystem.getLocal(config);
BufferedReader bufferRedaer = new BufferedReader(new InputStreamReader(localFileSystem.open(new Path("/user/input/localFile"))));
Approach 4
Configuration config = new Configuration();
LocalFileSystem localFileSystem = FileSystem.getLocal(config);
BufferedReader bufferRedaer = new BufferedReader(new InputStreamReader(localFileSystem.getRaw().open(new Path("/user/input/localFile"))));
This did not work either: Reading HDFS and local files in Java
Each of them gave the error: No such file exists
Error Stack
attempt_201406050021_0018_m_000000_2: java.io.FileNotFoundException: File /home/cloudera/sftp/id_rsa does not exist
attempt_201406050021_0018_m_000000_2: at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:468)
attempt_201406050021_0018_m_000000_2: at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:380)
attempt_201406050021_0018_m_000000_2: at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:231)
attempt_201406050021_0018_m_000000_2: at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:183)
attempt_201406050021_0018_m_000000_2: at org.apache.hadoop.fs.LocalFileSystem.copyFromLocalFile(LocalFileSystem.java:81)
attempt_201406050021_0018_m_000000_2: at org.apache.hadoop.fs.FileSystem.copyFromLocalFile(FileSystem.java:1934)
attempt_201406050021_0018_m_000000_2: at com.skanda.ecomm.sftp.FTPMapper.configure(FTPMapper.java:91)
I am hoping to get a positive solution here. Let me know where I am going wrong.
Main class (Driver class)
/*
* #SFTPClient.java #May 20, 2014
*
*
*/
package com.skanda.ecomm.sftp;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
*
* <p>
* SFTPClient Class
* </p>
*
* @author skanda
* @version 1.0
*
*/
public class SFTPClient extends Configured implements Tool {
public int run(String[] args) throws Exception {
Configuration config = getConf();
String inputPath = config.get(ApplicationConstants.INPUT_PATH);
String outputPath = config.get(ApplicationConstants.OUTPUT_PATH);
String configPath = config.get(ApplicationConstants.CONFIG_PATH);
int reducers = Integer.parseInt(config.get(ApplicationConstants.REDUCERS));
if(outputPath == null || inputPath == null || configPath == null) {
throw new Exception("Usage: \n" + "-D configPath=<configPath> -D inputPath=<inputPath> -D reducers=<reducers> " +
"-D outputPath=<path>");
}
JobConf conf = new JobConf(SFTPClient.class);
conf.setJobName("SFTP Injection client");
DistributedCache.addCacheFile(new URI(configPath),conf);
conf.setMapperClass(FTPMapper.class);
conf.setReducerClass(FTPReducer.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(IntWritable.class);
// configuration should contain reference to your namenode
FileSystem fs = FileSystem.get(new Configuration());
fs.delete(new Path(outputPath), true); // true stands for recursively, deleting the folder you gave
conf.setStrings(ApplicationConstants.INPUT_PATH, inputPath);
conf.setStrings(ApplicationConstants.OUTPUT_PATH, outputPath);
FileInputFormat.setInputPaths(conf, new Path(inputPath));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
conf.setNumReduceTasks(reducers);
conf.setInt(ApplicationConstants.NUNBER_OF_REDUCERS, reducers);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new SFTPClient(), args);
System.exit(exitCode);
}
}
Mapper
/*
* #FTPMapper.java #May 20, 2014
*
*
*/
package com.skanda.ecomm.sftp;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.InetAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.ftp.mapreduce.CommonUtility;
import com.ftp.mapreduce.RetrieveFileNames;
import com.jcraft.jsch.hm.Channel;
/**
*
* <p>
* FTP Mapper Class
* </p>
*
* @author skanda
* @version 1.0
*
*/
@SuppressWarnings("unused")
public class FTPMapper extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, Text> {
private URI[] localFiles;
private String userName;
private String hostName;
private String folderPath;
private int reducers;
private byte[] pvtKey;
private String fileName;
private String startDate;
private String endDate;
private String sshKeyPath;
private String password;
public void configure(JobConf job) {
Properties properties = new Properties();
try {
localFiles = DistributedCache.getCacheFiles(job);
if (localFiles != null && localFiles.length == 1) {
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(localFiles[0], conf);
BufferedReader bufferRedaer=new BufferedReader(new InputStreamReader(fileSystem.open(new Path(localFiles[0]))));
properties.load(bufferRedaer);
userName = properties.getProperty(ApplicationConstants.USER_NAME);
reducers = job.getInt(ApplicationConstants.NUNBER_OF_REDUCERS, 30);
hostName = properties.getProperty(ApplicationConstants.SFTP_SERVER_HOST);
folderPath = properties.getProperty(ApplicationConstants.HOSTFILE_DIRECTORY_PATH);
fileName = properties.getProperty(ApplicationConstants.FILE_NAME_PATTERN);
startDate = properties.getProperty(ApplicationConstants.FILE_START_DATE);
endDate = properties.getProperty(ApplicationConstants.FILE_END_DATE);
sshKeyPath = properties.getProperty(ApplicationConstants.SSH_KEY_PATH);
password = properties.getProperty(ApplicationConstants.PASSWORD);
System.out.println("--------------------------------------------------");
/*FileSystem fs = FileSystem.getLocal(conf);
//Path inputPath = fs.makeQualified(new Path(sshKeyPath));
String inputPath = new Path("file:///home/cloudera/"+sshKeyPath).toUri().getPath();
fs.copyFromLocalFile(new Path(inputPath), new Path("outputSFTP/idFile") );*/
try{
Configuration conf1 = new Configuration();
Path pt = new Path("file:///home/cloudera/.ssh/id_rsa");
FileSystem fs = FileSystem.get( new URI("file:///home/cloudera/.ssh/id_rsa"), conf);
LocalFileSystem localFileSystem = fs.getLocal(conf1);
BufferedReader bufferRedaer1 = new BufferedReader(new InputStreamReader(localFileSystem.open(pt)));
String str = null;
while ((str = bufferRedaer1.readLine())!= null)
{
System.out.println("-----------");
System.out.println(str);
}
}catch(Exception e){
System.out.println("failed again");
String computername=InetAddress.getLocalHost().getHostName();
System.out.println(computername);
e.printStackTrace();
}
System.out.println("--------------------------------------------------");
Configuration config = new Configuration();
config.set("fs.defaultFS", "file:////");
LocalFileSystem localFileSystem = FileSystem.getLocal(config);
bufferRedaer = new BufferedReader(new InputStreamReader(localFileSystem.open(new Path(sshKeyPath))));
/*Configuration config = new Configuration();
//config.set("fs.defaultFS", "file:///home/cloudera/.ssh/id_rsa");
LocalFileSystem fileSystm = FileSystem.getLocal(config);
Path path = fileSystm.makeQualified(new Path("/home/cloudera/.ssh/id_rsa"));*/
//FileInputFormat.setInputPaths(job, path);
//bufferRedaer = new BufferedReader(new InputStreamReader(fileSystem.open(path)));
String key = "";
try {
String line = "";
while ((line = bufferRedaer.readLine()) != null) {
key += line + "\n";
}
pvtKey = key.getBytes();
} catch(Exception e){
e.printStackTrace();
} finally {
//fileSystem.close();
//bufferRedaer.close();
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void map(LongWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter)
throws IOException {
List<String> filterFileNamesList = new ArrayList<String>();
Channel channel = CommonUtility.connectSFTP(userName, hostName, pvtKey);
Map<String, String> fileNamesMap = CommonUtility.getFileNames(channel, folderPath);
List<String> filterFileNameList_output = RetrieveFileNames.FILTER_BY_NAME.retrieveFileNames(fileNamesMap, filterFileNamesList,
fileName, startDate, endDate);
for (int i = 0; i < filterFileNameList_output.size(); i++) {
int keyGroup = i % reducers;
output.collect(new IntWritable(keyGroup), new Text(filterFileNameList_output.get(i)));
}
}
}

This code works for me when the program runs on HDFS and my txt file is at this location:
/home/Rishi/Documents/RishiFile/r.txt
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
public class HadoopRead {
public static void main(String[] args) {
try{
Configuration conf = new Configuration();
Path pt = new Path("/home/Rishi/Documents/RishiFile/r.txt");
FileSystem fs = FileSystem.get( new URI("/home/Rishi/Documents/RishiFile"), conf);
LocalFileSystem localFileSystem = fs.getLocal(conf);
BufferedReader bufferRedaer = new BufferedReader(new InputStreamReader(localFileSystem.open(pt)));
String str = null;
while ((str = bufferRedaer.readLine())!= null)
{
System.out.println("-----------");
System.out.println(str);
}
}catch(Exception e){
e.printStackTrace();
}
}
}
Word count example for reading a local file on HDFS.
My main class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FileDriver extends Configured implements Tool {
public static void main(String[] args) {
try{
ToolRunner.run(new Configuration(), new FileDriver(), args);
System.exit(0);
}catch(Exception e){
e.printStackTrace();
}
}
public int run(String[] arg0) throws Exception {
Configuration conf = new Configuration();
Path pt = new Path("file:///home/winoria/Documents/Ri/r");
Job job = new Job(conf, "new Job");
job.setJarByClass(FileDriver.class);
job.setMapperClass(FileMapper.class);
job.setReducerClass(FileReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, pt);
FileSystem.get(job.getConfiguration()).delete(new Path("Output2"), true);
FileOutputFormat.setOutputPath(job, new Path("Output2"));
job.waitForCompletion(true);
return 0;
}
}
Mapper class:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FileMapper extends Mapper<LongWritable, Text, Text, Text> {
protected void map(LongWritable key, Text value,Context context) throws java.io.IOException ,InterruptedException {
String str[] = value.toString().split(" ");
for(int i =0; i<str.length;i++){
context.write(new Text(str[i]), new Text());
}
};
}
Reducer Class:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FileReducer extends Reducer<Text, Text, Text, Text> {
protected void reduce(Text key,Iterable<Text> value,Context context) throws java.io.IOException ,InterruptedException {
int count=0;
for (Text text : value) {
count++;
}
context.write(key, new Text(count+""));
};
}
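Tying this back to the original question's FTPMapper: below is a minimal sketch of configure() doing only the local read, assuming the key file actually exists at that path on every node that runs a map task (the FileNotFoundException in the stack trace above suggests it was missing on the node that ran the attempt). The path is the example one from the question; getLocal() ignores fs.defaultFS, so no XML changes are needed.
public void configure(JobConf job) {
    try {
        // getLocal() always returns the node-local file system, regardless of fs.defaultFS
        LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
        Path keyPath = new Path("file:///home/cloudera/.ssh/id_rsa"); // example path from the question
        BufferedReader reader = new BufferedReader(new InputStreamReader(localFs.open(keyPath)));
        StringBuilder key = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            key.append(line).append("\n");
        }
        reader.close();
        pvtKey = key.toString().getBytes();
    } catch (IOException e) {
        e.printStackTrace();
    }
}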

Related

Can't access HashMap in mapper, MapReduce

I'd like to replace values of the input data in my mapper, using dictionaries (CSV) defined in another file. So I tried to load the CSV data into a HashMap and refer to it in the mapper.
The Java code and CSV below are a simplified version of my program. This code works in my local environment (Mac OS X, pseudo-distributed mode), but doesn't in my EC2 instance (Ubuntu, pseudo-distributed mode).
In detail, I got this stdout during the run:
cat:4
human:2
flamingo:1
This means the file reader successfully put the CSV data into the HashMap.
However, the mapper mapped nothing, so I got empty output in the EC2 environment, although locally it mapped 3 * (the number of lines of the input file) elements and generated the following:
test,cat
test,flamingo
test,human
Does anyone have answers or hints?
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void get_list(String list_path){
try {
FileReader fr = new FileReader(list_path);
BufferedReader br = new BufferedReader(fr);
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
get_list(args[2]);
Job job = Job.getInstance(conf, "test");
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
list.csv (args[2])
name,legs
cat,4
human,2
flamingo,1
=================================
I referred to @Rahul Sharma's answer and modified my code as below. Now the code works in both environments.
Thank you very much @Rahul Sharma and @Serhiy for your precise answer and useful comments.
Test.java
import java.io.IOException;
import java.util.StringTokenizer;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.DataInput;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.net.URI;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.io.WritableUtils;
public class Test {
public static HashMap<String, Integer> map = new HashMap<String, Integer>();
public static class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] files = context.getCacheFiles();
Path list_path = new Path(files[0]);
try {
FileSystem fs = list_path.getFileSystem(context.getConfiguration());
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(list_path)));
String line = null, name = null;
int leg = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("name") && !line.trim().isEmpty()) {
String[] name_leg = line.split(",", 0);
name = name_leg[0];
leg = Integer.parseInt(name_leg[1]);
map.put(name, leg);
}
}
br.close();
}
catch(IOException ex) {
System.err.println(ex.getMessage());
ex.printStackTrace();
}
for(Map.Entry<String, Integer> e : map.entrySet()) {
System.out.println(e.getKey() + ":" + e.getValue());
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for(Map.Entry<String, Integer> e : map.entrySet()) {
context.write(new Text(e.getKey()), new Text("test"));
}
}
}
public static class Reducer1 extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
context.write(new Text("test"), key);
}
}
// Writer
public static class CommaTextOutputFormat extends TextOutputFormat<Text, Text> {
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
String extension = ".txt";
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<Text, Text>(fileOut, ",");
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
if (args.length != 3) {
System.err.println(
"Need 3 arguments: <input dir> <output base dir> <list path>");
System.exit(1);
}
Job job = Job.getInstance(conf, "test");
job.addCacheFile(new Path(args[2]).toUri());
job.setJarByClass(Test.class);
job.setMapperClass(Mapper1.class);
job.setReducerClass(Reducer1.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(TextInputFormat.class);
// mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// formatter
job.setOutputFormatClass(CommaTextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
if(!job.waitForCompletion(true)){
System.exit(1);
}
System.out.println("All Finished");
System.exit(0);
}
}
First, you need to learn more about the MapReduce framework.
Your program behaves as expected in local mode because the mapper, reducer, and job are launched in the same JVM. In pseudo-distributed or fully distributed mode, separate JVMs are allocated for each component, so the values you put into the HashMap using get_list are not visible to the mapper and reducer, which run in their own JVMs.
Use the distributed cache to make it work in cluster mode.
In the job's main class, add the file to the distributed cache:
JobConf job = new JobConf();
DistributedCache.addCacheArchive(new URI(args[2]), job);
Access the file in the mapper or reducer:
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.getLocal(conf);
Path[] dataFile = DistributedCache.getLocalCacheFiles(conf);
BufferedReader cacheReader = new BufferedReader(new InputStreamReader(fs.open(dataFile[0])));
// Implement here get_list method functionality
}
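For comparison, the newer mapreduce API does the same thing through the Job and the task Context, which is exactly what the revised Test.java above uses. A sketch of just the relevant pieces:
// Driver: register the file on the Job (Hadoop 2.x API)
Job job = Job.getInstance(conf, "test");
job.addCacheFile(new Path(args[2]).toUri());
// Mapper or reducer: read it back in setup()
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    URI[] cacheFiles = context.getCacheFiles();
    Path listPath = new Path(cacheFiles[0]);
    FileSystem fs = listPath.getFileSystem(context.getConfiguration());
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(listPath)));
    // populate the HashMap here, exactly as get_list() does
    br.close();
}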

Sequence File created gives strange output in hadoop

I want to combine several small bzip2 files into a sequence file. I saw some code to create a sequence file and tried it, but it gives strange output, as below. Is this because it is unable to read bzip2 files?
SEQorg.apache.hadoop.io.Textorg.apache.hadoop.io.Text
�*org.apache.hadoop.io.compress.DefaultCodec����gWŒ‚ÊO≈îbº¡vœÖ��� ���
.DS_StorexúÌò±
¬0EÔ4.S∫a�6∞¢0P∞=0ì·‡/d)ÄDï˛ì¨w≈ù7÷ùØ›⁄ÖüO;≥X¬`’∂µóÆ Æâ¡=Ñ B±lP6Û˛ÜbÅå˜C¢3}ª‘�Lp¥oä"ùËL?jK�&:⁄”Åét¢3]Î
º∑¿˘¸68§ÄÉùø:µ√™*é-¿fifi>!~¯·0Ùˆú ¶ eõ¯c‡ÍÉa◊':”ÍÑòù;I1•�∂©���00.json.bz2xúL\gWTK∞%
,Y
ä( HJFêúsŒ\PrRrŒ9ÁCŒ9√0ÃZUÏÌÊΩÔ≤Ù‚Ãô”’UªvÌÍÓ3£oˆä2ä<˝”-”ãȧπË/d;u¥Û£üV;ÀÒÛ¯Ú˜ˇ˚…≥2¢5Í0‰˝8M⁄,S¸¢`f•†`O<ëüD£≈tÃ¥ó`•´D˚~aº˝«õ˜v'≠)(F|§fiÆÕ ?y¬àœTÒÊYåb…U%E?⁄§efiWˇÒY#üÛÓÓ‚
⁄è„ÍåÚÊU5‡ æ‚Â?q‘°�À{©?íWyü÷ÈûF<[˘éŒhãd>x_ÅÁ
fiÒ_eâ5-—|-M)˙)¸R·ªCÆßs„F>UŒ©ß{o„uÔ&∫˚˚Ÿ?Ä©ßW,”◊Ê∫â«õxã¸[yûgÈñFmx|‡ªÍ¶”¶‡Óp-∆ú§ı
<JN t «F4™#Àä¥Jœ¥‰√|E„‘œ„&º§#g|ˆá{iõOx
The code is
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
public class cinput {
/**
* @param args
* @throws IOException
* @throws IllegalAccessException
* @throws InstantiationException
*/
public static void main(String[] args) throws IOException,
InstantiationException, IllegalAccessException {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
Path inputFile = new Path(otherArgs[0]);
Path outputFile = new Path(otherArgs[1]);
FSDataInputStream inputStream;
Text key = new Text();
Text value = new Text();
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
outputFile, key.getClass(), value.getClass());
FileStatus[] fStatus = fs.listStatus(inputFile);
for (FileStatus fst : fStatus) {
String str = "";
System.out.println("Processing file : " + fst.getPath().getName() + " and the size is : " + fst.getPath().getName().length());
inputStream = fs.open(fst.getPath());
key.set(fst.getPath().getName());
while(inputStream.available()>0) {
str = str+inputStream.readLine();
// System.out.println(str);
}
value.set(str);
writer.append(key, value);
}
fs.close();
IOUtils.closeStream(writer);
System.out.println("SEQUENCE FILE CREATED SUCCESSFULLY........");
}
}
The input I am passing is JSON bzip2 (.json.bz2) files. Could someone please point out why I am getting this strange output?
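As a side note, the header visible at the start of the dump (SEQ, the two org.apache.hadoop.io.Text class names and org.apache.hadoop.io.compress.DefaultCodec) is just the binary SequenceFile header being viewed as text, so the raw file will always look garbled in a terminal. A small reader sketch (untested here, assuming Text keys and values as written above; the class name cread is just a placeholder) can confirm what was actually stored:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class cread {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // same deprecated-style constructor as the writer above, for symmetry
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(args[0]), conf);
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
            System.out.println(key + " : " + value.getLength() + " bytes");
        }
        reader.close();
    }
}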

Hadoop 2 IOException only when trying to open supposed cache files

I recently updated to Hadoop 2.2 (using this tutorial here).
My main job class looks like this, and it throws an IOException:
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.chain.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.mapreduce.lib.reduce.*;
public class UFOLocation2
{
public static class MapClass extends Mapper<LongWritable, Text, Text, LongWritable>
{
private final static LongWritable one = new LongWritable(1);
private static Pattern locationPattern = Pattern.compile("[a-zA-Z]{2}[^a-zA-Z]*$");
private Map<String, String> stateNames;
@Override
public void setup(Context context)
{
try
{
URI[] cacheFiles = context.getCacheFiles();
setupStateMap(cacheFiles[0].toString());
}
catch (IOException ioe)
{
System.err.println("Error reading state file.");
System.exit(1);
}
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException
{
String line = value.toString();
String[] fields = line.split("\t");
String location = fields[2].trim();
if (location.length() >= 2)
{
Matcher matcher = locationPattern.matcher(location);
if (matcher.find())
{
int start = matcher.start();
String state = location.substring(start, start + 2);
context.write(new Text(lookupState(state.toUpperCase())), one);
}
}
}
private void setupStateMap(String filename) throws IOException
{
Map<String, String> states = new HashMap<String, String>();
BufferedReader reader = new BufferedReader(new FileReader(filename));
String line = reader.readLine();
while (line != null)
{
String[] split = line.split("\t");
states.put(split[0], split[1]);
line = reader.readLine();
}
stateNames = states;
}
private String lookupState(String state)
{
String fullName = stateNames.get(state);
return fullName == null ? "Other" : fullName;
}
}
public static void main(String[] args) throws Exception
{
Configuration config = new Configuration();
Job job = Job.getInstance(config, "UFO Location 2");
job.setJarByClass(UFOLocation2.class);
job.addCacheFile(new URI("/user/kevin/data/states.txt"));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
Configuration mapconf1 = new Configuration(false);
ChainMapper.addMapper(job, UFORecordValidationMapper.class, LongWritable.class,
Text.class, LongWritable.class,Text.class, mapconf1);
Configuration mapconf2 = new Configuration(false);
ChainMapper.addMapper(job, MapClass.class, LongWritable.class,
Text.class, Text.class, LongWritable.class, mapconf2);
job.setMapperClass(ChainMapper.class);
job.setCombinerClass(LongSumReducer.class);
job.setReducerClass(LongSumReducer.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
I get an IOException because it can't find the file "/user/kevin/data/states.txt" when it tries to instantiate the BufferedReader in the setupStateMap() method.
Yes, it is deprecated; Job.addCacheFile() should be used to add the files, and in your tasks (map or reduce) the files can be accessed with Context.getCacheFiles().
// It's fine: addCacheFile() and getCacheFiles() are the 2.x API, so you can use something like this
URI[] uri = context.getCacheFiles();
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
Path path = new Path(uri[0].getPath().toString());
if (fileSystem.exists(path)) {
FSDataInputStream dataInputStream = fileSystem.open(path);
byte[] data = new byte[1024];
while (dataInputStream.read(data) > 0) {
//do your stuff here
}
dataInputStream.close();
}
The deprecated functionality will still work anyway.
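Concretely, for the UFOLocation2 job above, setup() could open the cached file through the FileSystem API instead of a plain FileReader. This is a sketch only, not tested against this job, and it additionally needs an import for org.apache.hadoop.fs.FileSystem:
@Override
public void setup(Context context)
{
    try
    {
        URI[] cacheFiles = context.getCacheFiles();
        Path statesPath = new Path(cacheFiles[0]);
        FileSystem fs = statesPath.getFileSystem(context.getConfiguration());
        Map<String, String> states = new HashMap<String, String>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(statesPath)));
        String line = reader.readLine();
        while (line != null)
        {
            String[] split = line.split("\t");
            states.put(split[0], split[1]);
            line = reader.readLine();
        }
        reader.close();
        stateNames = states;
    }
    catch (IOException ioe)
    {
        System.err.println("Error reading state file.");
        System.exit(1);
    }
}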

Parsing of Stack Overflow's posts.xml on Hadoop

I am following this article by Anoop Madhusudanan on CodeProject to build a recommendation engine, not on a cluster but on my own system.
The problem is when I try to parse posts.xml, whose structure is as follows:
<row Id="99" PostTypeId="2" ParentId="88" CreationDate="2008-08-01T14:55:08.477" Score="2" Body="<blockquote>
<p>The actual resolution of gettimeofday() depends on the hardware architecture. Intel processors as well as SPARC machines offer high resolution timers that measure microseconds. Other hardware architectures fall back to the system’s timer, which is typically set to 100 Hz. In such cases, the time resolution will be less accurate. </p>
</blockquote>
<p>I obtained this answer from <a href="http://www.informit.com/guides/content.aspx?g=cplusplus&amp;seqNum=272" rel="nofollow">High Resolution Time Measurement and Timers, Part I</a></p>" OwnerUserId="25" LastActivityDate="2008-08-01T14:55:08.477" />
Now I need to parse this file (about 1.4 GB in size) on Hadoop, for which I have written code in Java and created its jar.
The Java class is as follows:
import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
public class Recommend {
static class Map extends Mapper<Text, Text, Text, Text> {
Path path;
String fXmlFile;
DocumentBuilderFactory dbFactory;
DocumentBuilder dBuilder;
Document doc;
/**
* Given an output filename, write a bunch of random records to it.
*/
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
try{
fXmlFile=value.toString();
dbFactory = DocumentBuilderFactory.newInstance();
dBuilder= dbFactory.newDocumentBuilder();
doc= dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList nList = doc.getElementsByTagName("row");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
Element eElement = (Element) nNode;
Text keyWords =new Text(eElement.getAttribute("OwnerUserId"));
Text valueWords = new Text(eElement.getAttribute("ParentId"));
String val=keyWords.toString()+" "+valueWords.toString();
// Write the sentence
if(keyWords != null && valueWords != null){
output.collect(keyWords, new Text(val));
}
}
}catch (Exception e) {
e.printStackTrace();
}
}
}
/**
*
* #throws IOException
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/*if (args.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}*/
// FileSystem fs = FileSystem.get(conf);
Job job = new Job(conf, "Recommend");
job.setJarByClass(Recommend.class);
// the keys are words (strings)
job.setOutputKeyClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
// the values are counts (ints)
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
//conf.setReducerClass(Reduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
Path outPath = new Path(args[1]);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}
}
}
I expect the output to be a file on Hadoop containing OwnerUserId ParentId pairs,
but instead I get output like:
1599788 <row Id="2292" PostTypeId="2" ParentId="2284" CreationDate="2008-08-05T13:28:06.700" Score="0" ViewCount="0" Body="<p>The first thing you should do is contact the main people who run the open source project. Ask them if it is ok to contribute to the code and go from there.</p>
<p>Simply writing your improved code and then giving it to them may result in your code being rejected.</p>" OwnerUserId="383" LastActivityDate="2008-08-05T13:28:06.700" />
I don't know where the 1599788 that appears as a key from the mapper comes from.
I don't know much about writing mapper classes for Hadoop; I need help modifying my code to get the desired output.
Thanks in advance.
After a lot of research and experiments, I finally learnt how to write a mapper for parsing XML files with the syntax like the one I provided. I changed my approach, and this is my new mapper code. It's working for my use case.
Hope it helps someone and saves them some time :)
import java.io.IOException;
import java.util.StringTokenizer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.xml.sax.SAXException;
public class Map extends Mapper<LongWritable, Text, NullWritable, Text> {
NullWritable obj;
@Override
public void map(LongWritable key, Text value, Context context) throws InterruptedException {
StringTokenizer tok= new StringTokenizer(value.toString());
String pa=null,ow=null,pi=null,v;
while (tok.hasMoreTokens()) {
String[] arr;
String val = (String) tok.nextToken();
if(val.contains("PostTypeId")){
arr= val.split("[\"]");
pi=arr[arr.length-1];
if(pi.equals("2")){
continue;
}
else break;
}
if(val.contains("ParentId")){
arr= val.split("[\"]");
pa=arr[arr.length-1];
}
else if(val.contains("OwnerUserId") ){
arr= val.split("[\"]");
ow=arr[arr.length-1];
try {
if(pa!=null && ow != null){
v = String.format("%s,%s", ow, pa);
context.write(obj,new Text(v));
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
}
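If this mapper replaces the original Map class, the driver above presumably also needs its map-output classes adjusted to the NullWritable/Text signature. A sketch of just those lines, assuming no reducer is used (plus an import for org.apache.hadoop.io.NullWritable in the driver):
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);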
Here is the mapper I've written to parse the SO posts XML and create tab-separated file(s) on Hadoop, to be used by other MapReduce jobs, Hive, or Pig.
Mapper
package com.aravind.learning.hadoop.mapred.techtalks;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
public class StackoverflowDataWranglerMapper extends Mapper<LongWritable, Text, Text, Text>
{
static enum BadRecordCounters
{
NO_CREATION_DATE, UNKNOWN_USER_ID, UNPARSEABLE_RECORD, UNTAGGED_POSTS
}
private final Text outputKey = new Text();
private final Text outputValue = new Text();
private final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder;
private static final Joiner TAG_JOINER = Joiner.on(",").skipNulls();
// 2008-07-31T21:42:52.667
private static final DateFormat DATE_PARSER = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
private static final SimpleDateFormat DATE_BUILDER = new SimpleDateFormat("yyyy-MM-dd");
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
try
{
builder = factory.newDocumentBuilder();
}
catch (ParserConfigurationException e)
{
throw new IOException(e);
}
}
@Override
protected void map(LongWritable inputKey, Text inputValue, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException
{
try
{
String entry = inputValue.toString();
if (entry.contains("<row "))
{
Document doc = builder.parse(new InputSource(new StringReader(entry)));
Element rootElem = doc.getDocumentElement();
String id = rootElem.getAttribute("Id");
String postedBy = rootElem.getAttribute("OwnerUserId").trim();
String viewCount = rootElem.getAttribute("ViewCount");
String postTypeId = rootElem.getAttribute("PostTypeId");
String score = rootElem.getAttribute("Score");
String title = rootElem.getAttribute("Title");
String tags = rootElem.getAttribute("Tags");
String answerCount = rootElem.getAttribute("AnswerCount");
String commentCount = rootElem.getAttribute("CommentCount");
String favoriteCount = rootElem.getAttribute("FavoriteCount");
String creationDate = rootElem.getAttribute("CreationDate");
Date parsedDate = null;
if (creationDate != null && creationDate.trim().length() > 0)
{
try
{
parsedDate = DATE_PARSER.parse(creationDate);
}
catch (ParseException e)
{
context.getCounter("Bad Record Counters", "Posts missing CreationDate").increment(1);
}
}
if (postedBy.length() == 0 || postedBy.trim().equals("-1"))
{
context.getCounter("Bad Record Counters", "Posts with either empty UserId or UserId contains '-1'")
.increment(1);
try
{
parsedDate = DATE_BUILDER.parse("2100-00-01");
}
catch (ParseException e)
{
// ignore
}
}
tags = tags.trim();
String tagTokens[] = null;
if (tags.length() > 1)
{
tagTokens = tags.substring(1, tags.length() - 1).split("><");
}
else
{
context.getCounter("Bad Record Counters", "Untagged Posts").increment(1);
}
outputKey.clear();
outputKey.set(id);
StringBuilder sb = new StringBuilder(postedBy).append("\t").append(parsedDate.getTime()).append("\t")
.append(postTypeId).append("\t").append(title).append("\t").append(viewCount).append("\t").append(score)
.append("\t");
if (tagTokens != null)
{
sb.append(TAG_JOINER.join(tagTokens)).append("\t");
}
else
{
sb.append("").append("\t");
}
sb.append(answerCount).append("\t").append(commentCount).append("\t").append(favoriteCount).toString();
outputValue.set(sb.toString());
context.write(outputKey, outputValue);
}
}
catch (SAXException e)
{
context.getCounter("Bad Record Counters", "Unparsable records").increment(1);
}
finally
{
builder.reset();
}
}
}
Driver
public class StackoverflowDataWranglerDriver extends Configured implements Tool
{
@Override
public int run(String[] args) throws Exception
{
if (args.length != 2)
{
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf());
job.setJobName("Tech Talks - Stackoverflow Forum Posts - Data Wrangler");
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setJarByClass(StackoverflowDataWranglerMapper.class);// required for mr1
job.setMapperClass(StackoverflowDataWranglerMapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String args[]) throws Exception
{
int exitCode = ToolRunner.run(new Configuration(), new StackoverflowDataWranglerDriver(), args);
System.exit(exitCode);
}
}
Job submit command
hadoop jar ./hadoop-examples-0.0.1-SNAPSHOT.jar com.aravind.learning.hadoop.mapred.techtalks.StackoverflowDataWranglerDriver data/stackoverflow-posts.xml data/so-posts-tsv

Parsing and writing log data from mapreduce to hive

I've written a small Hadoop map program to parse (with regexes) information from log files generated by other apps. I found this article: http://www.nearinfinity.com//blogs/stephen_mouring_jr/2013/01/04/writing-hive-tables-from-mapreduce.html
The article explains how to parse the data and write it into a Hive table.
Here is my code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class ParseDataToDB {
public static final String SEPARATOR_FIELD = new String(new char[] {1});
public static final String SEPARATOR_ARRAY_VALUE = new String(new char[] {2});
public static final BytesWritable NULL_KEY = new BytesWritable();
public static class MyMapper extends Mapper<LongWritable, Text, BytesWritable, Text> {
//private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ArrayList<String> bazValues = new ArrayList<String>();
public void map(LongWritable key, Text value,
OutputCollector<BytesWritable, Text> context)
throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while(tokenizer.hasMoreTokens()){
word.set(tokenizer.nextToken());
if(word.find("extract") > -1) {
System.out.println("in herer");
bazValues.add(line);
}
}
// Build up the array values as a delimited string.
StringBuilder bazValueBuilder = new StringBuilder();
int i = 0;
for (String bazValue : bazValues) {
bazValueBuilder.append(bazValue);
++i;
if (i < bazValues.size()) {
bazValueBuilder.append(SEPARATOR_ARRAY_VALUE);
}
}
// Build up the column values / fields as a delimited string.
String hiveRow = new String();
hiveRow += "fooValue";
hiveRow += SEPARATOR_FIELD;
hiveRow += "barValue";
hiveRow += SEPARATOR_FIELD;
hiveRow += bazValueBuilder.toString();
System.out.println("in herer hiveRow" + hiveRow);
// StringBuilder hiveRow = new StringBuilder();
// hiveRow.append("fooValue");
// hiveRow.append(SEPARATOR_FIELD);
// hiveRow.append("barValue");
// hiveRow.append(SEPARATOR_FIELD);
// hiveRow.append(bazValueBuilder.toString());
// Emit a null key and a Text object containing the delimited fields
context.collect(NULL_KEY, new Text(hiveRow));
}
}
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "MyTest");
job.setJarByClass(ParseDataToDB.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(BytesWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(BytesWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
But when I run this app, I get an error saying "expected BytesWritable but received LongWritable". Can someone tell me what I'm doing wrong? I'm new to Hadoop programming. I'm also open to creating external tables and pointing them at HDFS, but again I'm struggling with the implementation.
Thanks.
Looking at the article you provided (link), it seems you haven't set any value for NULL_KEY.
It should be
public static final BytesWritable NULL_KEY = new BytesWritable(null);
I think, since you are trying to output NULL as the key from the map, you can use NullWritable. So your code would be something like below:
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class ParseDataToDB {
public static final String SEPARATOR_FIELD = new String(new char[] {1});
public static final String SEPARATOR_ARRAY_VALUE = new String(new char[] {2});
public static class MyMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
//private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
private ArrayList<String> bazValues = new ArrayList<String>();
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while(tokenizer.hasMoreTokens()){
word.set(tokenizer.nextToken());
if(word.find("extract") > -1) {
System.out.println("in herer");
bazValues.add(line);
}
}
// Build up the array values as a delimited string.
StringBuilder bazValueBuilder = new StringBuilder();
int i = 0;
for (String bazValue : bazValues) {
bazValueBuilder.append(bazValue);
++i;
if (i < bazValues.size()) {
bazValueBuilder.append(SEPARATOR_ARRAY_VALUE);
}
}
// Build up the column values / fields as a delimited string.
String hiveRow = new String();
hiveRow += "fooValue";
hiveRow += SEPARATOR_FIELD;
hiveRow += "barValue";
hiveRow += SEPARATOR_FIELD;
hiveRow += bazValueBuilder.toString();
System.out.println("in herer hiveRow" + hiveRow);
// Emit a null key and a Text object containing the delimited fields
context.write(NullWritable.get(), new Text(hiveRow));
}
}
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "MyTest");
job.setJarByClass(ParseDataToDB.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
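For reference, the two separators built at the top of this class are the non-printing characters that Hive's default text format expects: \001 (Ctrl-A) between fields and \002 (Ctrl-B) between collection items, which is why the article constructs them from char codes 1 and 2. A tiny standalone sketch to see them:
public class SeparatorCheck {
    public static void main(String[] args) {
        String fieldSep = new String(new char[] {1});  // \001 (Ctrl-A): Hive's default field delimiter
        String arraySep = new String(new char[] {2});  // \002 (Ctrl-B): Hive's default collection-item delimiter
        System.out.printf("field=\\%03o array=\\%03o%n", (int) fieldSep.charAt(0), (int) arraySep.charAt(0));
    }
}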
