How to obtain the cosine similarity with Lucene - java

Currently I am trying to compute the cosine similarity between two documents with Lucene (4.10.4).
I have already read this answer about cosine similarity with Lucene, and I used that example to understand how it works with Lucene.
But when I tested with two documents containing the same two words (e.g. "hello world" in each), I got a cosine similarity of 0.9999999999999998.
My code looks like this:
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.RealVector;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
public class CosineSimeTest {
static String indexName = "/tmp/CosineExample";
public static final String CONTENT = "field";
public static final int N = 2;
private final Set<String> terms = new HashSet<>();
private final RealVector v1;
private final RealVector v2;
public static void main(String[] args) {
try {
CosineSimeTest cosSim = new CosineSimeTest("hello world", "hello world");
System.out.println(cosSim.getCosineSimilarity());
} catch (IOException e) {
e.printStackTrace();
}
}
public CosineSimeTest(String s1, String s2) throws IOException {
Directory directory = createIndex(s1, s2);
IndexReader reader = DirectoryReader.open(directory);
Map<String, Double> f1 = getWieghts(reader, 0);
Map<String, Double> f2 = getWieghts(reader, 1);
reader.close();
v1 = toRealVector(f1);
System.out.println("V1: " + v1);
v2 = toRealVector(f2);
System.out.println("V2: " + v2);
}
public Directory createIndex(String s1, String s2) throws IOException {
File f = new File(indexName);
if (f.exists()) {
FileUtils.deleteDirectory(f);
}
Directory directory = FSDirectory.open(new File(indexName));
StandardAnalyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
IndexWriter writer = new IndexWriter(directory, iwc);
addDocument(writer, s1);
addDocument(writer, s2);
writer.close();
return directory;
}
public void addDocument(IndexWriter writer, String data) throws IOException {
Document doc = new Document();
FieldType type = new FieldType();
type.setIndexed(true);
type.setStoreTermVectors(true);
type.setStoreTermVectorPositions(true);
type.freeze();
Field field = new Field(CONTENT, data, type);
doc.add(field);
writer.addDocument(doc);
}
public double getCosineSimilarity() {
double dotProduct = v1.dotProduct(v2);
System.out.println("Dot: " + dotProduct);
System.out.println("V1_norm: " + v1.getNorm() + ", V2_norm: " + v2.getNorm());
double normalization = (v1.getNorm() * v2.getNorm());
System.out.println("Norm: " + normalization);
return dotProduct / normalization;
}
public Map<String, Double> getWieghts(IndexReader reader, int docId) throws IOException {
Terms vector = reader.getTermVector(docId, CONTENT);
Map<String, Integer> docFrequencies = new HashMap<>();
Map<String, Integer> termFrequencies = new HashMap<>();
Map<String, Double> tf_Idf_Weights = new HashMap<>();
TermsEnum termsEnum = null;
DocsEnum docsEnum = null;
termsEnum = vector.iterator(termsEnum);
BytesRef text = null;
while ((text = termsEnum.next()) != null) {
String term = text.utf8ToString();
docFrequencies.put(term, reader.docFreq(new Term(CONTENT, term)));
docsEnum = termsEnum.docs(null, null);
while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
termFrequencies.put(term, docsEnum.freq());
}
terms.add(term);
}
for (String term : docFrequencies.keySet()) {
int tf = termFrequencies.get(term);
int df = docFrequencies.get(term);
double idf = (1 + Math.log(N) - Math.log(df));
double w = tf * idf;
tf_Idf_Weights.put(term, w);
}
// System.out.println("Printing docFrequencies:");
// printMap(docFrequencies);
//
// System.out.println("Printing termFrequencies:");
// printMap(termFrequencies);
//
// System.out.println("Printing if/idf weights:");
// printMapDouble(tf_Idf_Weights);
return tf_Idf_Weights;
}
public RealVector toRealVector(Map<String, Double> map) {
RealVector vector = new ArrayRealVector(terms.size());
int i = 0;
double value = 0;
for (String term : terms) {
if (map.containsKey(term)) {
value = map.get(term);
} else {
value = 0;
}
vector.setEntry(i++, value);
}
return vector;
}
public static void printMap(Map<String, Integer> map) {
for (String key : map.keySet()) {
System.out.println("Term: " + key + ", value: " + map.get(key));
}
}
public static void printMapDouble(Map<String, Double> map) {
for (String key : map.keySet()) {
System.out.println("Term: " + key + ", value: " + map.get(key));
}
}
public void getVersionOfLucene(StandardAnalyzer analyzer) {
System.out.println("version : " + analyzer.getVersion());
}
}
What is the problem? How can I fix it?
Thanks in advance.
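For reference, with these inputs both TF-IDF vectors come out as [1.0, 1.0] (tf = 1 and idf = 1 + ln(2) - ln(2) = 1 for each term), so the exact cosine is 1; the 0.9999999999999998 is ordinary double rounding in sqrt(2) * sqrt(2). A minimal sketch, independent of Lucene, reproduces the value:
public class CosineRoundingDemo {
    public static void main(String[] args) {
        double dot = 1.0 * 1.0 + 1.0 * 1.0;             // 2.0
        double norms = Math.sqrt(2.0) * Math.sqrt(2.0); // 2.0000000000000004, not exactly 2.0
        System.out.println(dot / norms);                // prints 0.9999999999999998
    }
}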

Related

Add custom metadata to tiff

I want to add some custom metadata to a multipage tiff for further processing steps, like
identifier1 = XYZ1
identifier2 = XYZ2
...
My idea was to update (see code/TODO below)
IIOMetadata streamMetadata [option 1]
IIOMetadata imageMetadata [option 2]
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import javax.imageio.stream.ImageOutputStream;
public class TiffMetadataExample {
public static void addMetadata(File tiff, File out, Object metadata2Add)
throws FileNotFoundException, IOException {
try (FileInputStream fis = new FileInputStream(tiff);
FileOutputStream fos = new FileOutputStream(out)) {
addMetadata(fis, fos, metadata2Add);
}
}
public static void addMetadata(InputStream inputImage, OutputStream out, Object metadata2Add)
throws IOException {
List<IIOMetadata> metadata = new ArrayList<>();
List<BufferedImage> images = getImages(inputImage, metadata);
if (metadata.size() != images.size()) {
throw new IllegalStateException();
}
// Obtain a TIFF writer
ImageWriter writer = ImageIO.getImageWritersByFormatName("TIFF").next();
try (ImageOutputStream output = ImageIO.createImageOutputStream(out)) {
writer.setOutput(output);
ImageWriteParam params = writer.getDefaultWriteParam();
params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
// Compression: None, PackBits, ZLib, Deflate, LZW, JPEG and CCITT variants allowed
// (different plugins may use a different set of compression type names)
params.setCompressionType("Deflate");
// streamMetadata is null here
IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(params);
// TODO: add custom metadata fields [option 1]
writer.prepareWriteSequence(streamMetadata);
for (int i = 0; i < images.size(); i++) {
BufferedImage image = images.get(i);
IIOMetadata imageMetadata = metadata.get(i);
// TODO: add custom metadata fields [option 2]
writer.writeToSequence(new IIOImage(image, null, imageMetadata), params);
}
writer.endWriteSequence();
} finally {
writer.dispose();
}
}
private static List<BufferedImage> getImages(final InputStream inputImage,
final List<IIOMetadata> metadata) throws IOException {
List<BufferedImage> images = new ArrayList<>();
ImageReader reader = null;
try (ImageInputStream is = ImageIO.createImageInputStream(inputImage)) {
Iterator<ImageReader> iterator = ImageIO.getImageReaders(is);
reader = iterator.next();
reader.setInput(is);
int numPages = reader.getNumImages(true);
for (int numPage = 0; numPage < numPages; numPage++) {
BufferedImage pageImage = reader.read(numPage);
IIOMetadata imageMetadata = reader.getImageMetadata(numPage);
metadata.add(imageMetadata);
images.add(pageImage);
}
return images;
} finally {
if (reader != null) {
reader.dispose();
}
}
}
}
Trying to update imageMetadata [option 2] with the following code does not work. What is wrong here?
IIOMetadataNode textEntry = new IIOMetadataNode("tEXtEntry");
textEntry.setAttribute("keyword", "aaaaaa");
textEntry.setAttribute("value", "bbb");
IIOMetadataNode text = new IIOMetadataNode("tEXt");
text.appendChild(textEntry);
Node root = meta.getAsTree(formatName);
root.appendChild(text);
//e.g. formatName = "javax_imageio_1.0"
imageMetadata.setFromTree(imageMetadata.getNativeMetadataFormatName(), root);
Or is there a nicer/other way to store some further processing information within the TIFF?
This is my working solution.
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.attribute.UserDefinedFileAttributeView;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.metadata.IIOMetadataNode;
import javax.imageio.stream.ImageInputStream;
import javax.imageio.stream.ImageOutputStream;
import org.apache.commons.imaging.common.RationalNumber;
import org.apache.commons.imaging.formats.tiff.constants.TiffTagConstants;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfo;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoAscii;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoBytes;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoDouble;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoFloat;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoLong;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoRational;
import org.apache.commons.imaging.formats.tiff.taginfos.TagInfoShort;
import com.twelvemonkeys.imageio.metadata.tiff.Rational;
public class TiffMetadataExample {
public static final int TIFF_TAG_XMP = 0x2BC;
public static final String TIFF_TAG_XMP_NAME = "XMP";
private static final String SUN_TIFF_FORMAT = "com_sun_media_imageio_plugins_tiff_image_1.0";
private static final String SUN_TIFF_STREAM_FORMAT =
"com_sun_media_imageio_plugins_tiff_stream_1.0";
private static final String TAG_SET_CLASS_NAME =
"com.sun.media.imageio.plugins.tiff.BaselineTIFFTagSet";
public static void setMetaData(File in, File out, Metadata metaData) throws IOException {
try (FileInputStream fis = new FileInputStream(in);
FileOutputStream fos = new FileOutputStream(out)) {
setMetaData(fis, fos, metaData);
}
UserDefinedFileAttributeView userDefView =
Files.getFileAttributeView(out.toPath(), UserDefinedFileAttributeView.class);
for (Entry<String, String> fileAttEntry : metaData.getfileAtt().entrySet()) {
userDefView.write(fileAttEntry.getKey(),
Charset.defaultCharset().encode(fileAttEntry.getValue()));
}
}
public static void setMetaData(InputStream inputImage, OutputStream out, Metadata metdaData2Add)
throws IOException {
List<IIOMetadata> metadataList = new ArrayList<>();
List<BufferedImage> images = getImages(inputImage, metadataList);
// Obtain a TIFF writer
ImageWriter writer = ImageIO.getImageWritersByFormatName("TIFF").next();
try (ImageOutputStream output = ImageIO.createImageOutputStream(out)) {
writer.setOutput(output);
ImageWriteParam params = writer.getDefaultWriteParam();
params.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
// Compression: None, PackBits, ZLib, Deflate, LZW, JPEG and CCITT variants allowed
// (different plugins may use a different set of compression type names)
params.setCompressionType("Deflate");
IIOMetadata streamMetadata = writer.getDefaultStreamMetadata(params);
writer.prepareWriteSequence(streamMetadata);
for (int i = 0; i < images.size(); i++) {
BufferedImage image = images.get(i);
IIOMetadata imageMetadata = metadataList.get(i);
updateMetadata(imageMetadata, metdaData2Add.get());
writer.writeToSequence(new IIOImage(image, null, imageMetadata), params);
}
writer.endWriteSequence();
} finally {
writer.dispose();
}
}
private static void updateMetadata(IIOMetadata metadata, List<IIOMetadataNode> metdaData2AddList)
throws IOException {
if (SUN_TIFF_FORMAT.equals(metadata.getNativeMetadataFormatName())
|| SUN_TIFF_STREAM_FORMAT.equals(metadata.getNativeMetadataFormatName())) {
// wanted format
} else {
throw new IllegalArgumentException(
"Could not write tiff metadata, wrong format: " + metadata.getNativeMetadataFormatName());
}
IIOMetadataNode root = new IIOMetadataNode(metadata.getNativeMetadataFormatName());
IIOMetadataNode ifd;
if (root.getElementsByTagName("TIFFIFD").getLength() == 0) {
ifd = new IIOMetadataNode("TIFFIFD");
ifd.setAttribute("tagSets", TAG_SET_CLASS_NAME);
root.appendChild(ifd);
} else {
ifd = (IIOMetadataNode) root.getElementsByTagName("TIFFIFD").item(0);
}
for (IIOMetadataNode metdaData2Add : metdaData2AddList) {
ifd.appendChild(metdaData2Add);
}
metadata.mergeTree(metadata.getNativeMetadataFormatName(), root);
}
private static List<BufferedImage> getImages(final InputStream inputImage,
final List<IIOMetadata> metadata) throws IOException {
List<BufferedImage> images = new ArrayList<>();
ImageReader reader = null;
try (ImageInputStream is = ImageIO.createImageInputStream(inputImage)) {
Iterator<ImageReader> iterator = ImageIO.getImageReaders(is);
reader = iterator.next();
reader.setInput(is);
int numPages = reader.getNumImages(true);
for (int numPage = 0; numPage < numPages; numPage++) {
BufferedImage pageImage = reader.read(numPage);
IIOMetadata meta = reader.getImageMetadata(numPage);
metadata.add(meta);
images.add(pageImage);
}
return images;
} finally {
if (reader != null) {
reader.dispose();
}
}
}
public static class Metadata {
private final List<IIOMetadataNode> addList = new ArrayList<>();
private final Map<String, String> fileAtt = new TreeMap<>();
public Metadata() {}
private List<IIOMetadataNode> get() {
return addList;
}
private Map<String, String> getfileAtt() {
return fileAtt;
}
public void add(int exifTag, String exifTagName, Object val) {
IIOMetadataNode md;
if (val instanceof byte[]) {
md = createBytesField(exifTag, exifTagName, (byte[]) val);
} else if (val instanceof String) {
md = createAsciiField(exifTag, exifTagName, (String) val);
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof Short) {
md = createShortField(exifTag, exifTagName, ((Short) val).intValue());
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof Integer) {
md = createShortField(exifTag, exifTagName, ((Integer) val).intValue());
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof Long) {
md = createLongField(exifTag, exifTagName, ((Long) val).longValue());
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof Float) {
md = createFloatField(exifTag, exifTagName, ((Float) val).floatValue());
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof Double) {
md = createDoubleField(exifTag, exifTagName, ((Double) val).doubleValue());
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof Rational) {
md = createRationalField(exifTag, exifTagName, ((Rational) val));
fileAtt.put(exifTagName, String.valueOf(val));
} else if (val instanceof RationalNumber) {
md = createRationalField(exifTag, exifTagName, ((RationalNumber) val));
fileAtt.put(exifTagName, String.valueOf(val));
} else {
throw new IllegalArgumentException("unsupported value class: " + val.getClass().getName());
}
addList.add(md);
}
/**
*
* @param tagInfo {@link TiffTagConstants} like {@link TiffTagConstants#TIFF_TAG_XMP}
* @param val String, byte[],
*/
public void add(TagInfo tagInfo, Object val) {
if (tagInfo instanceof TagInfoBytes) {
if (!(val instanceof byte[])) {
throw new IllegalArgumentException("expecting byte[] value");
}
} else if (tagInfo instanceof TagInfoAscii) {
if (!(val instanceof String)) {
throw new IllegalArgumentException("expecting String value");
}
} else if (tagInfo instanceof TagInfoShort) {
if (val instanceof Short || val instanceof Integer) {
// ok
} else {
throw new IllegalArgumentException("expecting Short/Integer value");
}
} else if (tagInfo instanceof TagInfoLong) {
if (!(val instanceof Long)) {
throw new IllegalArgumentException("expecting Long value");
}
} else if (tagInfo instanceof TagInfoDouble) {
if (!(val instanceof Double)) {
throw new IllegalArgumentException("expecting double value");
}
} else if (tagInfo instanceof TagInfoFloat) {
if (!(val instanceof Float)) {
throw new IllegalArgumentException("expecting float value");
}
} else if (tagInfo instanceof TagInfoRational) {
if (val instanceof RationalNumber || val instanceof Rational) {
// ok
} else {
throw new IllegalArgumentException("expecting rational value");
}
}
add(tagInfo.tag, tagInfo.name, val);
}
private static IIOMetadataNode createBytesField(int number, String name, byte[] bytes) {
IIOMetadataNode field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
IIOMetadataNode arrayNode = new IIOMetadataNode("TIFFBytes");
field.appendChild(arrayNode);
for (byte b : bytes) {
IIOMetadataNode valueNode = new IIOMetadataNode("TIFFByte");
valueNode.setAttribute("value", Integer.toString(b));
arrayNode.appendChild(valueNode);
}
return field;
}
private static IIOMetadataNode createShortField(int number, String name, int val) {
IIOMetadataNode field, arrayNode, valueNode;
field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
arrayNode = new IIOMetadataNode("TIFFShorts");
field.appendChild(arrayNode);
valueNode = new IIOMetadataNode("TIFFShort");
arrayNode.appendChild(valueNode);
valueNode.setAttribute("value", Integer.toString(val));
return field;
}
private static IIOMetadataNode createAsciiField(int number, String name, String val) {
IIOMetadataNode field, arrayNode, valueNode;
field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
arrayNode = new IIOMetadataNode("TIFFAsciis");
field.appendChild(arrayNode);
valueNode = new IIOMetadataNode("TIFFAscii");
arrayNode.appendChild(valueNode);
valueNode.setAttribute("value", val);
return field;
}
private static IIOMetadataNode createLongField(int number, String name, long val) {
IIOMetadataNode field, arrayNode, valueNode;
field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
arrayNode = new IIOMetadataNode("TIFFLongs");
field.appendChild(arrayNode);
valueNode = new IIOMetadataNode("TIFFLong");
arrayNode.appendChild(valueNode);
valueNode.setAttribute("value", Long.toString(val));
return field;
}
private static IIOMetadataNode createFloatField(int number, String name, float val) {
IIOMetadataNode field, arrayNode, valueNode;
field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
arrayNode = new IIOMetadataNode("TIFFFloats");
field.appendChild(arrayNode);
valueNode = new IIOMetadataNode("TIFFFloat");
arrayNode.appendChild(valueNode);
valueNode.setAttribute("value", Float.toString(val));
return field;
}
private static IIOMetadataNode createDoubleField(int number, String name, double val) {
IIOMetadataNode field, arrayNode, valueNode;
field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
arrayNode = new IIOMetadataNode("TIFFDoubles");
field.appendChild(arrayNode);
valueNode = new IIOMetadataNode("TIFFDouble");
arrayNode.appendChild(valueNode);
valueNode.setAttribute("value", Double.toString(val));
return field;
}
private static IIOMetadataNode createRationalField(int number, String name, Rational rational) {
return createRationalField(number, name, rational.numerator(), rational.denominator());
}
private static IIOMetadataNode createRationalField(int number, String name,
RationalNumber rational) {
return createRationalField(number, name, rational.numerator, rational.divisor);
}
private static IIOMetadataNode createRationalField(int number, String name, long numerator,
long denominator) {
IIOMetadataNode field, arrayNode, valueNode;
field = new IIOMetadataNode("TIFFField");
field.setAttribute("number", Integer.toString(number));
field.setAttribute("name", name);
arrayNode = new IIOMetadataNode("TIFFRationals");
field.appendChild(arrayNode);
valueNode = new IIOMetadataNode("TIFFRational");
arrayNode.appendChild(valueNode);
valueNode.setAttribute("value", numerator + "/" + denominator);
return field;
}
}
}
Usage
byte[] bytes = create();
TiffMetadata.Metadata metaData = new TiffMetadata.Metadata();
metaData.add(TiffTagConstants.TIFF_TAG_SOFTWARE, "FUBAR");
// metaData.add(TiffMetadata.TIFF_TAG_XMP, TiffMetadata.TIFF_TAG_XMP_NAME, bytes );
metaData.add(TiffTagConstants.TIFF_TAG_XMP, bytes);
TiffMetadata.setMetaData(tiffIn, tiffOut, metaData);
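To verify what actually ended up in the file, one can dump the native metadata tree of the written TIFF and look for the added TIFFField entries. A minimal read-back sketch (the "out.tiff" path stands in for the tiffOut written above):
import java.io.File;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
public class DumpTiffFields {
    public static void main(String[] args) throws Exception {
        ImageReader reader = ImageIO.getImageReadersByFormatName("TIFF").next();
        try (ImageInputStream in = ImageIO.createImageInputStream(new File("out.tiff"))) {
            reader.setInput(in);
            IIOMetadata meta = reader.getImageMetadata(0);
            // the native tree root is an IIOMetadataNode, which implements Element
            Element root = (Element) meta.getAsTree(meta.getNativeMetadataFormatName());
            NodeList fields = root.getElementsByTagName("TIFFField");
            for (int i = 0; i < fields.getLength(); i++) {
                Element field = (Element) fields.item(i);
                // prints tag number and name, e.g. 305 / Software
                System.out.println(field.getAttribute("number") + " / " + field.getAttribute("name"));
            }
        } finally {
            reader.dispose();
        }
    }
}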

Getting error while using combinebykey in JavaDStream

I am getting the below error while running the program. I am trying to find the average of the DStream in the format (name, avg).
'The method combineByKey(Function, Function2, Function2, Partitioner) in the type JavaPairDStream is not applicable for the arguments (Function, Function2, Function2)'
Please help.
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.log4j.*;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.Map;
import java.util.Map.Entry;
import scala.Tuple2;
public class FirstSparkApplication {
@SuppressWarnings("serial")
public static class AvgCount implements java.io.Serializable {
public AvgCount(double total, int num) {
total_ = total;
num_ = num;
}
public double total_;
public int num_;
public double avg() {
return total_ / (double) num_;
}
}
public static void main(String[] args) throws InterruptedException {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("FirstSparkApplication");
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(20));
Logger.getRootLogger().setLevel(Level.ERROR);
Function2<Double, Double, Double> reduceFunc = new Function2<Double, Double,
Double>() {
public Double call(Double result, Double value)
throws Exception {
System.out.println("Reduce running");
System.out.println(result + "+" + value);
return result + value;
}
};
JavaDStream<String> lines = jssc.textFileStream("/home/dominic/Downloads/DATADIR").cache();
final String[] path = new String[]{ "/home/dominic/Downloads/OUTPUTDIR"};
JavaPairDStream<String, Double> pair = lines.flatMapToPair(
new PairFlatMapFunction<String, String, Double>() {
private static final long serialVersionUID = 67676744;
public Iterator<Tuple2<String, Double>> call(String t) throws Exception {
List<Tuple2<String, Double>> list = new ArrayList<Tuple2<String, Double>>();
JSONArray js1 = new JSONArray(t);
for (int i = 0; i < js1.length(); i++) {
String symbol = js1.getJSONObject(i).get("symbol")
.toString();
JSONObject jo = new JSONObject(js1.getJSONObject(i)
.get("priceData").toString());
list.add(new Tuple2<String, Double>(symbol,jo.getDouble("close")));
}
return list.iterator();
}
});
JavaPairDStream<String, Double> result=pair.reduceByKeyAndWindow(reduceFunc, Durations.seconds(100), Durations.seconds(60));
pair.print();
result.print();
//Average
Function<Double, AvgCount> createAcc = new Function<Double, AvgCount>() {
public AvgCount call(Double x) {
return new AvgCount(x, 1);
}
};
Function2<AvgCount, Double, AvgCount> addAndCount = new Function2<AvgCount, Double, AvgCount>() {
public AvgCount call(AvgCount a, Double x) {
a.total_ += x;
a.num_ += 1;
return a;
}
};
Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
public AvgCount call(AvgCount a, AvgCount b) {
a.total_ += b.total_;
a.num_ += b.num_;
return a;
}
};
AvgCount initial = new AvgCount(0,0);
JavaPairDStream<String, AvgCount> avgCounts = result.combineByKey(createAcc, addAndCount, combine);
// Map<String, AvgCount> countMap = avgCounts.collectAsMap();
// for (Entry<String, AvgCount> entry : countMap.entrySet()) {
// System.out.println(entry.getKey() + ":" + entry.getValue().avg());
jssc.start();
jssc.awaitTermination();
jssc.close();
}
}
You're missing a Partitioner implementation in the call to combineByKey. That's what the compiler is telling you.
result.combineByKey(createAcc, addAndCount, combine /*, Partitioner here */);
It's a good habit to check the appropriate documentation once you get these kinds of errors. See Partitioner.
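For example (a sketch; the partition count of 4 is an arbitrary choice, and any Partitioner such as Spark's HashPartitioner satisfies the signature):
import org.apache.spark.HashPartitioner;

JavaPairDStream<String, AvgCount> avgCounts =
        result.combineByKey(createAcc, addAndCount, combine, new HashPartitioner(4));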

calculation of RSI(Relative Strength Index) Using ta4j java library is incorrect

This is my class to fetch the list of OHLC (open, high, low, close) values, plus the volume and date. I have separated each into an ArrayList per stock symbol. I have used my local API to fetch the data. To perform all these calculations I have used the ta4j library for Java.
package com.infodev.util;
import com.google.gson.Gson;
import com.infodev.Model.Data;
import com.infodev.Model.Find;
import com.infodev.Pojo.RequestForTechnicalCalculation;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.springframework.http.*;
import org.springframework.http.converter.json.MappingJackson2HttpMessageConverter;
import org.springframework.web.client.RestTemplate;
import java.math.BigDecimal;
public class ApiTestData {
static Logger logger = Logger.getLogger(ApiTestData.class);
private static Data[] a;
public ApiTestData(RequestForTechnicalCalculation requestForTechnicalCalculation) throws Exception {
//setting request body
JSONObject jsonObject = new JSONObject();
jsonObject.put("sectorId", requestForTechnicalCalculation.getSectorId());
//setting request headers
HttpHeaders httpHeaders = new HttpHeaders();
httpHeaders.setContentType(MediaType.APPLICATION_JSON);
//setting httpEntity as the request for server post request
HttpEntity<?> httpEntity = new HttpEntity<>(jsonObject.toString(), httpHeaders);
//installing restTemplate
RestTemplate restTemplate = new RestTemplate();
restTemplate.getMessageConverters().add(new MappingJackson2HttpMessageConverter());
ResponseEntity<Find> returnedObject = restTemplate.exchange("http://localhost:8081/pull365", HttpMethod.POST, httpEntity, Find.class);
a = returnedObject.getBody().getData();
logger.info("ApiData " + new Gson().toJson(a));
}
public int getDataSize() {
return a.length;
}
public BigDecimal[] getOpen(int index) {
return a[index].getOpen();
}
public BigDecimal[] getHigh(int index) {
return a[index].getHigh();
}
public BigDecimal[] getLow(int index) {
return a[index].getLow();
}
public BigDecimal[] getClose(int index) {
return a[index].getClose();
}
public BigDecimal[] getVolume(int index) {
return a[index].getVolume();
}
public String[] getDates(int index) {
return a[index].getDates();
}
public String getSymbols(int index) {
logger.info("stock name " +new Gson().toJson(a[index].getStockName()));
return a[index].getStockName();
}
}
This is my calculation part to get the values of RSI. I have also calculated other indicators, which are exactly correct according to my manual calculation, but the problem seems to be in the calculation of RSI.
package com.infodev.Services.Indicators;
import com.infodev.Pojo.RequestForTechnicalCalculation;
import com.infodev.util.ApiTestData;
import eu.verdelhan.ta4j.Decimal;
import eu.verdelhan.ta4j.Tick;
import eu.verdelhan.ta4j.TimeSeries;
import eu.verdelhan.ta4j.indicators.candles.LowerShadowIndicator;
import eu.verdelhan.ta4j.indicators.helpers.*;
import eu.verdelhan.ta4j.indicators.oscillators.CMOIndicator;
import eu.verdelhan.ta4j.indicators.oscillators.PPOIndicator;
import eu.verdelhan.ta4j.indicators.oscillators.StochasticOscillatorDIndicator;
import eu.verdelhan.ta4j.indicators.oscillators.StochasticOscillatorKIndicator;
import eu.verdelhan.ta4j.indicators.simple.*;
import eu.verdelhan.ta4j.indicators.statistics.*;
import eu.verdelhan.ta4j.indicators.trackers.*;
import eu.verdelhan.ta4j.indicators.trackers.bollinger.BollingerBandWidthIndicator;
import eu.verdelhan.ta4j.indicators.trackers.bollinger.BollingerBandsLowerIndicator;
import eu.verdelhan.ta4j.indicators.trackers.bollinger.BollingerBandsMiddleIndicator;
import eu.verdelhan.ta4j.indicators.trackers.bollinger.BollingerBandsUpperIndicator;
import eu.verdelhan.ta4j.indicators.volatility.MassIndexIndicator;
import eu.verdelhan.ta4j.indicators.volume.ChaikinMoneyFlowIndicator;
import eu.verdelhan.ta4j.indicators.volume.OnBalanceVolumeIndicator;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.joda.time.Period;
import org.springframework.stereotype.Service;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.math.BigDecimal;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@Service
public class IndicatorServiceImpl implements IndicatorService {
static Logger logger = Logger.getLogger(IndicatorServiceImpl.class);
private static DecimalFormat df = new DecimalFormat("###,###.##");
private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");
List<Tick> ticks;
List<Tick> tickList;
TimeSeries series;
ClosePriceIndicator closePrice;
SMAIndicator shortSma;
SMAIndicator longSma;
EMAIndicator shortEma;
EMAIndicator longEma;
RSIIndicator rsi;
StandardDeviationIndicator sd;
MACDIndicator macd;
BollingerBandsMiddleIndicator bbm;
BollingerBandsLowerIndicator bbl;
BollingerBandsUpperIndicator bbh;
BollingerBandWidthIndicator bbw;
ApiTestData apiData;
String symbol;
String[] date;
BigDecimal[] volume;
BigDecimal[] close;
BigDecimal[] low;
BigDecimal[] high;
BigDecimal[] open;
@Override
public List<Map<Object, Object>> getIndicators(RequestForTechnicalCalculation requestForTechnicalCalculation) {
System.out.println("service state");
List<Map<Object, Object>> finalList = new ArrayList<>();
try {
apiData = new ApiTestData(requestForTechnicalCalculation);
logger.info("----" + apiData.getDataSize());
for (int i = 0; i < apiData.getDataSize(); i++) {
logger.info("----" + i);
// getting the symbol from the api
symbol = apiData.getSymbols(i);
date = apiData.getDates(i);
volume = apiData.getVolume(i);
close = apiData.getClose(i);
low = apiData.getLow(i);
high = apiData.getHigh(i);
open = apiData.getOpen(i);
if (date.length == 0 || volume.length == 0 || close.length == 0 ||
low.length == 0 || high.length == 0 || open.length == 0) {
finalList.add(makeEmptyObject());
} else {
makeCalculation(i);
finalList.add(makeIndicatorObject());
}
}
//return finalList;
} catch (Exception e) {
e.printStackTrace();
finalList.add(makeEmptyObject());
}
return finalList;
}
private void makeCalculation(int ii) throws ParseException {
//instating tick to change the ohlc to Tick class array
ticks = new ArrayList<>();
logger.info("----" + ticks.size());
for (int i = 0; i < close.length; i++) {
this.ticks.add(new Tick(new DateTime(DATE_FORMAT.parse(date[i])), open[i].doubleValue(), high[i].doubleValue()
, low[i].doubleValue(), close[i].doubleValue(), volume[i].doubleValue()));
}
//converting the array to the list of tick
//generating the time Series of the sample data
series = new TimeSeries(apiData.getSymbols(ii), ticks);
if (series == null) {
throw new IllegalArgumentException("Series cannot be null");
} else {
//close price indicator
closePrice = new ClosePriceIndicator(this.series);
logger.info("ClosePrice: " + closePrice.getValue(series.getEnd()));
// Simple moving averages
shortSma = new SMAIndicator(closePrice, 5);
logger.info("shortSMA: " + shortSma.getValue(series.getEnd()));
longSma = new SMAIndicator(closePrice, 20);
logger.info("longSMA: " + longSma.getValue(series.getEnd()));
// Exponential moving averages
shortEma = new EMAIndicator(closePrice, 5);
logger.info("shortEMA: " + shortEma.getValue(series.getEnd()));
longEma = new EMAIndicator(closePrice, 20);
logger.info("longEMA: " + longEma.getValue(series.getEnd()));
rsi = new RSIIndicator(closePrice, 14);
series.getLastTick().addTrade(100, rsi.getValue(series.getEnd()).toDouble());
//newTick.addTrade(100, rsi.getValue(series.getEnd()).toDouble());
logger.info("RsiIndicator: " + rsi.getValue(series.getEnd()));
// Standard deviation
sd = new StandardDeviationIndicator(closePrice, 20);
logger.info("StandardDeviationIndicator: " + sd.getValue(series.getEnd()));
//macd indicator
macd = new MACDIndicator(closePrice, 12, 26);
logger.info("MACD indicator: " + macd.getValue(series.getEnd()));
//bollingerbandsmiddle indicator
bbm = new BollingerBandsMiddleIndicator(longSma);
logger.info("Bollinger Bands Middle Indicator :" + bbm.getValue(series.getEnd()));
bbl = new BollingerBandsLowerIndicator(bbm, sd);
logger.info("Bollinger bands lower indicator :" + bbl.getValue(series.getEnd()));
bbh = new BollingerBandsUpperIndicator(bbm, sd);
logger.info("Bollinger bands upper indicator :" + bbh.getValue(series.getEnd()));
bbw = new BollingerBandWidthIndicator(bbh, bbm, bbl);
logger.info("Bollinger band width :" + bbw.getValue(series.getEnd()));
StringBuilder sb = new StringBuilder("timestamp,close,typical,variation,sma8,sma20,ema8,ema20,ppo,roc,rsi,williamsr,atr,sd\n");
/**
* Adding indicators values
*/
final int nbTick = series.getTickCount();
for (int i = 0; i < nbTick; i++) {
sb.append(series.getTick(i).getEndTime()).append(',')
.append(closePrice.getValue(i)).append(',')
.append(typicalPrice.getValue(i)).append(',')
.append(priceVariation.getValue(i)).append(',')
.append(shortSma.getValue(i)).append(',')
.append(longSma.getValue(i)).append(',')
.append(shortEma.getValue(i)).append(',')
.append(longEma.getValue(i)).append(',')
.append(ppo.getValue(i)).append(',')
.append(roc.getValue(i)).append(',')
.append(rsi.getValue(i)).append(',')
.append(williamsR.getValue(i)).append(',')
.append(atr.getValue(i)).append(',')
.append(sd.getValue(i)).append('\n');
}
/**
* Writing CSV file
*/
BufferedWriter writer = null;
try {
writer = new BufferedWriter(new FileWriter("C:\\Users\\Administrator\\Desktop\\fafa\\indicators.csv"));
writer.write(sb.toString());
} catch (IOException ioe) {
System.out.println(ioe);
} finally {
try {
if (writer != null) {
writer.close();
}
} catch (IOException ioe) {
}
}
}
}
private Map<Object, Object> makeIndicatorObject() {
// Map for indicator values.
try {
logger.info("map state of make indicator");
Map<Object, Object> indicators = new LinkedHashMap<>();
indicators.put("symbol", symbol);
indicators.put("ClosePrice", formatBigDecimal(closePrice.getValue(series.getEnd()).toDouble()));
indicators.put("ShortSMA", formatBigDecimal(shortSma.getValue(series.getEnd()).toDouble()));
indicators.put("LongSMA", formatBigDecimal(longSma.getValue(series.getEnd()).toDouble()));
indicators.put("ShortEMA", formatBigDecimal(shortEma.getValue(series.getEnd()).toDouble()));
indicators.put("LongEMA", formatBigDecimal(longEma.getValue(series.getEnd()).toDouble()));
indicators.put("RSI", formatBigDecimal(rsi.getValue(series.getEnd()).toDouble()));
indicators.put("SD", formatBigDecimal(sd.getValue(series.getEnd()).toDouble()));
indicators.put("MACD", formatBigDecimal(macd.getValue(series.getEnd()).toDouble()));
indicators.put("BBM", formatBigDecimal(bbm.getValue(series.getEnd()).toDouble()));
indicators.put("BBL", formatBigDecimal(bbl.getValue(series.getEnd()).toDouble()));
indicators.put("BBH", formatBigDecimal(bbh.getValue(series.getEnd()).toDouble()));
indicators.put("BBW", formatBigDecimal(bbw.getValue(series.getEnd()).toDouble()));
return indicators;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
private BigDecimal formatBigDecimal(double value) {
try {
return new BigDecimal(df.format(value));
} catch (Exception e) {
return new BigDecimal(0);
}
}
private Map<Object, Object> makeEmptyObject() {
logger.info("map state of empty object");
Map<Object, Object> indicators = new LinkedHashMap<>();
indicators.put("symbol", symbol);
indicators.put("ClosePrice", new BigDecimal(0));
indicators.put("ShortSMA", new BigDecimal(0));
indicators.put("LongSMA", new BigDecimal(0));
indicators.put("ShortEMA", new BigDecimal(0));
indicators.put("LongEMA", new BigDecimal(0));
indicators.put("RSI", new BigDecimal(0));
indicators.put("SD", new BigDecimal(0));
indicators.put("MACD", new BigDecimal(0));
indicators.put("BBM", new BigDecimal(0));
indicators.put("BBL", new BigDecimal(0));
indicators.put("BBH", new BigDecimal(0));
indicators.put("BBW", new BigDecimal(0));
return indicators;
}
}
This is the JSON output from the local API that is used in the first class (ApiTestData).
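For cross-checking the library result, below is a minimal standalone sketch of the textbook Wilder RSI over a plain close-price array (period 14, as in the question). This is an assumption about the reference formula used for the manual calculation, not ta4j's implementation:
public class RsiCrossCheck {
    // Textbook Wilder RSI: seed the average gain/loss over the first `period` moves,
    // then apply Wilder smoothing; requires close.length > period.
    public static double wilderRsi(double[] close, int period) {
        double gain = 0, loss = 0;
        for (int i = 1; i <= period; i++) {
            double diff = close[i] - close[i - 1];
            if (diff >= 0) gain += diff; else loss -= diff;
        }
        double avgGain = gain / period, avgLoss = loss / period;
        for (int i = period + 1; i < close.length; i++) {
            double diff = close[i] - close[i - 1];
            avgGain = (avgGain * (period - 1) + Math.max(diff, 0)) / period;
            avgLoss = (avgLoss * (period - 1) + Math.max(-diff, 0)) / period;
        }
        return avgLoss == 0 ? 100 : 100 - 100 / (1 + avgGain / avgLoss);
    }
}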

Hadoop facebook mutual friends using mapreduce

I was trying a MapReduce program in Hadoop (Java), to find the mutual friends list from a JSON file. The JSON file content has the following pattern:
{"name":"abc","id":123} [{"name":"xyz","id":124},{"name":"def","id":125},{"name":"cxf","id":155}]
{"name":"cxf","id":155} [{"name":"xyz","id":124},{"name":"abc","id":123},{"name":"yyy","id":129}]
The pattern is to be interpreted as follows:
a friend JSON, followed by a tab, followed by an array of that friend's related friends (as JSON objects).
Hence abc has xyz, def and cxf as friends,
and cxf has xyz, abc and yyy as friends.
Given the above, the mutual friend between abc and cxf is xyz.
I tried to implement this using MapReduce by creating custom writables, with the mapper emitting the following key-value pairs: the key being a pair of friends and the value being the related friends of the first friend in the key (i.e., of the pair of friends).
K->V
(abc,xyz) -> [xyz,def,cxf]
(abc,def) -> [xyz,def,cxf]
(abc,cxf) -> [xyz,def,cxf]
(cxf,xyz) -> [xyz,abc,yyy]
(cxf,abc) -> [xyz,abc,yyy]
(cxf,yyy) -> [xyz,abc,yyy]
The key here is actually a custom writable: I created a class which implements WritableComparable and overrode the compareTo method so that the pairs (a,b) and (b,a) are considered the same. But the problem I am facing is that the compareTo method is not invoked for all combinations of pairs, and hence the reducer logic is failing.
Based on the above example, there are 6 (K, V) pairs emitted by the mapper, but compareTo is invoked only 5 times: key1.compareTo(key2), key2.compareTo(key3), key3.compareTo(key4), key4.compareTo(key5), key5.compareTo(key6).
Any idea why this is happening?
Below is the code as per the logic suggested by f11ler
Driver class :
package com.facebook.updated;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
public class FacebookMain extends Configured implements Tool
{
Logger logger = Logger.getLogger(FacebookMain.class);
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new FacebookMain(), args));
}
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
logger.info("Running======>");
Job job = Job.getInstance();
job.setJarByClass(FacebookMain.class);
job.setJobName("FBApp");
job.setMapOutputKeyClass(Friend.class);
job.setMapOutputValueClass(Friend.class);
job.setOutputKeyClass(FriendPair.class);
job.setOutputValueClass(Friend.class);
job.setMapperClass(FacebookMapper.class);
job.setReducerClass(FacebookReducer.class);
job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean val = job.waitForCompletion(true);
return val ? 0 : 1;
}
}
The custom writables (used to represent a friend and a friend pair):
package com.facebook.updated;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.log4j.Logger;
@Getter
@Setter
public class Friend implements WritableComparable<Friend> {
Logger logger = Logger.getLogger(Friend.class);
private IntWritable id;
private Text name;
public Friend() {
this.id = new IntWritable();
this.name = new Text();
}
@Override
public int compareTo(Friend arg0) {
int val = getId().compareTo(arg0.getId());
logger.info("compareTo Friend ======> " + arg0 + " and " + this + " compare is " + val);
return val;
}
@Override
public void readFields(DataInput in) throws IOException {
id.readFields(in);
name.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
id.write(out);
name.write(out);
}
@Override
public boolean equals(Object obj) {
Friend f2 = (Friend) obj;
boolean val = this.getId().equals(f2.getId());
//logger.info("equals Friend ======> " + obj + " and " + this);
return val;
}
@Override
public String toString() {
return id + ":" + name + " ";
}
}
package com.facebook.updated;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.WritableComparable;
import org.apache.log4j.Logger;
@Getter
@Setter
public class FriendPair implements WritableComparable<FriendPair> {
Logger logger = Logger.getLogger(FriendPair.class);
private Friend first;
private Friend second;
public FriendPair() {
this.first = new Friend();
this.second = new Friend();
}
public FriendPair(Friend f1, Friend f2) {
this.first = f1;
this.second = f2;
}
@Override
public int compareTo(FriendPair o) {
logger.info("compareTo FriendPair ======> " + o + " and " + this);
FriendPair pair2 = o;
int cmp = -1;
if (getFirst().compareTo(pair2.getFirst()) == 0 || getFirst().compareTo(pair2.getSecond()) == 0) {
cmp = 0;
}
if (cmp != 0) {
// logger.info("compareTo FriendPair ======> " + o + " and " + this
// + " comparison is " + cmp);
return cmp;
}
cmp = -1;
if (getSecond().compareTo(pair2.getFirst()) == 0 || getSecond().compareTo(pair2.getSecond()) == 0) {
cmp = 0;
}
// logger.info("compareTo FriendPair ======> " + o + " and " + this +
// " comparison is " + cmp);
// logger.info("getFirst() " + getFirst());
// logger.info("pair2.getFirst() " + pair2.getFirst());
// logger.info("getFirst().compareTo(pair2.getFirst()) " +
// getFirst().compareTo(pair2.getFirst()));
// logger.info("getFirst().compareTo(pair2.getSecond()) " +
// getFirst().compareTo(pair2.getSecond()));
// logger.info("getSecond().compareTo(pair2.getFirst()) " +
// getSecond().compareTo(pair2.getFirst()));
// logger.info("getSecond().compareTo(pair2.getSecond()) " +
// getSecond().compareTo(pair2.getSecond()));
// logger.info("pair2.getSecond() " + pair2.getSecond());
// logger.info("getSecond() " + getSecond());
// logger.info("pair2.getFirst() " + pair2.getFirst());
// logger.info("pair2.getSecond() " + pair2.getSecond());
return cmp;
}
@Override
public boolean equals(Object obj) {
FriendPair pair1 = this;
FriendPair pair2 = (FriendPair) obj;
boolean eq = false;
logger.info("equals FriendPair ======> " + obj + " and " + this);
if (pair1.getFirst().equals(pair2.getFirst()) || pair1.getFirst().equals(pair2.getSecond()))
eq = true;
if (!eq) {
// logger.info("equals FriendPair ======> " + obj + " and " + this +
// " equality is " + eq);
return false;
}
if (pair1.getSecond().equals(pair2.getFirst()) || pair1.getSecond().equals(pair2.getSecond()))
eq = true;
// logger.info("equals FriendPair ======> " + obj + " and " + this +
// " equality is " + eq);
return eq;
}
@Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public String toString() {
return "[" + first + ";" + second + "]";
}
@Override
public int hashCode() {
logger.info("hashCode FriendPair ======> " + this);
return first.getId().hashCode() + second.getId().hashCode();
}
}
Mapper and Reducer
package com.facebook.updated;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.util.JSON;
public class FacebookMapper extends Mapper<LongWritable, Text, Friend, Friend> {
Logger log = Logger.getLogger(FacebookMapper.class);
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Friend, Friend>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer st = new StringTokenizer(line, "\t");
String person = st.nextToken();
String friends = st.nextToken();
BasicDBObject personObj = (BasicDBObject) JSON.parse(person);
BasicDBList friendsList = (BasicDBList) JSON.parse(friends);
List<Friend> frndJavaList = new ArrayList<>();
for (Object frndObj : friendsList) {
frndJavaList.add(getFriend((BasicDBObject) frndObj));
}
Friend frnd = getFriend(personObj);
Friend[] array = frndJavaList.toArray(new Friend[frndJavaList.size()]);
for (Friend f : array) {
log.info("Map output is " + f + " and " + frnd);
context.write(f, frnd);
}
}
private static Friend getFriend(BasicDBObject personObj) {
Friend frnd = new Friend();
frnd.setId(new IntWritable(personObj.getInt("id")));
frnd.setName(new Text(personObj.getString("name")));
return frnd;
}
}
package com.facebook.updated;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;
public class FacebookReducer extends Reducer<Friend, Friend, FriendPair, Friend> {
Logger log = Logger.getLogger(FacebookReducer.class);
@Override
protected void reduce(Friend friend, Iterable<Friend> vals,
Reducer<Friend, Friend, FriendPair, Friend>.Context context) throws IOException, InterruptedException {
List<Friend> friends = new ArrayList<>();
for (Friend frnd : vals) {
friends.add(frnd);
}
log.info("Reducer output is " + friend + " and values are " + friends);
if (friends.size() == 2) {
FriendPair key = new FriendPair(friends.get(0), friends.get(1));
context.write(key, friend);
} else {
//log.info("Size of friends is not 2 key is " + friend + " and values are " + friends);
}
}
}
Input json file containing 2 lines
{"name":"abc","id":123} [{"name":"xyz","id":124},{"name":"def","id":125},{"name":"cxf","id":155}]
{"name":"cxf","id":155} [{"name":"xyz","id":124},{"name":"abc","id":123},{"name":"yyy","id":129}]
Output of reducer
(abc,abc)->xyz
The compareTo method is required for sorting, and this relation should be transitive. This means that if a > b and b > c, then a > c. Probably this is not true for your implementation.
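For illustration, one way to make the pair comparison transitive is to compare a canonical ordering of the two ids, e.g. (a sketch, not the posted code):
@Override
public int compareTo(FriendPair o) {
    // order each pair as (smaller id, larger id) so that (a,b) and (b,a) compare as equal
    // and the ordering stays transitive
    int thisLo = Math.min(first.getId().get(), second.getId().get());
    int thisHi = Math.max(first.getId().get(), second.getId().get());
    int otherLo = Math.min(o.getFirst().getId().get(), o.getSecond().getId().get());
    int otherHi = Math.max(o.getFirst().getId().get(), o.getSecond().getId().get());
    int cmp = Integer.compare(thisLo, otherLo);
    return cmp != 0 ? cmp : Integer.compare(thisHi, otherHi);
}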
Why do you generate this kind of records in the mapper?
If "being a friend" is a symmetric relation, you can simply do a mapper-only job with this logic (pseudo-code):
for (int i = 0; i < values.length; ++i)
    for (int j = 0; j < values.length; ++j)
        if (i == j)
            continue
        emit (values[i], values[j]), key
Update:
If this is not symmetric (which means that "xyz has friend abc" does not follow from "abc has friend xyz"), then we need reversed records:
Mapper:
for (int i = 0; i < values.length; ++i)
    emit values[i], key
Reducer (the same nested loop as the mapper in the symmetric case above):
for (int i = 0; i < values.length; ++i)
    for (int j = 0; j < values.length; ++j)
        if (i == j)
            continue
        emit (values[i], values[j]), key
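In Java, that reducer might look roughly like this (a sketch reusing the Friend/FriendPair writables from the question; the values are copied because Hadoop reuses the Writable instance while iterating):
@Override
protected void reduce(Friend friend, Iterable<Friend> vals,
        Reducer<Friend, Friend, FriendPair, Friend>.Context context) throws IOException, InterruptedException {
    // materialize copies of the values: Hadoop reuses the same Friend object on each iteration
    List<Friend> people = new ArrayList<>();
    for (Friend f : vals) {
        Friend copy = new Friend();
        copy.setId(new IntWritable(f.getId().get()));
        copy.setName(new Text(f.getName()));
        people.add(copy);
    }
    for (int i = 0; i < people.size(); i++) {
        for (int j = 0; j < people.size(); j++) {
            if (i == j) {
                continue;
            }
            // people.get(i) and people.get(j) both list "friend", so it is mutual between them
            context.write(new FriendPair(people.get(i), people.get(j)), friend);
        }
    }
}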
Update2:
Let's see how this algorithm works with your example.
The result of the mapper:
xyz -> abc
def -> abc
cxf -> abc
xyz -> cxf
abc -> cxf
yyy -> cxf
MapReduce will group these values by key, so the input of the reducer is:
xyz -> [abc,cxf]
def -> [abc]
cxf -> [abc]
abc -> [cxf]
yyy -> [cxf]
In the reducer we do a nested loop over the values, but skip comparing an element with itself. Result:
(abc, cxf) -> xyz
This is what we want to get.

printing content into Browser after reading xml file

The XML file I have is:
<xml>
<ticket>
<team>A</team>
<imp>I1</imp>
</ticket>
<ticket>
<team>A</team>
<imp>I2</imp>
</ticket>
<ticket>
<team>b</team>
<imp>I2</imp>
</ticket>
<ticket>
<team>A</team>
<imp>I1</imp>
</ticket>
<ticket>
<team>B</team>
<imp>I2</imp>
</ticket>
<ticket>
<team>c</team>
<imp>I1</imp>
</ticket>
</xml>
ticketcount.java
package com.asn.model;
import java.awt.Desktop;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import com.asn.model.ticket;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.asn.model.ticket;
public class ticketcount {
public static void main(String[] args) {
List<ticket> ticketList = new ArrayList<ticket>();
try {
String Path = "C:\\Users\\";
File fXmlFile = new File(Path + "\\ticket.xml");
// File fXmlFile = new File(App.class.getClassLoader().getResource("C:\\Users\\tickets.xml").getFile());
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList ticketNodeList = doc.getElementsByTagName("ticket");
for (int temp = 0; temp < ticketNodeList.getLength(); temp++) {
Node varNode = ticketNodeList.item(temp);
if (varNode.getNodeType() == Node.ELEMENT_NODE) {
Element eElement = (Element) varNode;
NodeList teamList = eElement.getElementsByTagName("team");
NodeList varsionList = eElement.getElementsByTagName("imp");
Node teamNode = teamList.item(0);
Node impNode = varsionList.item(0);
if (teamNode.getNodeType() == Node.ELEMENT_NODE && impNode.getNodeType() ==
Node.ELEMENT_NODE) {
Element teamElement = (Element) teamNode;
Element impElement = (Element) impNode;
ticket ticket = new ticket(teamElement.getTextContent(),
impElement.getTextContent());
ticketList.add(ticket);
}
}
String content1 =
"<HTML><HEAD></HEAD><TABLE><tr><td>Chain</td><td><b>Priority</b></td><td></td><td>count</td></tr>";
String content = "";
File file = new File(Path + "\\result1.html");
if (!file.exists()) {
file.createNewFile();
}
FileWriter fw = new FileWriter(file.getAbsoluteFile());
BufferedWriter bw = new BufferedWriter(fw);
}
} catch (Exception e) {
e.printStackTrace();
}
Map<ticket, Integer> count = new HashMap<ticket, Integer>();
for (ticket c : ticketList)
if (!count.containsKey(c))
count.put(c, Collections.frequency(ticketList, c));
List<String> imps = getimps(count);
List<String> teams = getteams(count);
StringBuilder heading = new StringBuilder("ticket \t| ");
for (String s : imps) {
heading.append(s);
heading.append("\t| ");
}
System.out.println(heading);
System.out.println("---------------------------------");
for (String m : teams) {
System.out.println(m + "\t| " + getNumOfteams(m, imps, count));
content = content + "<tr><td>" + m + "</td><td>" + getNumOfteams(m, imps, count) + "</td></tr>";
}
bw.write(content1 + content + "</TABLE></HTML>");
bw.close();
Runtime rTime = Runtime.getRuntime();
String url = Path + "result.html";
//String url = "C:\\Users\\a561922\\Desktop\\TEST.html";//"D:/hi.html";
String browser = "C:/Program Files/Internet Explorer/iexplore.exe ";
File htmlFile = new File(url);
Desktop.getDesktop().browse(htmlFile.toURI());
// Process pc = rTime.exec(browser + url);
// pc.waitFor();
//Runtime.getRuntime().exec("C:\\Users\\a561922\\Desktop\\TEST.html");
}
private static List<String> getteams(Map<ticket, Integer> count) {
List<String> teams = new ArrayList<String>();
for (Map.Entry<ticket, Integer> ent : count.entrySet())
if (!teams.contains(ent.getKey().getteam()))
teams.add(ent.getKey().getteam());
return teams;
}
private static String getNumOfteams(String team, List<String> imps, Map<ticket, Integer>
count) {
StringBuilder builder = new StringBuilder();
for (String v : imps) {
Integer cnt = count.get(new ticket(team, v));
if (cnt == null) {
cnt = 0;
}
builder.append(cnt + "\t");
}
return builder.toString();
}
private static List<String> getimps(Map<ticket, Integer> count) {
List<String> imps = new ArrayList<String>();
for (Map.Entry<ticket, Integer> ent : count.entrySet())
if (!imps.contains(ent.getKey().getimp()))
imps.add(ent.getKey().getimp());
return imps;
}
}
ticket.java
package com.asn.model;
public class ticket {
private String team;
private String imp;
public ticket(String team, String imp) {
super();
this.team = team;
this.imp = imp;
}
public String getteam() {
return team;
}
public void setteam(String team) {
this.team = team;
}
public String getimp() {
return imp;
}
public void setimp(String imp) {
this.imp = imp;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((team == null) ? 0 : team.hashCode());
result = prime * result + ((imp == null) ? 0 : imp.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ticket other = (ticket) obj;
if (team == null) {
if (other.team != null)
return false;
} else if (!team.equals(other.team))
return false;
if (imp == null) {
if (other.imp != null)
return false;
} else if (!imp.equals(other.imp))
return false;
return true;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("incident [team=");
builder.append(team);
builder.append(", imp=");
builder.append(imp);
builder.append("]");
return builder.toString();
}
}
You can do it like this. Change Path according to your setup.
package com.asn.model;
import java.awt.Desktop;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class ticketcount {
public static void main(String[] args) throws IOException {
List<ticket> ticketList = new ArrayList<ticket>();
String content = "";
String content1 ="<HTML><HEAD></HEAD><TABLE border=3>";
FileWriter fw =null;
BufferedWriter bw=null;
String Path = "src";
try {
File fXmlFile = new File(Path + "\\file.xml");
DocumentBuilderFactory dbFactory = DocumentBuilderFactory
.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(fXmlFile);
doc.getDocumentElement().normalize();
NodeList ticketNodeList = doc.getElementsByTagName("ticket");
for (int temp = 0; temp < ticketNodeList.getLength(); temp++) {
Node varNode = ticketNodeList.item(temp);
if (varNode.getNodeType() == Node.ELEMENT_NODE) {
Element eElement = (Element) varNode;
NodeList teamList = eElement.getElementsByTagName("team");
NodeList varsionList = eElement.getElementsByTagName("imp");
Node teamNode = teamList.item(0);
Node impNode = varsionList.item(0);
if (teamNode.getNodeType() == Node.ELEMENT_NODE
&& impNode.getNodeType() ==
Node.ELEMENT_NODE) {
Element teamElement = (Element) teamNode;
Element impElement = (Element) impNode;
ticket ticket = new ticket(
teamElement.getTextContent(),
impElement.getTextContent());
ticketList.add(ticket);
}
}
File file = new File(Path + "\\result1.html");
if (!file.exists()) {
file.createNewFile();
}
fw = new FileWriter(file.getAbsoluteFile());
bw = new BufferedWriter(fw);
}
} catch (Exception e) {
e.printStackTrace();
}
Map<ticket, Integer> count = new HashMap<ticket, Integer>();
for (ticket c : ticketList)
if (!count.containsKey(c))
count.put(c, Collections.frequency(ticketList, c));
List<String> imps = getimps(count);
List<String> teams = getteams(count);
content=content+"<tr><th>ticket</th> ";
for (String s : imps) {
content=content+"<th>"+s+"</th>";
}
content=content+"</tr>";
System.out.println("---------------------------------");
for (String m : teams) {
System.out.println(m + "\t| " + getNumOfteams(m, imps, count));
content = content + "<tr><td>" + m + "</td>"
+ getNumOfteams(m, imps, count) + "</tr>";
}
bw.write(content1 + content + "</TABLE></HTML>");
bw.close();
Runtime rTime = Runtime.getRuntime();
String url = Path + "//result1.html";
// String url = "C:\\Users\\a561922\\Desktop\\TEST.html";//"D:/hi.html";
String browser = "C:/Program Files/Internet Explorer/iexplore.exe ";
File htmlFile = new File(url);
System.out.println(url);
Desktop.getDesktop().browse(htmlFile.toURI());
// Process pc = rTime.exec(browser + url);
// pc.waitFor();
// Runtime.getRuntime().exec("C:\\Users\\a561922\\Desktop\\TEST.html");
}
private static List<String> getteams(Map<ticket, Integer> count) {
List<String> teams = new ArrayList<String>();
for (Map.Entry<ticket, Integer> ent : count.entrySet())
if (!teams.contains(ent.getKey().getteam()))
teams.add(ent.getKey().getteam());
return teams;
}
private static String getNumOfteams(String team, List<String> imps,
Map<ticket, Integer>
count) {
StringBuilder builder = new StringBuilder();
for (String v : imps) {
Integer cnt = count.get(new ticket(team, v));
if (cnt == null) {
cnt = 0;
}
builder.append("<td>"+cnt + "</td>");
}
return builder.toString();
}
private static List<String> getimps(Map<ticket, Integer> count) {
List<String> imps = new ArrayList<String>();
for (Map.Entry<ticket, Integer> ent : count.entrySet())
if (!imps.contains(ent.getKey().getimp()))
imps.add(ent.getKey().getimp());
return imps;
}
}
Now the output:
I think this is the output you want. Let me know if you face any issues.
