Porting code from Lucene to Elasticsearch - Java

I have the following simple code that I want to port from Lucene 6.5.x to Elasticsearch 5.3.x.
However, the scores are different, and I want to get the same scores as in Lucene.
For example, the idf:
Lucene's docFreq is 3 (3 docs contain the term "d") and docCount is 4 (documents with this field). Elasticsearch reports docFreq 1 and docCount 2 (or 1 and 1). I am not sure how these values relate to each other in Elasticsearch...
The other difference in scoring is the avgFieldLength:
Lucene is correct with 14 / 4 = 3.5. Elasticsearch shows a different value for each scored document, but this should be the same for all documents...
Can you please tell me which settings/mapping I missed in Elasticsearch to make it score like Lucene?
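For reference, a quick sketch of the statistics I expect (my own calculation, assuming Lucene 6.x's default BM25Similarity formula, not output from either system):
// Documents: "b c d d d", "b c d d", "b c d", "b c"
long docFreq = 3;                               // docs containing the term "d"
long docCount = 4;                              // docs that have the "title" field
double avgFieldLength = (5 + 4 + 3 + 2) / 4.0;  // 14 / 4 = 3.5
// idf as printed in the Lucene explain output:
double idf = Math.log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5));  // ~0.3567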
IndexingExample.java:
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Field;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
public class IndexingExample {
private static final String INDEX_DIR = "/tmp/lucene6idx";
private IndexWriter createWriter() throws IOException {
FSDirectory dir = FSDirectory.open(Paths.get(INDEX_DIR));
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
return new IndexWriter(dir, config);
}
private List<Document> createDocs() {
List<Document> docs = new ArrayList<>();
FieldType summaryType = new FieldType();
summaryType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
summaryType.setStored(true);
summaryType.setTokenized(true);
Document doc1 = new Document();
doc1.add(new Field("title", "b c d d d", summaryType));
docs.add(doc1);
Document doc2 = new Document();
doc2.add(new Field("title", "b c d d", summaryType));
docs.add(doc2);
Document doc3 = new Document();
doc3.add(new Field("title", "b c d", summaryType));
docs.add(doc3);
Document doc4 = new Document();
doc4.add(new Field("title", "b c", summaryType));
docs.add(doc4);
return docs;
}
private IndexSearcher createSearcher() throws IOException {
Directory dir = FSDirectory.open(Paths.get(INDEX_DIR));
IndexReader reader = DirectoryReader.open(dir);
return new IndexSearcher(reader);
}
public static void main(String[] args) throws IOException, ParseException {
// indexing
IndexingExample app = new IndexingExample();
IndexWriter writer = app.createWriter();
writer.deleteAll();
List<Document> docs = app.createDocs();
writer.addDocuments(docs);
writer.commit();
writer.close();
// search
IndexSearcher searcher = app.createSearcher();
Query q1 = new TermQuery(new Term("title", "d"));
TopDocs hits = searcher.search(q1, 20);
System.out.println(hits.totalHits + " docs found for the query \"" + q1.toString() + "\"");
int num = 0;
for (ScoreDoc sd : hits.scoreDocs) {
Explanation expl = searcher.explain(q1, sd.doc);
System.out.println(expl);
}
}
}
Elasticsearch:
DELETE twitter
PUT twitter/tweet/1
{
"title" : "b c d d d"
}
PUT twitter/tweet/2
{
"title" : "b c d d"
}
PUT twitter/tweet/3
{
"title" : "b c d"
}
PUT twitter/tweet/4
{
"title" : "b c"
}
POST /twitter/tweet/_search
{
"explain": true,
"query": {
"term" : {
"title" : "d"
}
}
}

Problem solved with the help of jimczy:
Don't forget that ES creates an index with 5 shards by default and
that docFreq and docCount are computed per shard. You can create an
index with 1 shard or use the dfs mode to compute distributed stats:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-type.html#dfs-query-then-fetch
This search query (dfs_query_then_fetch) worked as expected:
POST /twitter/tweet/_search?search_type=dfs_query_then_fetch
{
"explain": true,
"query": {
"term" : {
"title" : "d"
}
}
}
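Alternatively, a minimal sketch of the single-shard option mentioned above (recreate the index with this setting before indexing the documents; the mapping is left at the defaults):
DELETE twitter
PUT twitter
{
"settings": {
"index": {
"number_of_shards": 1
}
}
}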

Related

How to keep this code repeating more than once

My code pulls the links and adds them to the HashSet. I want each new link to replace the original link and the process to repeat until no more new links can be found. The program keeps running, but the link isn't updating and the program gets stuck in an infinite loop doing nothing. How do I get the link to update so the program repeats until no more links can be found?
package downloader;
import java.io.IOException;
import java.net.URL;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Stage2 {
public static void main(String[] args) throws IOException {
int q = 0;
int w = 0;
HashSet<String> chapters = new HashSet();
String seen = new String("/manga/manabi-ikiru-wa-fuufu-no-tsutome/i1778063/v1/c1");
String source = new String("https://mangapark.net" + seen);
// 0123456789
while( q == w ) {
String source2 = new String(source.substring(21));
String last = new String(source.substring(source.length() - 12));
String last2 = new String(source.substring(source.length() - 1));
chapters.add(seen);
for (String link : findLinks(source)) {
if(link.contains("/manga") && !link.contains(last) && link.contains("/i") && link.contains("/c") && !chapters.contains(link)) {
chapters.add(link);
System.out.println(link);
seen = link;
System.out.print(chapters);
System.out.println(seen);
}
}
}
System.out.print(chapters);
}
private static Set<String> findLinks(String url) throws IOException {
Set<String> links = new HashSet<>();
Document doc = Jsoup.connect(url)
.data("query", "Java")
.userAgent("Mozilla")
.cookie("auth", "token")
.timeout(3000)
.get();
Elements elements = doc.select("a[href]");
for (Element element : elements) {
links.add(element.attr("href"));
}
return links;
}
}
Your program didn't stop because your while condition never changes:
while( q == w )
is always true. I ran your code without the while and got 2 links printed twice(!), and the program stopped.
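For illustration, here is a rough sketch (my own, reusing your findLinks method; it needs java.util.Deque and java.util.ArrayDeque in the imports) of a worklist loop that stops once no unvisited links are left:
// Sketch/assumption: crawl with an explicit queue instead of while(q == w),
// so the loop ends when no new links are discovered.
Set<String> chapters = new HashSet<>();
Deque<String> toVisit = new ArrayDeque<>();
toVisit.add("https://mangapark.net/manga/manabi-ikiru-wa-fuufu-no-tsutome/i1778063/v1/c1");
while (!toVisit.isEmpty()) {
    String current = toVisit.poll();
    if (!chapters.add(current)) {
        continue; // already visited
    }
    for (String link : findLinks(current)) {
        if (link.contains("/manga") && link.contains("/i") && link.contains("/c")) {
            toVisit.add("https://mangapark.net" + link); // hrefs are relative, prepend the host
        }
    }
}
System.out.println(chapters);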
If you want the links to the other chapters, you have the same problem as me. In the element
Element element = doc.getElementById("sel_book_1");
the links come after the pseudo-element ::before, so they will not be in your Jsoup Document.
Here is my question on this topic:
How can I find a HTML tag with the pseudoElement ::before in jsoup

MongoDB's reduce-phase is not working as expected

I worked through a Java tutorial on mapReduce programming in MongoDB and ended up with the following code:
package mapReduceExample;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MapReduceCommand;
import com.mongodb.MapReduceOutput;
import com.mongodb.Mongo;
public class MapReduceExampleMain {
/**
* @param args
*/
public static void main(String[] args) {
Mongo mongo;
try {
mongo = new Mongo("localhost", 27017);
DB db = mongo.getDB("library");
DBCollection books = db.getCollection("books");
BasicDBObject book = new BasicDBObject();
book.put("name", "Understanding JAVA");
book.put("pages", 100);
books.insert(book);
book = new BasicDBObject();
book.put("name", "Understanding JSON");
book.put("pages", 200);
books.insert(book);
book = new BasicDBObject();
book.put("name", "Understanding XML");
book.put("pages", 300);
books.insert(book);
book = new BasicDBObject();
book.put("name", "Understanding Web Services");
book.put("pages", 400);
books.insert(book);
book = new BasicDBObject();
book.put("name", "Understanding Axis2");
book.put("pages", 150);
books.insert(book);
String map = "function()"
+ "{ "
+ "var category; "
+ "if ( this.pages > 100 ) category = 'Big Books'; "
+ "else category = 'Small Books'; "
+ "emit(category, {name: this.name});"
+ "}";
String reduce = "function(key, values)"
+ "{"
+ "return {books: values.length};"
+ "} ";
MapReduceCommand cmd = new MapReduceCommand(books, map, reduce,
null, MapReduceCommand.OutputType.INLINE, null);
MapReduceOutput out = books.mapReduce(cmd);
for (DBObject o : out.results()) {
System.out.println(o.toString());
}
// clean up
db.dropDatabase();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
This is a pretty simple reduce phase, but it does not do what I want :(
The output is:
{ "_id" : "Big Books" , "value" : { "books" : 4.0}}
{ "_id" : "Small Books" , "value" : { "name" : "Understanding JAVA"}}
I would expect this:
{ "_id" : "Big Books" , "value" : { "books" : 4.0}}
{ "_id" : "Small Books" , "value" : { "books" : 1.0}}
Why does the reduce phase not return values.length in the case of a small book?
Greetings, Andre
Because if there is only one result for a key, the reduce is never run. Change it to be a finalize function or something.
A basic understanding of how mapReduce works
Let us introduce the concepts of mapReduce:
mapper - This is the stage that emits the data to be fed into the reduce stage. It requires a key and a value to be sent. You can emit several times in a mapper if you want, but the requirements stay the same.
reducer - A reducer is only called when there is more than one value for a given key, to process the list of values that have been emitted for that key.
That said, since the mapper emitted only one value for that key, your reducer was not called.
You can clean this up in finalize, but the behavior of a single emit from the mapper going straight through to the output is by design.
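As an illustration of that point (my own sketch, not from the tutorial): if the mapper emits values with the same shape the reducer returns, a key that only receives a single emit still produces the expected output even though reduce is skipped for it:
// Sketch: emit the same shape the reducer returns ({books: n}),
// so single-value keys look right without a reduce call.
String map = "function()"
+ "{ "
+ "var category; "
+ "if ( this.pages > 100 ) category = 'Big Books'; "
+ "else category = 'Small Books'; "
+ "emit(category, {books: 1});"
+ "}";
String reduce = "function(key, values)"
+ "{"
+ "var total = 0;"
+ "values.forEach(function(v) { total += v.books; });"
+ "return {books: total};"
+ "} ";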

Lucene: how to get all terms for a given doc by docNr, without storing the data or the TermVector (Luke is able to show this - how?)

In my code example I create three documents in a Lucene index.
Two of them do not store the field LASTNAME but have a stored term vector; one has neither.
With Luke I am able to iterate through all terms in this field (LASTNAME).
In my code example I iterate through the TermFreqVectors, which works fine for documents with stored TermVectors.
How can I get all of these non-stored terms? How does Luke do that?
My original problem is that I want to extend a big index (60 GB, nearly 100 fields) with another field without re-creating the index from scratch, because with our DB setup that takes a couple of days even with 40 parallel computing servers.
It is very fast to read all the data from the index and just add this new field to all stored documents.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
public class TestDocTerms extends LuceneTestCase {
public void testDocTerms() throws IOException, ParseException {
Analyzer analyzer = new MockAnalyzer(random);
String fieldF = "FIRSTNAME";
String fieldL = "LASTNAME";
// To store an index on disk, use this instead:
Directory directory = NIOFSDirectory.open(new File("/tmp/_index_tester/"));
RandomIndexWriter iwriter = new RandomIndexWriter(random, directory, analyzer);
iwriter.w.setInfoStream(VERBOSE ? System.out : null);
Document doc = new Document();
doc.add(newField(fieldF, "Alex", Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField(fieldL, "Miller", Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES));
iwriter.addDocument(doc);
doc = new Document();
doc.add(newField(fieldF, "Chris", Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField(fieldL, "Smith", Field.Store.NO, Field.Index.ANALYZED));
iwriter.addDocument(doc);
doc = new Document();
doc.add(newField(fieldF, "Alex", Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField(fieldL, "Beatle", Field.Store.NO, Field.Index.ANALYZED,Field.TermVector.YES));
iwriter.addDocument(doc);
iwriter.close();
// Now search the index:
IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true
QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, fieldF, analyzer);
Query query = parser.parse(fieldF + ":" + "Alex");
TopDocs hits = isearcher.search(query, null, 2);
assertEquals(2, hits.totalHits);
// Iterate through the results:
for (int i = 0; i < hits.scoreDocs.length; i++) {
Document hitDoc = isearcher.doc(hits.scoreDocs[i].doc);
assertEquals("Alex", hitDoc.get(fieldF));
System.out.println("query for:" +query.toString()+ " with this results firstN:" + hitDoc.get(fieldF) + " and lastN:" + hitDoc.get(fieldL));
}
parser = new QueryParser(TEST_VERSION_CURRENT, fieldL, analyzer);
query = parser.parse(fieldL + ":" + "Miller");
hits = isearcher.search(query, null, 2);
assertEquals(1, hits.totalHits);
// Iterate through the results:
for (int i = 0; i < hits.scoreDocs.length; i++) {
Document hitDoc = isearcher.doc(hits.scoreDocs[i].doc);
assertEquals("Alex", hitDoc.get(fieldF));
System.out.println("query for:" + query.toString() + " with this results firstN:" +hitDoc.get(fieldF)+ " and lastN:" +hitDoc.get(fieldL));
}
isearcher.close();
// examine terms
IndexReader ireader = IndexReader.open(directory, true); // read-only=true
int numDocs = ireader.numDocs();
for (int i = 0; i < numDocs; i++) {
doc = ireader.document(i);
System.out.println("docNum:" + i + " with:" + doc.toString());
TermFreqVector t = ireader.getTermFreqVector(i, fieldL);
if (t != null){
System.out.println("Field:" + fieldL + " contains terms:" + t.toString());
}
TermFreqVector[] termFreqVectors = ireader.getTermFreqVectors(i);
if (termFreqVectors != null){
for (TermFreqVector tfv : termFreqVectors){
String[] terms = tfv.getTerms();
String field = tfv.getField();
System.out.println("Field:" +field+ " contains terms:" + Arrays.toString(terms));
}
}
}
ireader.close();
}
}
Reconstructing unstored documents is necessarily a best effort. You can't generally reverse changes made to the value by the analyzer.
When TermVectors are not available, Luke enumerates the terms associated with the field. This may not respect the ordering of the terms, or any formatting. That may be neither here nor there, though. I don't know what your newField method does exactly, but I suspect its default is not Field.TermVector.NO.
If you want to know more about the implementation details, I would grab the Luke source code and read org.getopt.luke.DocReconstructor.
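For illustration, a rough sketch (Lucene 3.x API, not the actual Luke code; it uses Term, TermEnum and TermDocs from org.apache.lucene.index) of reconstructing the terms of a non-stored field for one document by walking the field's term dictionary and checking each term's postings:
// Sketch: enumerate all terms of the field and test whether each one
// occurs in the target document. Order and formatting are lost.
int targetDoc = 0; // hypothetical document number
TermEnum termEnum = ireader.terms(new Term(fieldL, ""));
try {
    do {
        Term term = termEnum.term();
        if (term == null || !fieldL.equals(term.field())) {
            break; // left the LASTNAME term dictionary
        }
        TermDocs termDocs = ireader.termDocs(term);
        while (termDocs.next()) {
            if (termDocs.doc() == targetDoc) {
                System.out.println("docNum:" + targetDoc + " contains term: " + term.text());
            }
        }
        termDocs.close();
    } while (termEnum.next());
} finally {
    termEnum.close();
}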

Search for '$' using RegexQuery (NOT any other) in a Lucene index

I have the following program:
public class RegexQueryExample {
public static String[] terms = {
"US $65M dollars",
"USA",
"$35",
"355",
"US $33",
"U.S.A",
"John Keates",
"Tom Dick Harry",
"Southeast' Asia"
};
private static Directory directory;
public static void main(String[] args) throws CorruptIndexException, IOException {
String searchString = ".*\\$.*";
createIndex();
searchRegexIndex(searchString);
}
/**
* Creates an in-memory index for the sample terms.
*/
private static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
directory = new RAMDirectory();
IndexWriter indexWriter = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
for (String term : terms) {
Document document = new Document();
if (term.indexOf('$') >= 0) {
document.add(new Field("type", "currency", Field.Store.YES, Field.Index.NOT_ANALYZED));
} else {
document.add(new Field("type", "simple_field", Field.Store.YES, Field.Index.NOT_ANALYZED));
}
document.add(new Field("term", term, Field.Store.YES, Field.Index.NOT_ANALYZED));
indexWriter.addDocument(document);
}
indexWriter.close();
}
/**
* Searches the index for terms matching a regular expression.
*
* @param regexString the regular expression to search for.
*/
private static void searchRegexIndex(String regexString) throws CorruptIndexException, IOException {
IndexSearcher searcher = new IndexSearcher(directory);
RegexQuery rquery = new RegexQuery(new Term("term", regexString));
BooleanQuery queryin = new BooleanQuery();
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("type", "simple_field")), BooleanClause.Occur.MUST);
query.add(rquery, BooleanClause.Occur.MUST);
TopDocs hits = searcher.search(query, terms.length);
ScoreDoc[] alldocs = hits.scoreDocs;
for (int i = 0; i < alldocs.length; i++) {
Document d = searcher.doc(alldocs[i].doc);
System.out.println((i + 1) + ". " + d.get("term"));
}
}
}
The createIndex() function creates the Lucene index, while searchRegexIndex() performs a regex query. In the main() function I search for .*\\$.*, expecting it to return the terms containing the $ sign. But it did not work. How do I make it work? Is this a problem with the Analyzer?
Edit:
My Lucene index snapshot from Luke:
You are using StandardAnalyzer, which removes the dollar signs from the tokens. E.g. "US $65M dollars" becomes three tokens: "us", "65m", "dollars". You need to use another analyzer that does not remove the dollar signs. Luke provides an excellent analyzer tool in which you can try out different analyzers and check their outputs.
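For example, a minimal sketch (my assumption, Lucene 3.0) of switching the index to WhitespaceAnalyzer, which splits only on whitespace and keeps the '$' characters, so "US $65M dollars" stays "US", "$65M", "dollars":
// Sketch: WhitespaceAnalyzer does not strip '$' or lowercase the tokens.
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriter indexWriter = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);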

Why does the Apache Mahout frequent pattern mining algorithm return only 1-item itemsets?

I'm currently testing Apache Mahout's parallel frequent pattern mining. Before using it in the real project, I started with a simple piece of code, just to be sure it works as I expect...
I did not find a complete example with code, data, and output.
I currently have a compiling and running version (see the Java / Scala code below), but the returned frequent patterns contain only one item each (see the sample output below).
Is this the intended behavior?
What did I do wrong?
Thanks for your help...
Scala code:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
import java.util.HashSet
import org.apache.mahout.common.iterator.StringRecordIterator
import org.apache.mahout.common.iterator.FileLineIterable
import org.apache.mahout.fpm.pfpgrowth.convertors._
import org.apache.mahout.fpm.pfpgrowth.convertors.integer._
import org.apache.mahout.fpm.pfpgrowth.convertors.string._
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater
import org.apache.hadoop.mapred.OutputCollector
import scala.collection.JavaConversions._
import java.util.{ List => JList }
import org.apache.mahout.common.{ Pair => JPair }
import java.lang.{ Long => JLong }
import org.apache.hadoop.io.{ Text => JText }
val minSupport = 5L
val k: Int = 50
val fps: FPGrowth[String] = new FPGrowth[String]()
val milk = "milk"
val bread = "bread"
val butter = "butter"
val bier = "bier"
val transactionStream: Iterator[JPair[JList[String], JLong]] = Iterator(
new JPair(List(milk, bread), 10L),
new JPair(List(butter), 10L),
new JPair(List(bier), 10L),
new JPair(List(milk, bread, butter), 5L),
new JPair(List(milk, bread, bier), 5L),
new JPair(List(bread), 10L)
)
val frequencies: Collection[JPair[String, JLong]] = fps.generateFList(
transactionStream, minSupport.toInt)
println("freqList :" + frequencies)
var returnableFeatures: Collection[String] = List(
milk, bread, butter, bier)
var output: OutputCollector[String, JList[JPair[JList[String], JLong]]] = (
new OutputCollector[String, JList[JPair[JList[String], JLong]]] {
def collect(x1: String,
x2: JList[JPair[JList[String], JLong]]) = {
println(x1 + ":" +
x2.map(pair => "[" + pair.getFirst.mkString(",") + "] : " +
pair.getSecond).mkString("; "))
}
}
)
val updater: StatusUpdater = new StatusUpdater {
def update(status: String) = println("updater : " + status)
}
fps.generateTopKFrequentPatterns(
transactionStream,
frequencies,
minSupport,
k,
null, //returnableFeatures
output,
updater)
Java code:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
transactions, (int) minSupport);
System.out.println("freqList :" + frequencies);
Collection<String> returnableFeatures =
Arrays.asList(milk, bread, butter, bier);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status){
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
transactions,
frequencies,
minSupport,
k,
null, //returnableFeatures
output,
updater);
}catch (Exception e){
e.printStackTrace();
}
}
}
Sample output:
freqList :[(bread,4), (milk,4), (bier,2), (butter,2)]
17:48:19,108 INFO ~ Number of unique items 4
17:48:19,109 INFO ~ Number of unique pruned items 4
17:48:19,121 INFO ~ Number of Nodes in the FP Tree: 0
17:48:19,122 INFO ~ Mining FTree Tree for all patterns with 3
updater :FPGrowth Algorithm for a given feature: 3
butter:[butter]:2
17:48:19,130 INFO ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO ~ Mining FTree Tree for all patterns with 2
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
bier:[bier]:2
17:48:19,130 INFO ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO ~ Mining FTree Tree for all patterns with 1
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
milk:[milk]:4
17:48:19,131 INFO ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO ~ Mining FTree Tree for all patterns with 0
updater :FPGrowth Algorithm for a given feature: 0
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
bread:[bread]:4
17:48:19,131 INFO ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO ~ Tree Cache: First Level: Cache hits=6 Cache Misses=4
The code is buggy: the iterator over the transactions is consumed first to compute the frequencies, and is then passed again to the FP-Growth algorithm. The problem is that this second pass returns no values, because the iterator has already reached its end...
For reference, here is the corrected Java code:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
public static void main(String[] args) {
long minSupport = 1L;
int k = 50;
FPGrowth<String> fps = new FPGrowth<String>();
String milk = "milk";
String bread = "bread";
String butter = "butter";
String bier = "bier";
LinkedList<Pair<List<String>, Long>> data =
new LinkedList<Pair<List<String>, Long>>();
data.add(new Pair(Arrays.asList(milk, bread), 1L));
data.add(new Pair(Arrays.asList(butter), 1L));
data.add(new Pair(Arrays.asList(bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
data.add(new Pair(Arrays.asList(milk, bread), 1L));
// This lines is removed...
// Iterator<Pair<List<String>, Long>> transactions = data.iterator();
Collection<Pair<String, Long>> frequencies = fps.generateFList(
data.iterator(), // use an iterator here...
(int) minSupport);
System.out.println("freqList :" + frequencies);
OutputCollector<String, List<Pair<List<String>, Long>>> output =
new OutputCollector<String, List<Pair<List<String>, Long>>>() {
@Override
public void collect(String x1,
List<Pair<List<String>, Long>> listPair)
throws IOException {
StringBuffer sb = new StringBuffer();
sb.append(x1 + ":");
for (Pair<List<String>, Long> pair : listPair) {
sb.append("[");
String sep = "";
for (String item : pair.getFirst()) {
sb.append(item + sep);
sep = ", ";
}
sb.append("]:" + pair.getSecond());
}
System.out.println(" " + sb.toString());
}
};
StatusUpdater updater = new StatusUpdater() {
public void update(String status) {
System.out.println("updater :" + status);
}
};
try {
fps.generateTopKFrequentPatterns(
// changed here (previously : transactions)
data.iterator(), // use a "fresh" iterator
frequencies,
minSupport,
k,
null,
output,
updater);
} catch (Exception e) {
e.printStackTrace();
}
}
}
