Java Lucene 4.5 how to search by case insensitive - java

We have implemented a search engine with Java Lucene 4.5, and I am trying to make searches on field values case insensitive (e.g., if I search for a city named "Banglore" I get a result, but when I search for "banglore" I get 0 results).
I have used StandardAnalyzer for analyzing the data and WildcardQuery to match a Like condition (I tried as mentioned here without success).
I am not sure where I have gone wrong. I appreciate any guidance on fixing this case sensitivity problem.
public SearchHelper
{
Analyzer analyzer;
Directory index;
public IndexSearcher searcher = null;
public IndexWriter indexWriter = null;
public QueryParser parser = null;
private static int hitsPerPage = 100;
/**
* #param indexFileLocation
* #throws IOException
*/
public SearchHelper(String indexFileLocation) throws IOException
{
// this.analyzer =new StandardAnalyzer();
this.analyzer = new CaseStandardAnalyzer();
// analyzer = new ThaiAnalyzer();
this.index = FSDirectory.open(java.nio.file.Paths.get(indexFileLocation));
}
/**
 * Returns the cached index writer, creating it on first use with this helper's
 * analyzer and index directory.
 *
 * @param create currently not consulted by this method
 * @return the writer held in {@code indexWriter}
 * @throws IOException if the writer cannot be opened
 */
public IndexWriter getIndexWriter(boolean create) throws IOException
{
    if (this.indexWriter != null)
    {
        return this.indexWriter;
    }
    this.indexWriter = new IndexWriter(this.index, new IndexWriterConfig(this.analyzer));
    return this.indexWriter;
}
/**
 * Commits pending changes and closes the cached index writer, if there is one.
 * The cached reference is cleared so that the next {@code getIndexWriter} call
 * opens a fresh writer instead of handing out the closed one.
 *
 * @throws IOException if committing or closing fails
 */
public void closeIndexWriter() throws IOException
{
    IndexWriter writer = this.indexWriter;
    if (writer == null)
    {
        return;
    }
    // Drop the reference first: a closed writer must never be reused.
    this.indexWriter = null;
    // try-with-resources closes the writer even when commit() throws.
    try (IndexWriter closing = writer)
    {
        closing.commit();
    }
}
/**
 * Opens a reader on the index at the given location and installs a new
 * {@code IndexSearcher} on top of it.
 *
 * @param indexFileLocation path of the directory that holds the Lucene index
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if the index cannot be read
 */
public void startSearch(String indexFileLocation) throws CorruptIndexException, IOException
{
    java.nio.file.Path location = java.nio.file.Paths.get(indexFileLocation);
    Directory directory = FSDirectory.open(location);
    this.searcher = new IndexSearcher(DirectoryReader.open(directory));
}
/**
 * Builds one wildcard query per field/value pair, combines them as optional
 * (SHOULD) clauses and returns the top hits.
 *
 * <p>See https://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser</p>
 *
 * @param fieldNames names of the fields to search; {@code null} means no clauses
 * @param fieldValues search values, read at the same indexes as {@code fieldNames}
 * @param limitSize maximum number of hits; the default page size is used when not positive
 * @return the score docs of the top hits
 * @throws IOException if the index cannot be read
 * @throws ParseException declared for callers; propagated from the query builder
 */
public ScoreDoc[] searchSEO(String[] fieldNames, String[] fieldValues, int limitSize) throws IOException, ParseException
{
    this.analyzer = new StandardAnalyzer();
    BooleanQuery combined = new BooleanQuery();
    if (fieldNames != null)
    {
        for (int idx = 0; idx < fieldNames.length; idx++)
        {
            Query wildcard = searchIndexWithWildcardQuery(fieldNames[idx], fieldValues[idx]);
            addQueries(combined, wildcard, 2);
        }
    }
    // Fall back to the default page size when no positive limit was requested.
    int maxHits = (limitSize > 0) ? limitSize : hitsPerPage;
    TopScoreDocCollector collector = TopScoreDocCollector.create(maxHits);
    this.searcher.search(combined, collector);
    return collector.topDocs().scoreDocs;
}
/**
 * Creates a "contains" query: the search string wrapped in {@code *} on both sides.
 * The term is passed to the wildcard query exactly as given.
 *
 * @param whichField name of the field to search
 * @param searchString text that must occur somewhere in the field value
 * @return the wildcard query
 * @throws IOException declared for callers
 * @throws ParseException declared for callers
 */
public Query searchIndexWithWildcardQuery(String whichField, String searchString) throws IOException, ParseException
{
    String pattern = "*" + searchString + "*";
    return new WildcardQuery(addTerm(whichField, pattern));
}
/**
 * Wraps a field name and a value in a Lucene {@code Term}.
 *
 * @param whichField name of the field
 * @param searchString term text
 * @return the new term
 */
public Term addTerm(String whichField, String searchString)
{
    return new Term(whichField, searchString);
}
/**
 * Parses the search string with the configured parser after selecting the
 * default boolean operator.
 *
 * @param searchString query text to parse
 * @param operation {@code "and"} or {@code "or"}; any other value leaves the
 *        parser's current default operator untouched
 * @return the parsed query
 * @throws ParseException if the search string cannot be parsed
 */
public Query addConditionOpertaion(String searchString, String operation) throws ParseException
{
    if ("and".equals(operation))
    {
        parser.setDefaultOperator(QueryParser.AND_OPERATOR);
    }
    else if ("or".equals(operation))
    {
        // Previously this branch also selected AND_OPERATOR, so "or" behaved like "and".
        parser.setDefaultOperator(QueryParser.OR_OPERATOR);
    }
    return parser.parse(searchString);
}
/**
 * Adds a clause to the boolean query.
 *
 * @param booleanQuery <code>BooleanQuery</code> that receives the clause
 * @param q <code>Query</code> to add
 * @param type <code>int</code>: 1 --> Must, 2 --> Should, anything else --> Must Not
 */
public void addQueries(BooleanQuery booleanQuery, Query q, int type)
{
    Occur occur;
    if (type == 1)
    {
        occur = Occur.MUST;
    }
    else if (type == 2)
    {
        occur = Occur.SHOULD;
    }
    else
    {
        occur = Occur.MUST_NOT;
    }
    booleanQuery.add(q, occur);
}
/**
 * @return the query parser currently held by this helper, possibly {@code null}
 */
public QueryParser getParser()
{
    return this.parser;
}
/**
 * Replaces the query parser with a new one for the given default field,
 * using this helper's current analyzer.
 *
 * @param fieldName default field of the new parser
 */
public void setParser(String fieldName)
{
    QueryParser created = new QueryParser(fieldName, this.analyzer);
    this.parser = created;
}
/**
 * Switches this helper to a {@code StandardAnalyzer} and a parser whose default
 * field is "status".
 *
 * @param status not read by this method
 */
public void getDefaultByStatus(int status)
{
    StandardAnalyzer statusAnalyzer = new StandardAnalyzer();
    this.analyzer = statusAnalyzer;
    this.parser = new QueryParser("status", statusAnalyzer);
}
/**
 * Deletes the entries of a directory. Sub-directories are emptied first when
 * {@code deleteSubDir} is set; every entry is then passed to {@code File.delete()}.
 *
 * @param dir directory whose entries are removed
 * @param deleteSubDir whether to descend into sub-directories
 */
protected void doClear(File dir, boolean deleteSubDir)
{
    File[] entries = dir.listFiles();
    if (entries == null)
    {
        // listFiles() returns null when dir is not a directory or cannot be read.
        return;
    }
    for (File entry : entries)
    {
        if (entry.isDirectory() && deleteSubDir)
        {
            doClear(entry, deleteSubDir);
        }
        entry.delete();
    }
}
/**
 * Closes the index reader behind the current searcher.
 *
 * @throws IOException if closing the reader fails
 */
protected void doClose() throws IOException
{
    IndexReader reader = this.searcher.getIndexReader();
    reader.close();
}
/**
 * Indexes a user as a new document, then commits and closes the writer.
 * All text fields are stored as {@code StringField}s.
 *
 * @param Obj the {@code User} to index
 * @return {@code true} once the document has been written
 * @throws Exception if the cast or the index update fails
 */
public boolean add(Object Obj) throws Exception
{
    User user = (User) Obj;
    Field[] fields = {
        new IntField("oid", user.getOid(), Field.Store.YES),
        new IntField("status", user.getStatus(), Field.Store.YES),
        new StringField("login", user.getLogin(), Field.Store.YES),
        new StringField("fName", user.getFirstName(), Field.Store.YES),
        new StringField("lName", user.getLastName(), Field.Store.NO),
        new StringField("email", user.getEmailId(), Field.Store.YES),
        new StringField("city", user.getCity(), Field.Store.YES)
    };
    org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
    for (Field field : fields)
    {
        document.add(field);
    }
    getIndexWriter(false).addDocument(document);
    closeIndexWriter();
    boolean added = true;
    System.out.println(added);
    return added;
}
/**
 * Replaces the document whose "login" term matches the user's login with a
 * freshly built document, then commits and closes the writer.
 *
 * @param Obj the {@code User} to re-index
 * @return {@code true} once the document has been replaced
 * @throws Exception if the cast or the index update fails
 */
public boolean update(Object Obj) throws Exception
{
    final User user = (User) Obj;
    final org.apache.lucene.document.Document replacement = new org.apache.lucene.document.Document();
    replacement.add(new IntField("oid", user.getOid(), Field.Store.YES));
    replacement.add(new StringField("login", user.getLogin(), Field.Store.YES));
    replacement.add(new IntField("status", user.getStatus(), Field.Store.YES));
    replacement.add(new StringField("fName", user.getFirstName(), Field.Store.YES));
    replacement.add(new StringField("lName", user.getLastName(), Field.Store.NO));
    replacement.add(new StringField("email", user.getEmailId(), Field.Store.YES));
    replacement.add(new StringField("city", user.getCity(), Field.Store.YES));
    final IndexWriter indexer = getIndexWriter(false);
    final Term loginKey = new Term("login", user.getLogin());
    indexer.updateDocument(loginKey, replacement);
    closeIndexWriter();
    return true;
}
/**
 * Deletes every document whose "login" term matches the user's login, merges
 * the deletions away, then commits and closes the writer.
 *
 * @param Obj the {@code User} to remove from the index
 * @return {@code true} once the deletion has been written
 * @throws Exception if the cast or the index update fails
 */
public boolean delete(Object Obj) throws Exception
{
    User target = (User) Obj;
    Term loginKey = new Term("login", target.getLogin());
    IndexWriter indexer = getIndexWriter(false);
    indexer.deleteDocuments(loginKey);
    indexer.forceMergeDeletes();
    closeIndexWriter();
    return true;
}
#Override
public Object search(String[] fieldNames, String[] fieldValues, int returnType, int limit) throws Exception
{
Object obj = null;
org.apache.lucene.search.ScoreDoc[] hits = searchSEO(fieldNames,fieldValues, limit);
int hitSize = (null == hits) ? 0 : hits.length;
System.out.println("total:" + hitSize);
doClose();
return obj;
} // End of search
/**
 * Indexes the hard-coded sample user.
 *
 * @throws IllegalStateException if the user cannot be added to the index
 */
public void addThreadUser()
{
    User user = new User();
    addUserPojo(user);
    try
    {
        add(user);
    }
    catch (Exception e)
    {
        // add() declares a checked Exception that this method does not declare.
        throw new IllegalStateException("Failed to add the sample user to the index", e);
    }
}
/**
 * Re-indexes the hard-coded sample user.
 *
 * @throws IllegalStateException if the user cannot be updated in the index
 */
public void updateThreadUser()
{
    User user = new User();
    addUserPojo(user);
    try
    {
        update(user);
    }
    catch (Exception e)
    {
        // update() declares a checked Exception that this method does not declare.
        throw new IllegalStateException("Failed to update the sample user in the index", e);
    }
}
/**
 * Removes the hard-coded sample user from the index.
 *
 * @throws IllegalStateException if the user cannot be deleted from the index
 */
public void deleteThreadUser()
{
    User user = new User();
    addUserPojo(user);
    try
    {
        delete(user);
    }
    catch (Exception e)
    {
        // delete() declares a checked Exception that this method does not declare.
        throw new IllegalStateException("Failed to delete the sample user from the index", e);
    }
}
/**
 * Fills the given user with fixed sample values (oid 3, login "senthil", ...).
 *
 * @param user the object to populate
 */
private void addUserPojo(User user)
{
    final int sampleOid = 3;
    final int sampleStatus = 1;
    final String sampleName = "Semthil";
    user.setOid(sampleOid);
    user.setLogin("senthil");
    user.setFirstName(sampleName);
    user.setLastName(sampleName);
    user.setStatus(sampleStatus);
    user.setCity("Combiatore");
    user.setEmailId("semthil#xyz.com");
}
/**
 * Convenience search: looks for the value "Se" in the "login" field.
 * NOTE(review): this delegates to a three-argument searchUser overload that is
 * not defined in the visible part of this class -- confirm it exists.
 */
public void searchUser()
{
searchUser(new String[] {"login"}, new String[] {"Se"}, null);
}
/**
 * Demo entry point: creates a helper and runs the sample user search.
 * NOTE(review): the only constructor visible in this class takes the index
 * location as a String -- confirm a no-argument constructor exists.
 */
public static void main(String[] args)
{
SearchHelper test = new SearchHelper();
test.searchUser();
}
}

You are using StringField to index your data, but this field type bypasses the analyzer chain and always indexes your term verbatim as one token, regardless of your analyzer. You should use TextField if you want your data to be analyzed; the StandardAnalyzer already does lower-casing.
Other than that, the WildcardQuery does not analyze its term, so if you search for Banglore, it won't match the now-lower-case banglore from the index. You have to lowercase the searchterm yourself (or use an analyzer on it).

Use the LowerCaseFilter as the post you referenced suggests:
TokenStream stream = new StandardFilter(Version.LUCENE_CURRENT, tokenizer);
stream = new LowerCaseFilter(Version.LUCENE_CURRENT, stream);
A more complete example is in this post.

You can use a custom comparator class:
class CaseIgonreCompare extends FieldComparator<String>{
private String field;
private String bottom;
private String topValue;
private BinaryDocValues cache;
private String[] values;
public CaseIgonreCompare(String field, int numHits) {
this.field = field;
this.values = new String[numHits];
}
#Override
public int compare(int arg0, int arg1) {
return compareValues(values[arg0], values[arg1]);
}
#Override
public int compareBottom(int arg0) throws IOException {
return compareValues(bottom, cache.get(arg0).utf8ToString());
}
#Override
public int compareTop(int arg0) throws IOException {
return compareValues(topValue, cache.get(arg0).utf8ToString());
}
public int compareValues(String first, String second) {
int val = first.length() - second.length();
return val == 0 ? first.compareToIgnoreCase(second) : val;
};
#Override
public void copy(int arg0, int arg1) throws IOException {
values[arg0] = cache.get(arg1).utf8ToString();
}
#Override
public void setBottom(int arg0) {
this.bottom = values[arg0];
}
#Override
public FieldComparator<String> setNextReader(AtomicReaderContext arg0)
throws IOException {
this.cache = FieldCache.DEFAULT.getTerms(arg0.reader(),
field , true);
return this;
}
#Override
public void setTopValue(String arg0) {
this.topValue = arg0;
}
#Override
public String value(int arg0) {
return values[arg0];
}
}

Related

SAXParser doesn't retrieves XML data from URL

My implemented SAXParser class which uses URL address to process XML data does not returns the result. The class uses additional Currency class which in turn stores two variables currId and rate with setters/getters. When I run my class nothing shows up in java console. Here is the code:
public class MySAXParser extends DefaultHandler {
private static List<Currencies> currencies = new ArrayList<Currencies>();
private static Currencies curr = null;
private static String text = null;
public static void main(String[] args) {
String url = "http://nbt.tj/en/kurs/export_xml.php?date=2016-08-01&export=xmlout";
try {
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser sp = spf.newSAXParser();
MySAXParser handler = new MySAXParser();
URL uri = new URL(url);
sp.parse(new InputSource(uri.openStream()), handler);
} catch (Exception ex) {
ex.printStackTrace();
}
for (Currencies curr : currencies) {
System.out.println(curr.toString());
}
}
public void startElement (String s, String s1, String elementName, Attributes atts) throws SAXException {
if (elementName.equalsIgnoreCase("valute")) {
curr = new Currencies();
curr.setCurrId(atts.getValue("id"));
}
}
public void endElement (String s, String s1, String element) throws SAXException {
if (element.equals("valute")) {
currencies.add(curr);
}
if (element.equalsIgnoreCase("value")) {
curr.setRate(Double.parseDouble(text));
}
}
#Override
public void characters (char[] ch, int start, int length) throws SAXException {
text = String.copyValueOf(ch, start, length).trim();
}
}
So, what I missed or doing wrong? Any help would be appreciated.
Here is my attempt that works fine with Java 1.8:
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class MySAXParser extends DefaultHandler {
private List<Currency> currencies = new ArrayList<>();
private Currency curr = null;
private StringBuilder sb;
public static void main(String[] args) {
String url = "http://nbt.tj/en/kurs/export_xml.php?date=2016-08-01&export=xmlout";
try {
SAXParserFactory spf = SAXParserFactory.newInstance();
spf.setNamespaceAware(true);
SAXParser sp = spf.newSAXParser();
MySAXParser handler = new MySAXParser();
sp.parse(new InputSource(url), handler);
for (Currency curr : handler.getCurrencies()) {
System.out.println(curr.toString());
}
} catch (Exception ex) {
ex.printStackTrace();
}
}
public List<Currency> getCurrencies() {
return currencies;
}
#Override
public void startElement(String s, String localName, String elementName, Attributes atts) throws SAXException {
if (elementName.equalsIgnoreCase("valute")) {
curr = new Currency();
currencies.add(curr);
curr.setCurrId(atts.getValue("ID"));
} else if (elementName.equalsIgnoreCase("value") || elementName.equalsIgnoreCase("CharCode")) {
sb = new StringBuilder();
}
}
#Override
public void endElement(String s, String localName, String elementName) throws SAXException {
if (elementName.equalsIgnoreCase("value")) {
curr.setRate(Double.parseDouble(sb.toString()));
sb = null;
}
else if (elementName.equalsIgnoreCase("CharCode")) {
curr.setCharCode(sb.toString());
sb = null;
}
}
#Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (sb != null) {
sb.append(ch, start, length);
}
}
}
The class is
/**
 * Mutable value holder for one exchange-rate entry: numeric id, rate and
 * three-letter character code.
 */
public class Currency {
    private String currId;
    private double rate;
    private String charCode;

    /**
     * Get the value of currId
     *
     * @return the value of currId
     */
    public String getCurrId() {
        return currId;
    }

    /**
     * Set the value of currId
     *
     * @param currId new value of currId
     */
    public void setCurrId(String currId) {
        this.currId = currId;
    }

    /**
     * Get the value of rate
     *
     * @return the value of rate
     */
    public double getRate() {
        return rate;
    }

    /**
     * Set the value of rate
     *
     * @param rate new value of rate
     */
    public void setRate(double rate) {
        this.rate = rate;
    }

    /**
     * Get the value of charCode
     *
     * @return the value of charCode
     */
    public String getCharCode() {
        return charCode;
    }

    /**
     * Set the value of charCode
     *
     * @param charCode new value of charCode
     */
    public void setCharCode(String charCode) {
        this.charCode = charCode;
    }

    /**
     * @return a string of the form {@code Currency{currId=..., rate=..., charCode=...}}
     */
    @Override
    public String toString() {
        return "Currency{" + "currId=" + currId + ", rate=" + rate + ", charCode=" + charCode + '}';
    }
}
A sample output I get is
Currency{currId=840, rate=7.8683, charCode=USD}
Currency{currId=978, rate=8.7448, charCode=EUR}
Currency{currId=960, rate=10.9395, charCode=XDR}
Currency{currId=156, rate=1.1828, charCode=CNY}
Currency{currId=756, rate=8.075, charCode=CHF}
Currency{currId=810, rate=0.1146, charCode=RUB}
Currency{currId=860, rate=0.2655, charCode=UZS}
Currency{currId=417, rate=1.1643, charCode=KGS}
Currency{currId=398, rate=0.2234, charCode=KZT}
Currency{currId=933, rate=3.9424, charCode=BYR}
Currency{currId=364, rate=0.2617, charCode=IRR}
Currency{currId=971, rate=1.139, charCode=AFN}
Currency{currId=586, rate=0.7504, charCode=PKR}
Currency{currId=949, rate=2.6076, charCode=TRY}
Currency{currId=934, rate=2.2481, charCode=TMT}
Currency{currId=826, rate=10.3618, charCode=GBP}
Currency{currId=36, rate=5.9162, charCode=AUD}
Currency{currId=208, rate=1.1755, charCode=DKK}
Currency{currId=352, rate=0.659, charCode=ISK}
Currency{currId=124, rate=5.9699, charCode=CAD}
Currency{currId=414, rate=26.004, charCode=KWD}
Currency{currId=578, rate=0.9193, charCode=NOK}
Currency{currId=702, rate=5.8215, charCode=SGD}
Currency{currId=752, rate=0.9136, charCode=SEK}
Currency{currId=392, rate=0.761, charCode=JPY}
Currency{currId=944, rate=4.9639, charCode=AZN}
Currency{currId=51, rate=1.6516, charCode=AMD}
Currency{currId=981, rate=3.3539, charCode=GEL}
Currency{currId=498, rate=0.3979, charCode=MDL}
Currency{currId=980, rate=0.317, charCode=UAH}
Currency{currId=784, rate=2.1421, charCode=AED}
Currency{currId=682, rate=2.0979, charCode=SAR}
Currency{currId=356, rate=1.175, charCode=INR}
Currency{currId=985, rate=2.0039, charCode=PLN}
Currency{currId=458, rate=1.9313, charCode=MYR}
Currency{currId=764, rate=0.2258, charCode=THB}

Facing some strange issue while bookmarking the paragraph

public class BookmarkAdd extends AbstractSample {
public static JAXBContext context = org.docx4j.jaxb.Context.jc;
/**
* #param args
*/
#SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
String inputfilepath = "Chapter_3.docx";
File file = new java.io.File(inputfilepath);
WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(new java.io.File(inputfilepath));
MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart();
String outputfilepath = System.getProperty("user.dir")+"/ 1.docx";
ClassFinder finder = new ClassFinder(P.class); // <----- change this to suit
new TraversalUtil(documentPart.getContent(), finder);
int Counter = 0;
System.out.println(finder.results.size());
for (Object o : finder.results)
{
P para =(P)o;
String name = "para" + Counter;
bookmarkPara(para, 0, para.getParagraphContent().size(), name, Counter);
Counter++;
SaveToZipFile saver = new SaveToZipFile(wordMLPackage);
saver.save(outputfilepath);
// wordMLPackage.save(new java.io.File(inputfilepath));
}
}
/**
 * Surrounds part of the content of the specified paragraph with a bookmark
 * (with specified name and id).
 *
 * @param p paragraph to modify
 * @param StartIndex content index at which the bookmark start is inserted
 * @param EndIndex content index at which the bookmark end is inserted
 * @param name bookmark name
 * @param id bookmark id, shared by the start and end markers
 */
public static void bookmarkPara(P p, int StartIndex,int EndIndex, String name, int id) {
    ObjectFactory wml = Context.getWmlObjectFactory();
    BigInteger bookmarkId = BigInteger.valueOf(id);

    // The end marker goes in first so that inserting the start marker
    // afterwards does not shift the end position.
    CTMarkupRange endRange = wml.createCTMarkupRange();
    endRange.setId(bookmarkId);
    JAXBElement<CTMarkupRange> endMarker = wml.createBodyBookmarkEnd(endRange);
    p.getParagraphContent().add(EndIndex, endMarker); // from 2.7.0, use getContent()

    CTBookmark startRange = wml.createCTBookmark();
    startRange.setId(bookmarkId);
    startRange.setName(name);
    JAXBElement<CTBookmark> startMarker = wml.createBodyBookmarkStart(startRange);
    p.getParagraphContent().add(StartIndex, startMarker);
}
/**
 * Recursively collects every object of exactly the given class, unwrapping
 * {@code JAXBElement}s and descending into {@code ContentAccessor} children.
 * A matching object is added without searching its own children.
 *
 * @param obj root object to inspect
 * @param toSearch exact class to look for
 * @return the matches in traversal order
 */
public static List<Object> getAllElementFromObject(Object obj, Class<?> toSearch) {
    Object target = (obj instanceof JAXBElement) ? ((JAXBElement<?>) obj).getValue() : obj;
    List<Object> matches = new ArrayList<Object>();
    if (target.getClass().equals(toSearch)) {
        matches.add(target);
        return matches;
    }
    if (target instanceof ContentAccessor) {
        for (Object child : ((ContentAccessor) target).getContent()) {
            matches.addAll(getAllElementFromObject(child, toSearch));
        }
    }
    return matches;
}
}
Using this code I bookmark each paragraph as para0 to paraN. The code works fine for most documents, but for two of my docx files the bookmarking fails, and I don't know why it shows the following error.
java.lang.IllegalArgumentException: obj parameter must not be null
at javax.xml.bind.helpers.AbstractMarshallerImpl.checkNotNull(Unknown Source)
at javax.xml.bind.helpers.AbstractMarshallerImpl.marshal(Unknown Source)
at org.docx4j.openpackaging.parts.JaxbXmlPart.marshal(JaxbXmlPart.java:361)
at org.docx4j.openpackaging.parts.JaxbXmlPart.marshal(JaxbXmlPart.java:330)
at org.docx4j.openpackaging.io.SaveToZipFile.saveRawXmlPart(SaveToZipFile.java:249)
at org.docx4j.openpackaging.io.SaveToZipFile.saveRawXmlPart(SaveToZipFile.java:198)
at org.docx4j.openpackaging.io.SaveToZipFile.savePart(SaveToZipFile.java:424)
at org.docx4j.openpackaging.io.SaveToZipFile.addPartsFromRelationships(SaveToZipFile.java:387)
at org.docx4j.openpackaging.io.SaveToZipFile.savePart(SaveToZipFile.java:442)
at org.docx4j.openpackaging.io.SaveToZipFile.addPartsFromRelationships(SaveToZipFile.java:387)
at org.docx4j.openpackaging.io.SaveToZipFile.save(SaveToZipFile.java:168)
at org.docx4j.openpackaging.io.SaveToZipFile.save(SaveToZipFile.java:97)
at Backup.BookmarkAdd.main(BookmarkAdd.java:64)
.....

PIG Custom loader's getNext() is being called again and again

I have started working with Apache Pig for one of our projects. I have to create a custom input format to load our data files. For this, I followed this example Hadoop:Custom Input format. I also created my custom RecordReader implementation to read the data (we get our data in binary format from some other application) and parse that to proper JSON format.
The problem occurs when I use my custom loader in Pig script. As soon as my loader's getNext() method is invoked, it calls my custom RecordReader's nextKeyValue() method, which works fine. It reads the data properly, passes it back to my loader which parses the data and returns a Tuple. So far so good.
The problem arises when my loader's getNext() method is called again and again. It gets called, works fine, and returns the proper output (I debugged it till return statement). But then, instead of letting the execution go further, my loader gets called again. I tried to see the number of times my loader is called, and I could see the number go till 20K!
Can somebody please help me understand the problem in my code?
Loader
public class SimpleTextLoaderCustomFormat extends LoadFunc {
protected RecordReader in = null;
private byte fieldDel = '\t';
private ArrayList<Object> mProtoTuple = null;
private TupleFactory mTupleFactory = TupleFactory.getInstance();
#Override
public Tuple getNext() throws IOException {
Tuple t = null;
try {
boolean notDone = in.nextKeyValue();
if (!notDone) {
return null;
}
String value = (String) in.getCurrentValue();
byte[] buf = value.getBytes();
int len = value.length();
int start = 0;
for (int i = 0; i < len; i++) {
if (buf[i] == fieldDel) {
readField(buf, start, i);
start = i + 1;
}
}
// pick up the last field
readField(buf, start, len);
t = mTupleFactory.newTupleNoCopy(mProtoTuple);
mProtoTuple = null;
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
e.printStackTrace();
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
}
return t;
}
/**
 * Appends one field to the tuple under construction: {@code null} for an
 * empty range, otherwise a {@code DataByteArray} built from buf, start and end.
 */
private void readField(byte[] buf, int start, int end) {
    if (mProtoTuple == null) {
        mProtoTuple = new ArrayList<Object>();
    }
    boolean emptyField = (start == end);
    mProtoTuple.add(emptyField ? null : new DataByteArray(buf, start, end));
}
#Override
public InputFormat getInputFormat() throws IOException {
//return new TextInputFormat();
return new CustomStringInputFormat();
}
#Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
}
#Override
public void prepareToRead(RecordReader reader, PigSplit split)
throws IOException {
in = reader;
}
Custom InputFormat
public class CustomStringInputFormat extends FileInputFormat<String, String> {
#Override
public RecordReader<String, String> createRecordReader(InputSplit arg0,
TaskAttemptContext arg1) throws IOException, InterruptedException {
return new CustomStringInputRecordReader();
}
}
Custom RecordReader
public class CustomStringInputRecordReader extends RecordReader<String, String> {
private String fileName = null;
private String data = null;
private Path file = null;
private Configuration jc = null;
private static int count = 0;
#Override
public void close() throws IOException {
// jc = null;
// file = null;
}
#Override
public String getCurrentKey() throws IOException, InterruptedException {
return fileName;
}
#Override
public String getCurrentValue() throws IOException, InterruptedException {
return data;
}
#Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
#Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) genericSplit;
file = split.getPath();
jc = context.getConfiguration();
}
#Override
public boolean nextKeyValue() throws IOException, InterruptedException {
InputStream is = FileSystem.get(jc).open(file);
StringWriter writer = new StringWriter();
IOUtils.copy(is, writer, "UTF-8");
data = writer.toString();
fileName = file.getName();
writer.close();
is.close();
System.out.println("Count : " + ++count);
return true;
}
}
Try this in Loader
//....
boolean notDone = ((CustomStringInputFormat)in).nextKeyValue();
//...
Text value = new Text(((CustomStringInputFormat))in.getCurrentValue().toString())

Android XML (RSS) Ignores quotation marks (")

I'm having a problem with SAX XML parser.
It Does parse everything, except quotation marks (").
For example, if the text is hell"3o in a node, the result is hell.
Here are my codes:
XML Handler:
public class MyXMLHandler extends DefaultHandler {
Boolean currentElement = false;
String currentValue = null;
public static SitesList sitesList = null;
public static SitesList getSitesList() {
return sitesList;
}
public static void setSitesList(SitesList sitesList) {
MyXMLHandler.sitesList = sitesList;
}
/** Called when tag starts ( ex:- <name>AndroidPeople</name>
* -- <name> )*/
#Override
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
currentElement = true;
if (localName.equals("channel"))
{
/** Start */
sitesList = new SitesList();
} else if (localName.equals("item")) {
String attr=attributes.getValue("item");
sitesList.setItem(attr);
} else if (localName.equals("title")) {
/** Get attribute value */
String attr = attributes.getValue("title");
sitesList.setTitle(attr);
}
else if (localName.equals("link")) {
/** Get attribute value */
String attr = attributes.getValue("link");
sitesList.setLink(attr);
}
else if (localName.equals("description")) {
/** Get attribute value */
String attr = attributes.getValue("description");
sitesList.setDescription(attr);
}
else if (localName.equalsIgnoreCase("pubDate")) {
/** Get attribute value */
String attr = attributes.getValue("pubDate");
sitesList.setPubDate(attr);
}
}
/** Called when tag closing ( ex:- <name>AndroidPeople</name>
* -- </name> )*/
#Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
currentElement = false;
/** set value */
if (localName.equalsIgnoreCase("item"))
sitesList.setItem(currentValue);
else if (localName.equalsIgnoreCase("title"))
sitesList.setTitle(currentValue);
else if (localName.equalsIgnoreCase("link"))
sitesList.setLink(currentValue);
else if (localName.equalsIgnoreCase("description"))
sitesList.setDescription(currentValue);
else if (localName.equalsIgnoreCase("pubDate"))
sitesList.setPubDate(currentValue);
}
/** Called to get tag characters ( ex:- <name>AndroidPeople</name>
* -- to get AndroidPeople Character ) */
#Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if (currentElement) {
currentValue = new String(ch, start, length);
currentElement = false;
}
}
}
Getter and Setter:
import java.util.ArrayList;
/** Contains getter and setter method for variables */
/**
 * Collects the values parsed from an RSS feed, one growing list per element
 * kind. Each "setter" appends to its list; each getter returns the live list.
 */
public class SitesList {

    private final ArrayList<String> item = new ArrayList<>();
    private final ArrayList<String> title = new ArrayList<>();
    private final ArrayList<String> link = new ArrayList<>();
    private final ArrayList<String> description = new ArrayList<>();
    private final ArrayList<String> pubDate = new ArrayList<>();

    /** @return the list of collected titles */
    public ArrayList<String> getTitle() {
        return this.title;
    }

    /** Appends one title. */
    public void setTitle(String title) {
        this.title.add(title);
    }

    /** @return the list of collected links */
    public ArrayList<String> getLink() {
        return this.link;
    }

    /** Appends one link. */
    public void setLink(String link) {
        this.link.add(link);
    }

    /** @return the list of collected descriptions */
    public ArrayList<String> getDescription() {
        return this.description;
    }

    /** Appends one description. */
    public void setDescription(String description) {
        this.description.add(description);
    }

    /** @return the list of collected publication dates */
    public ArrayList<String> getPubDate() {
        return pubDate;
    }

    /** Appends one publication date. */
    public void setPubDate(String PubDate) {
        pubDate.add(PubDate);
    }

    /** @return the list of collected item values */
    public ArrayList<String> getItem() {
        return item;
    }

    /** Appends one item value. */
    public void setItem(String item) {
        this.item.add(item);
    }
}
And RSS Thread class:
/**
 * One RSS entry: title, HTML body and a publication date reformatted into a
 * Hebrew "day name, d.m.yyyy, at hh:mm" string.
 */
public class RssThread {
    private static final String[] DAY_ABBREVIATIONS =
            {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
    private static final String[] HEBREW_DAY_NAMES =
            {"יום ראשון", "יום שני", "יום שלישי", "יום רביעי", "יום חמישי", "יום שישי", "יום שבת"};
    private static final String[] MONTH_ABBREVIATIONS =
            {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};

    private String title;
    private String html;
    private String pubDate;

    /**
     * @param title entry title
     * @param html entry body
     * @param pubDate date such as {@code Wed, 28 Sep 2011 11:40:51}; stored reformatted
     */
    public RssThread(String title,String html,String pubDate)
    {
        this.title = title;
        this.html = html;
        this.pubDate = CovertToDate(pubDate);
    }

    /**
     * Reformats a date of the form {@code Wed, 28 Sep 2011 11:40:51}.
     * An unknown day or month abbreviation contributes nothing to the result.
     */
    private String CovertToDate(String pubDate) {
        String dayToken = pubDate.substring(0, pubDate.indexOf(","));
        StringBuilder formatted = new StringBuilder();
        for (int d = 0; d < DAY_ABBREVIATIONS.length; d++) {
            if (DAY_ABBREVIATIONS[d].equals(dayToken)) {
                formatted.append(HEBREW_DAY_NAMES[d]);
                break;
            }
        }
        formatted.append(", ");

        String[] parts = pubDate.split(" ");
        formatted.append(parts[1]).append(".");
        String monthToken = parts[2];
        for (int m = 0; m < MONTH_ABBREVIATIONS.length; m++) {
            if (MONTH_ABBREVIATIONS[m].equals(monthToken)) {
                // Months are written as their 1-based number followed by a dot.
                formatted.append(m + 1).append(".");
                break;
            }
        }
        formatted.append(parts[3]);

        // Keep hours and minutes only: cut at the last colon.
        String clock = parts[4];
        formatted.append(", בשעה ").append(clock.substring(0, clock.lastIndexOf(":")));
        return formatted.toString();
    }

    public String getTitle() {
        return title;
    }

    public String getHTML() {
        return this.html;
    }

    public String getPubDate() {
        return pubDate;
    }
}
I have forgotten to put another class:
/**
 * Downloads the RSS feed, parses it with MyXMLHandler and flattens the result
 * into the static RssString array (one "title~link~pubDate~" string per entry).
 * All of the work happens in the instance initializer block below, so it runs
 * every time an instance is constructed.
 */
public class XMLParsingExample {
/** Flattened feed entries; filled by the initializer block, read via getRSSarray(). */
private static String[] RssString;
/** Create Object For SiteList Class */
SitesList sitesList = null;
/** Per-entry values copied out of sitesList by the initializer block. */
String[] title;
String[] link;
String[] pubDate;
{
try {
/** Handling XML */
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser sp = spf.newSAXParser();
XMLReader xr = sp.getXMLReader();
/** Send URL to parse XML Tags */
URL sourceUrl = new URL(
"http://www.blich.co.il/rss.xml");
/** Create handler to handle XML Tags ( extends DefaultHandler ) */
MyXMLHandler myXMLHandler = new MyXMLHandler();
xr.setContentHandler(myXMLHandler);
xr.parse(new InputSource(sourceUrl.openStream()));
} catch (Exception e) {
// Parse failures are only logged; execution continues with whatever was collected.
System.out.println("XML Pasing Excpetion = " + e);
}
/** Get result from MyXMLHandler SitlesList Object */
// NOTE(review): MyXMLHandler.sitesList starts out null, so the size() calls below
// presumably throw if parsing failed before a channel element was seen -- confirm.
sitesList = MyXMLHandler.sitesList;
/** All three arrays are sized from the title list. */
title = new String[sitesList.getTitle().size()];
link = new String[sitesList.getTitle().size()];
pubDate = new String[sitesList.getTitle().size()];
/** One output string per two item entries, each starting out empty. */
RssString=new String[sitesList.getItem().size()/2];
for (int i=0;i<RssString.length;i++)
RssString[i]="";
// Only odd list indexes are read; counter grows by one per processed index, so
// i-counter yields the consecutive target slots 0, 1, 2, ...
int counter=1;
for (int i = 0; i < sitesList.getItem().size(); i++) {
if (i%2!=0) {
title[i-counter]=sitesList.getTitle().get(i);
if (title[i-counter]!=null)
RssString[i-counter]+=title[i-counter]+"~";
link[i-counter]=sitesList.getLink().get(i);
if (link[i-counter]!=null)
RssString[i-counter]+=link[i-counter]+"~";
pubDate[i-counter]=sitesList.getPubDate().get(i);
if (pubDate[i-counter]!=null)
RssString[i-counter]+=pubDate[i-counter]+"~";
counter++;
}
}
}
/** Returns the flattened entries built by the most recently constructed instance. */
public static String[] getRSSarray() {
return RssString;
}
}
I gave you all the codes so you can see everything.
The characters method can be called several times for a single element. Try looking at what you receive there, and accumulate the values in a StringBuffer.
You can create an HTML object that will convert the HTML codes to the appropriate symbol (i.e. &quot; to "), then convert that back to a String (or a SpannedString if you want to format it)
CharSequence seq = Html.fromHtml(title);
String str = new String(seq);
Maybe what both of you told me would work, but it would be complicated.
I have found an easier and more simple solution:
Other parser.
It uses 1 class (vs the sax parser that uses 3 classes), much much easier to understand, and of course, doesn't ignore quotation marks :D
Thanks anyway.
Have you tried to convert quotation marks to an XML entity, before you parse the element?
Several characters have special XML entity references:
& &amp;
< &lt;
> &gt;
" &quot;
' &apos;

collecting text within <p> from html pages

I have a blog dataset which has a huge number of blog pages, with blog posts, comments and all blog features.
I need to extract only blog post from this collection and store it in a .txt file.
I need to modify this program so that it collects only the blog post text, i.e. the content that starts with <p> and ends with </p>, and ignores all other tags.
Currently I use HTMLParser to do the job, here is what I have so far:
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.MetaTag;
/**
 * Parses a local sample file with HTMLParser and prints the "content"
 * attribute of the first matching node when that node is a meta tag.
 */
public class HTMLParserTest {
    /**
     * Runs the sample. The filter selects nodes that have an attribute named
     * "P"; parser errors are printed to standard error.
     */
    public static void main(String... args) {
        Parser htmlParser = new Parser();
        HasAttributeFilter attributeFilter = new HasAttributeFilter("P");
        try {
            htmlParser.setResource("d://Blogs/asample.txt");
            NodeList matches = htmlParser.parse(attributeFilter);
            Node first = matches.elementAt(0);
            if (!(first instanceof MetaTag)) {
                return;
            }
            String description = ((MetaTag) first).getAttribute("content");
            System.out.println(description);
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }
}
thanks in advance
Provided the HTML is well formed, the following method should do what you need:
/**
 * Extracts the text found inside {@code <p>} elements of an HTML file.
 * Every text fragment is prefixed with a single space, so a non-empty result
 * starts with a space.
 *
 * @param file HTML file to read (decoded with the platform default charset)
 * @return the concatenated paragraph text, or an empty string when there is none
 * @throws IOException if the file cannot be read
 */
private static String extractText(File file) throws IOException {
    final StringBuilder text = new StringBuilder();
    ParserCallback paragraphCollector = new ParserCallback() {
        /** Number of currently open P tags; above zero means "inside a paragraph". */
        private int append = 0;

        @Override
        public void handleText(final char[] data, final int pos) {
            if (append > 0) {
                text.append(' ').append(data);
            }
        }

        @Override
        public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
            if (Tag.P.equals(tag)) {
                append++;
            }
        }

        @Override
        public void handleEndTag(Tag tag, final int pos) {
            if (Tag.P.equals(tag)) {
                append--;
            }
        }

        @Override
        public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }

        @Override
        public void handleComment(final char[] data, final int pos) { }

        @Override
        public void handleError(final java.lang.String errMsg, final int pos) { }
    };
    // try-with-resources closes the reader even when parsing fails.
    try (FileReader reader = new FileReader(file)) {
        new ParserDelegator().parse(reader, paragraphCollector, false);
    }
    return text.toString();
}
EDIT: Change to handle nested P tags.

Categories

Resources