pdfbox getcharacterbyarticle() rendering the vector for last page - java

I am trying to get text details like co-ordinates, width and height using the following code (took up this solution from here), but the output was only the text from the last page.
Code
public static void main( String[] args ) throws IOException {
PDDocument document = null;
String fileName = "apache.pdf"
PDFParser parser = new PDFParser(new FileInputStream(fileName));
parser.parse();
StringWriter outString = new StringWriter();
CustomPDFTextStripper stripper = new CustomPDFTextStripper();
stripper.writeText(parser.getPDDocument(), outString);
Vector<List<TextPosition>> vectorlistoftps = stripper.getCharactersByArticle();
for (int i = 0; i < vectorlistoftps.size(); i++) {
List<TextPosition> tplist = vectorlistoftps.get(i);
for (int j = 0; j < tplist.size(); j++) {
TextPosition text = tplist.get(j);
System.out.println(" String "
+ "[x: " + text.getXDirAdj() + ", y: "
+ text.getY() + ", height:" + text.getHeightDir()
+ ", space: " + text.getWidthOfSpace() + ", width: "
+ text.getWidthDirAdj() + ", yScale: " + text.getYScale() + "]"
+ text.getCharacter() +" Font "+ text.getFont().getBaseFont() + " PageNUm "+ (i+1));
}
}
}
CustomPDFTextStripper class:
class CustomPDFTextStripper extends PDFTextStripper
{
//Vector<Vector<List<TextPosition>>> data = new Vector<Vector<List<TextPosition>>>();
public CustomPDFTextStripper() throws IOException {
super();
}
public Vector<List<TextPosition>> getCharactersByArticle(){
// data.add(charactersByArticle);
return charactersByArticle;
}
}
I tried to add the vectors to a list, but when calling the stripper() it is iterating through all the pages and the last page details are stored in charactersByArticle vector and thus returning the same. How do I get info for all pages??

Temporary Fix:
Changed the main method to set the current page as end page and getting the text info. Not a good idea though.
for (int page = 0; page < pageCount; page++)
{
stripper.setStartPage(0);
stripper.setEndPage(page + 1);
stripper.writeText(parser.getPDDocument(), outString);
Vector vectorlistoftps = stripper.getCharactersByArticle();
PDPage thisPage = stripper.getCurrentPage();
for (int i = 0; i < vectorlistoftps.size(); i++) {
List<TextPosition> tplist = vectorlistoftps.get(i);
}
}

Related

How to convert string of the form {a=2 ,b=5} to JSON of the form {"a":2 , "b":5} in java?

I am getting result in the form {a=2 ,b=5} after spel expression evaluation.
I want to convert it to json.
How do I do it?
Please help!
here is your solution:
public static void main(String[] args) {
String something = "{a=2 ,b=5}";
something = something.replace("{", "");
something = something.replace("}", "");
String[] pairs = something.split(",");
ArrayList<String> list = new ArrayList<String>();
for (String pair : pairs) {
list.add(pair);
}
for (int i = 0; i < list.size(); i++) {
String[] temp = list.get(i).split("=");
temp[0] = "\"" + temp[0] + "\"";
list.set(i, temp[0] + ":" + temp[1]);
}
String contents = "";
for (int i = 0; i < list.size(); i++) {
contents = contents + ", " + list.get(i);
}
contents = contents.replaceFirst(", ", "");
contents = "{" + contents + "}";
System.out.println("Contents: " + contents);
}
And here is your result:
Contents: {"a":2 , "b":5}

Why Lucene query not working for other field like StringField type

for a requirement, I have to store information for many to many relationship.
For each zip code there are N courseIds, for each course code, there are N zip codes.
I am storing information in Lucene version 6.4 like this.
private static void updateDoc(IndexWriter w, String id, String zipCode, String courseId) throws IOException {
Term term = new Term("id", id);
Document doc = new Document();
doc.add(new StringField("id", id, Field.Store.YES));
doc.add(new StringField("zipCode", zipCode, Field.Store.YES));
doc.add(new StringField("courseName", courseId, Field.Store.YES));
w.updateDocument(term, doc);
}
public static void main(String[] args) throws Exception {
File dataDir = new File("E:/local/data/");
IndexWriter indexWriter = getIndexWriter(dataDir);
//Zip Code
for (int i = 0; i < 5; i++) {
String zipCode = String.format("%05d", i);
for (int j = 0; j < 10; j++) {
String courseCode = "Course" + String.format("%03d", j);
String id = getID();
System.out.println(zipCode + "<---->" + courseCode);
updateDoc(indexWriter, id, zipCode, courseCode);
}
}
try {
indexWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
But the problem is while searching, I do not get the hits for Course Ids where I get for zip code.
For the below code , I get the proper result.
String queryStr = "00003";//For zip code only
StandardAnalyzer analyzer = new StandardAnalyzer();
Query q = new QueryParser("zipCode", analyzer).parse(queryStr);
System.out.println(q.toString());
IndexSearcher searcher = getIndexSearcher();
TopDocs docs = searcher.search(q, 10);
System.out.println("Total :::" + docs.totalHits);
ScoreDoc[] hits = docs.scoreDocs;
System.out.println("Found " + hits.length + " hits.");
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + "---" + d.get("id") + " " + d.get("zipCode") + "\t" + d.get("courseName"));
}
But for the below code, I do not get the result, Ideally it should provide the result. I am trying to retrieve the result for course code in the similar fashion which I have done for zip code.
String queryStr = "Course005";//For course code only
StandardAnalyzer analyzer = new StandardAnalyzer();
Query q = new QueryParser("courseName", analyzer).parse(queryStr);
System.out.println(q.toString());
IndexSearcher searcher = getIndexSearcher();
TopDocs docs = searcher.search(q, 10);
System.out.println("Total :::" + docs.totalHits);
ScoreDoc[] hits = docs.scoreDocs;
System.out.println("Found " + hits.length + " hits.");
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
System.out.println((i + 1) + ". " + "---" + d.get("id") + " " + d.get("zipCode") + "\t" + d.get("courseName"));
}
If you look into the above two code snippets, both are same but the field names are different.
Please help me to understand and also let me know why it is not working.
Please help how to achieve the result by passing only query string and the field name.

return array from inside an if,for statement

I am building a tag reader for inventory purpose. Using the for loop to iterate through the tags to count/total the ids. I get an error on my return line "tagsFound cannot be resolved into a variable". How do i use the variable inside the for loop and then access it outside the loop?
public String[] getTags(AlienClass1Reader reader)throws AlienReaderException{
int coneCount = 0;
int drumCount = 0;
// Open a connection to the reader
reader.open();
// Ask the reader to read tags and print them
Tag tagList[] = reader.getTagList();
if (tagList == null) {
System.out.println("No Tags Found");
} else {
System.out.println("Tag(s) found: " + tagList.length);
for (int i=0; i<tagList.length; i++) {
Tag tag = tagList[i];
System.out.println("ID:" + tag.getTagID() +
", Discovered:" + tag.getDiscoverTime() +
", Last Seen:" + tag.getRenewTime() +
", Antenna:" + tag.getAntenna() +
", Reads:" + tag.getRenewCount()
);
//tagFound[i]= "" + tag.getTagID();
String phrase = tag.getTagID();
tagFound[i] = phrase;
String delims = "[ ]+";
String[] tokens = phrase.split(delims);
if (tokens[0].equals("0CCE") && tokens[3].equals("1001")){drumCount++;}
if (tokens[0].equals("0CCE") && tokens[3].equals("1004")){coneCount++;}
String[] tagsFound;
tagsFound[i] = tag.getTagID();
}
System.out.println("Cones= " + coneCount);
System.out.println("Drums= " + drumCount);
// Close the connection
reader.close();
return tagsFound;
}
}
public String[] getTags(AlienClass1Reader reader)throws AlienReaderException{
int coneCount = 0;
int drumCount = 0;
// Open a connection to the reader
reader.open();
// Ask the reader to read tags and print them
Tag tagList[] = reader.getTagList();
if (tagList == null) {
System.out.println("No Tags Found");
} else {
System.out.println("Tag(s) found: " + tagList.length);
String[] tagsFound = new String[tagList.length];
for (int i=0; i<tagList.length; i++) {
tagsFound = "";
Tag tag = tagList[i];
System.out.println("ID:" + tag.getTagID() +
", Discovered:" + tag.getDiscoverTime() +
", Last Seen:" + tag.getRenewTime() +
", Antenna:" + tag.getAntenna() +
", Reads:" + tag.getRenewCount()
);
//tagFound[i]= "" + tag.getTagID();
String phrase = tag.getTagID();
tagFound[i] = phrase;
String delims = "[ ]+";
String[] tokens = phrase.split(delims);
if (tokens[0].equals("0CCE") && tokens[3].equals("1001")){drumCount++;}
if (tokens[0].equals("0CCE") && tokens[3].equals("1004")){coneCount++;}
tagsFound[i] = tag.getTagID();
}
System.out.println("Cones= " + coneCount);
System.out.println("Drums= " + drumCount);
// Close the connection
reader.close();
return tagsFound;
}
}
the returned array will have empty strings in the positions where the tag does not satisfy the criteria.

java.lang.ArrayIndexOutOfBoundsException :

I have a String = "abc model 123 abcd1862893007509396 abcd2862893007509404", if I provide space between abcd1 & number eg. abcd1 862893007509396 my code will work fine, but if there is no space like abcd1862893007509396, I will get java.lang.ArrayIndexOutOfBoundsException, please help ?:
PFB the code :
String text = "";
final String suppliedKeyword = "abc model 123 abcd1862893007509396 abcd2862893007509404";
String[] keywordarray = null;
String[] keywordarray2 = null;
String modelname = "";
String[] strIMEI = null;
if ( StringUtils.containsIgnoreCase( suppliedKeyword,"model")) {
keywordarray = suppliedKeyword.split("(?i)model");
if (StringUtils.containsIgnoreCase(keywordarray[1], "abcd")) {
keywordarray2 = keywordarray[1].split("(?i)abcd");
modelname = keywordarray2[0].trim();
if (keywordarray[1].trim().contains(" ")) {
strIMEI = keywordarray[1].split(" ");
for (int i = 0; i < strIMEI.length; i++) {
if (StringUtils.containsIgnoreCase(strIMEI[i],"abcd")) {
text = text + " " + strIMEI[i] + " "
+ strIMEI[i + 1];
System.out.println(text);
}
}
} else {
text = keywordarray2[1];
}
}
}
After looking at your code the only thing i can consider for cause of error is
if (StringUtils.containsIgnoreCase(strIMEI[i],"abcd")) {
text = text + " " + strIMEI[i] + " "
+ strIMEI[i + 1];
System.out.println(text);
}
You are trying to access strIMEI[i+1] which will throw an error if your last element in strIMEI contains "abcd".

Parse all PDF pages at once with iText

I am trying to parse a pdf file with "iText". What I am trying to achieve is to parse all pages at once.
try {
PdfReader reader = new PdfReader("D:\\hl_sv\\L04MF.pdf");
int pages = reader.getNumberOfPages();
String content = "";
for (int i = 0; i <= pages; i++) {
System.out.println("============PAGE NUMBER " + i + "=============" );
content = content + " " + PdfTextExtractor.getTextFromPage(reader, i);
}
System.out.println(content);
}
I am getting this error:
Exception in thread "main" java.lang.NullPointerException
at com.itextpdf.text.pdf.parser.PdfReaderContentParser.processContent(PdfReaderContentParser.java:77)
at com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage(PdfTextExtractor.java:74)
at com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage(PdfTextExtractor.java:89)
at com.pdf.PDF.main(PDF.java:18)
Other problem I am facing is that the - hyphen is being parsed as ? question mark. How can I fix that?
I appreciate any help.
Edit
It works for me like this but I cant still solve the hyphen bug.
try {
PdfReader reader = new PdfReader("D:\\hl_sv\\L04MF.pdf");
int pages = reader.getNumberOfPages();
for(int i = 1; i<= pages; i++) {
System.out.println("============PAGE NUMBER " + i + "=============" );
String line = PdfTextExtractor.getTextFromPage(reader,i);
System.out.println(line);
}
}
public static String extractPdfText() throws IOException {
PdfReader pdfReader = new PdfReader("/path/to/file/myfile.pdf");
int pages = pdfReader.getNumberOfPages();
String pdfText = "";
for (int ctr = 1; ctr < pages + 1; ctr++) {
pdfText += PdfTextExtractor.getTextFromPage(pdfReader, ctr); // Page number cannot be 0 or will throw NPE
}
pdfReader.close();
return pdfText;
}

Categories

Resources