I am getting a ClassNotFoundException,
even though I have included the fontbox and pdfbox JAR files in my classpath.
package com.KyaHub.action;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import javax.servlet.http.HttpServletRequest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.fontbox.cmap.*;
import org.xml.sax.SAXException;
/**
 * Action-style class that extracts the text and metadata of a PDF file with
 * Apache Tika's {@code PDFParser} and prints both to standard output.
 */
public class PdfParser {
    private HttpServletRequest request;

    /**
     * Parses the hard-coded PDF file and prints its text content followed by
     * every metadata entry.
     *
     * <p>Exceptions raised while parsing or printing are printed and otherwise
     * ignored, so the method still returns {@code "success"} in that case. A
     * failure to open the file is not caught and propagates to the caller.
     *
     * @return always {@code "success"}
     * @throws IOException if the file cannot be opened or the stream cannot be closed
     * @throws TikaException declared for callers; parse failures are caught internally
     * @throws SAXException declared for callers; parse failures are caught internally
     */
    public String execute() throws IOException, TikaException, SAXException {
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // try-with-resources guarantees the stream is closed; the inner try/catch keeps
        // the original contract that only parse-time failures are swallowed.
        try (FileInputStream inputstream = new FileInputStream(
                new File("C:/Users/admin/Downloads/cmp_column_width_example.pdf"))) {
            try {
                // parsing the document using PDF parser
                PDFParser pdfparser = new PDFParser();
                pdfparser.parse(inputstream, handler, metadata, pcontext);
                // getting the content of the document
                System.out.println("Contents of the PDF :" + handler.toString());
                // getting metadata of the document
                System.out.println("Metadata of the PDF:");
                for (String name : metadata.names()) {
                    System.out.println(name + " : " + metadata.get(name));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return "success";
    }

    /** @return the servlet request previously stored with {@link #setRequest} */
    public HttpServletRequest getRequest() {
        return request;
    }

    /** @param request the servlet request to store on this instance */
    public void setRequest(HttpServletRequest request) {
        this.request = request;
    }
}
Whenever I set the file name to APJ.AbdulKalamAzad.pdf I get the expected output, but when I change it to another PDF file I get the exception mentioned above.
Related
While running the following code, which reads an XML file and generates a corresponding PDF, I am getting the errors shown below the code.
package com.test.pdf;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import com.google.zxing.WriterException;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.pdf.PdfDocument;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.xmp.impl.Base64;
//import com.itextpdf.text.pdf.codec.Base64;
import org.apache.log4j.Logger;
/**
 * Generates the "TestReport" PDF with iText and returns it as a Base64 string.
 */
public class PDFGenerator {
    final static Logger logger = Logger.getLogger(PDFGenerator.class);
    private static final String TITLE = "TestReport";
    public static final String PDF_EXTENSION = ".pdf";
    public static String arg1 = "";
    public static String arg2 = "";
    public static String arg3 = "";

    /**
     * Renders the report into a temporary PDF file and returns that file's
     * content Base64-encoded.
     *
     * @param arg1 passed to {@code PDFCreator.addMetaData} together with {@code arg2}
     * @param arg2 passed to the {@code HeaderFooter} page event, {@code addMetaData}
     *             and {@code addTitlePage}
     * @param arg3 not used by this method
     * @return the Base64 text of the generated PDF, or {@code null} when generation failed
     *         (the failure is printed, not rethrown)
     * @throws IOException declared for callers; failures inside the method are caught
     * @throws URISyntaxException declared for callers
     * @throws com.lowagie.text.DocumentException declared for callers
     * @throws WriterException declared for callers
     */
    public static String createPDFBase64(String arg1, String arg2, String arg3)
            throws IOException, URISyntaxException, com.lowagie.text.DocumentException, WriterException {
        String out = null;
        File temp = null;
        try {
            Document document = new Document(PageSize.LETTER);
            System.out.println("Here i amn777");
            temp = File.createTempFile(TITLE, PDF_EXTENSION);
            // try-with-resources releases the file handle even when PDF creation fails midway.
            try (FileOutputStream pdfStream = new FileOutputStream(temp)) {
                PdfWriter writer = PdfWriter.getInstance(document, pdfStream);
                HeaderFooter event = new HeaderFooter(arg2);
                event.setHeader("Test Report");
                writer.setPageEvent(event);
                document.open();
                PDFCreator pdfCreator = new PDFCreator();
                pdfCreator.addMetaData(document, arg1, arg2);
                pdfCreator.addTitlePage(document, arg2);
                // The document must be closed before the file is read back.
                document.close();
                byte[] inFileBytes = Files.readAllBytes(temp.toPath());
                out = new String(Base64.encode(inFileBytes), "UTF-8");
                System.out.println("Clear cache..");
                pdfCreator.xmlData.clear();
                pdfCreator.dataObjMRCList.clear();
                pdfCreator.dataObjNRCList.clear();
                pdfCreator.dataObjVASList.clear();
                pdfCreator.dataObjDEVList.clear();
                pdfCreator.stcPhoneNumDispList.clear();
                pdfCreator.stcSvcMap.clear();
                pdfCreator.stcSvcBandWthMap.clear();
                pdfCreator.invoiceTaxDvcMap.clear();
            }
        } catch (FileNotFoundException e) {
            System.out.println("FileNotFoundException occurs.." + e.getMessage());
            e.printStackTrace();
        } catch (DocumentException e) {
            System.out.println("DocumentException occurs.." + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.out.println("Exception occurs.." + e.getMessage());
            e.printStackTrace();
            return null;
        } finally {
            // The temp file is only an intermediate; remove it so repeated calls do not pile up files.
            if (temp != null && !temp.delete()) {
                logger.warn("Could not delete temporary file " + temp);
            }
        }
        return out;
    }

    /**
     * Command-line entry point. With at least three arguments the hard-coded XML
     * file is read (line terminators dropped) and passed on together with
     * {@code args[1]} and {@code args[2]}; otherwise the PDF is generated from
     * empty strings.
     *
     * @param args command-line arguments; {@code args[0]} is not read
     */
    public static void main(String args[]) {
        try {
            // args[1] AND args[2] are read below, so three arguments are required;
            // the former "length > 1" check failed with exactly two arguments.
            if (args != null && args.length > 2) {
                StringBuilder xmlString = new StringBuilder();
                try (BufferedReader bdr = new BufferedReader(new FileReader(
                        new File("C:\\Users\\2004807\\Downloads\\XML\\Amendment.xml")))) {
                    String line;
                    while ((line = bdr.readLine()) != null) {
                        xmlString.append(line);
                    }
                }
                createPDFBase64(xmlString.toString(), args[1], args[2]);
            } else {
                createPDFBase64("", "", "");
            }
        } catch (IOException | URISyntaxException | com.lowagie.text.DocumentException | WriterException e) {
            e.printStackTrace();
        }
    }
}
I rechecked the path and the XML format, since this error is sometimes caused by malformed XML, but I am still getting the following error.
Here i amn777
1getting resourcesfile:/C:/Users/2004807/Desktop/B2B%20Java/HtmlToPdf/target/classes/new.PNG
Inside getXMLData
XML==>
[Fatal Error] :1:1: Premature end of file.
Error is :org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 1; Premature end of file.
xmlData size is :0
I am doing a coding project where I am trying to input a file into Java and output information about the file. I have found code online that does this for PDFs. The line "import org.xml.sax.SAXException;" keeps giving me an error stating that the package org.xml.sax is accessible from more than one module. Can someone help me with this?
Sorry to bother you all; I am a new coder just trying to figure this out.
Here is the code:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Prints the text content and the metadata of a PDF file using Apache Tika's
 * {@code PDFParser}.
 */
public class PDFTika
{
    /**
     * Parses the hard-coded PDF and writes its content and metadata to standard output.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened, read or closed
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException if the content handler reports an error; this checked
     *         exception is thrown by {@code PDFParser.parse} and therefore has to be declared
     */
    public static void main(final String[] args) throws
    IOException, TikaException, SAXException
    {
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // try-with-resources closes the stream even when parsing fails
        try (FileInputStream inputstream = new FileInputStream(new
        File("/Users/relli/OneDrive/Documents/Asparta/example.pdf")))
        {
            //parsing the document using PDF parser
            PDFParser pdfparser = new PDFParser();
            pdfparser.parse(inputstream, handler, metadata, pcontext);
        }
        //getting the content of the document
        System.out.println("Contents of the PDF :" +
        handler.toString());
        //getting metadata of the document
        System.out.println("Metadata of the PDF:");
        String[] metadataNames = metadata.names();
        for(String name : metadataNames)
        {
            System.out.println(name+ " : " + metadata.get(name));
        }
    }
}
Method 1 is a copy of the code provided by Gabriel Katz; I managed to fix the error simply by adding another exception (SAXException) to the throws clause of main.
Method 2: is a simplified version of parsing the PDF content only.
Code Snippet Info:
This code is used to parse PDF data using the Apache Tika package. It will display the pdf content as string and print metadata of PDF file
Method 1: parse PDF and print PDF content and metadata
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Parses a PDF with Apache Tika's {@code PDFParser} and prints its text
 * content and metadata.
 */
public class PDFTika {
    /**
     * Reads {@code example.pdf} from the working directory and prints its
     * content, then each metadata name with its value.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened, read or closed
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException if the content handler reports an error
     */
    public static void main(final String[] args) throws IOException, TikaException, SAXException {
        File file = new File("example.pdf");
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // try-with-resources: the stream is closed on success and on failure alike
        try (FileInputStream inputstream = new FileInputStream(file)) {
            //parsing the document using PDF parser
            PDFParser pdfparser = new PDFParser();
            pdfparser.parse(inputstream, handler, metadata, pcontext);
        }
        //getting the content of the document
        System.out.println("Contents of the PDF :" + handler.toString());
        //getting metadata of the document
        System.out.println("Metadata of the PDF:");
        for (String name : metadata.names()) {
            System.out.println(name + " : " + metadata.get(name));
        }
    }
}
Method 2: parse PDF data and print content as a string
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
/**
 * Minimal example: extracts the plain text of a document through the
 * {@code Tika} facade and prints it.
 */
public class TikaParser {
    /**
     * Opens {@code example.pdf} from the working directory, lets Tika convert it
     * to a string and writes that string to standard output.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     */
    public static void main(String[] args) throws IOException, TikaException {
        FileInputStream pdfStream = new FileInputStream(new File("example.pdf"));
        String extractedText = new Tika().parseToString(pdfStream);
        System.out.println(extractedText);
    }
}
<!-- Add the following Apache Tika dependencies (tika-parsers and tika-core, both version 1.24.1) to the Maven build -->
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.24.1</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.24.1</version>
</dependency>
</dependencies>
I have used TikaParser to extract plain text from '.doc' files
/**
 * Parses {@code file.doc} with Tika's auto-detecting parser, prints the HTML
 * rendering and all metadata, and writes the HTML to {@code file.doc.txt}.
 *
 * @param args not used
 * @throws Exception if the input cannot be read or parsed, or the output cannot be written
 */
public static void main(String[] args) throws Exception {
    ContentHandler handler = new ToHTMLContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // try-with-resources: the input is closed even when the parser throws
    try (FileInputStream content = new FileInputStream("file.doc")) {
        parser.parse(content, handler, metadata, context);
    }
    // Render once and reuse for both the console and the output file.
    String html = handler.toString();
    System.out.println(html);
    for (String name : metadata.names()) {
        System.out.println(name + " : " + metadata.get(name));
    }
    try (FileOutputStream outStream = new FileOutputStream("file.doc.txt")) {
        // Explicit charset so the file content does not depend on the platform default.
        outStream.write(html.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    }
}
This is working for most of the files but for a specific file, it is throwing the following exception
Exception in thread "main" org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@7c417213
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:282)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at com.goarya.app.resumestorage.migration.TikaParser.main(TikaParser.java:29)
Caused by: java.lang.IllegalArgumentException: The end (7161) must not be before the start (7162)
at org.apache.poi.hwpf.usermodel.Range.sanityCheckStartEnd(Range.java:208)
at org.apache.poi.hwpf.usermodel.Range.<init>(Range.java:194)
at org.apache.poi.hwpf.usermodel.Paragraph.<init>(Paragraph.java:165)
at org.apache.poi.hwpf.usermodel.Paragraph.newParagraph(Paragraph.java:144)
at org.apache.poi.hwpf.usermodel.Range.getParagraph(Range.java:766)
at org.apache.poi.hwpf.extractor.WordExtractor.getParagraphText(WordExtractor.java:168)
at org.apache.poi.hwpf.extractor.WordExtractor.getMainTextboxText(WordExtractor.java:145)
at org.apache.tika.parser.microsoft.WordExtractor.parse(WordExtractor.java:183)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:169)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:130)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
... 3 more
The doc file when opened in Microsoft Word shows no error.
Also, in C# using Microsoft.Office.Interop.Word gives plain text.
How do I overcome this issue using Apache Tika?
Edit: adding sample doc for this scenario
I am using the tika-core 1.2 jar, and my program runs successfully with the following code.
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.xml.sax.SAXException;
/**
 * Converts a Word document to HTML with Apache Tika, prints the HTML and the
 * document metadata, and saves the HTML to a file.
 */
public class Exmple2 {
    /**
     * Parses the hard-coded {@code .docx} file and writes the resulting HTML to
     * {@code /home/ist/file.doc.txt}.
     *
     * @param args not used
     * @throws IOException if the input cannot be read or the output cannot be written
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException if the content handler reports an error
     */
    public static void main(final String[] args) throws IOException, TikaException, SAXException {
        ToHTMLContentHandler handler = new ToHTMLContentHandler();
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // try-with-resources: the input is closed even when the parser throws
        try (FileInputStream content = new FileInputStream("/home/ist/FTRDocuments/taableDis.docx")) {
            parser.parse(content, handler, metadata, context);
        }
        // Render once and reuse for both the console and the output file.
        String html = handler.toString();
        System.out.println(html);
        for (String name : metadata.names()) {
            System.out.println(name + " : " + metadata.get(name));
        }
        try (FileOutputStream outStream = new FileOutputStream("/home/ist/file.doc.txt")) {
            // Explicit charset so the file content does not depend on the platform default.
            outStream.write(html.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
    }
}
The only change needed with Tika 1.2 is to declare the handler as ToHTMLContentHandler where you are using ContentHandler.
I know this question has been asked several times before, but none of the solutions seem to work for me.
I'm trying to create a newsfeed from rss and write it to a pdf.
The pdf is created, but empty and I get a
(Location of error unknown)com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException: Invalid byte 2 of 3-byte UTF-8 sequence.
error.
These are my classes:
Get feed:
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
public class GetFeed {
private String adress;
private URL url;
public GetFeed(String adress) {
super();
this.adress = adress;
try {
setUrl();
} catch (MalformedURLException e) {
System.out.println("This isn't a correct url");
e.printStackTrace();
}
}
public void setUrl() throws MalformedURLException {
url = new URL(adress);
}
public SyndFeed getFeed() throws IOException, IllegalArgumentException,
FeedException {
HttpURLConnection httpcon = (HttpURLConnection) url.openConnection();
SyndFeedInput input = new SyndFeedInput();
SyndFeed feed = input.build(new XmlReader(httpcon));
return feed;
}
Example of feedcreation:
/**
 * Fetches the hard-coded standaard.be feed, stores it in {@code feed} and
 * passes it to {@code WriteToXml}. Failures are printed, not rethrown.
 */
public void homeland() {
    GetFeed source = new GetFeed("http://www.standaard.be/rss/section/1f2838d4-99ea-49f0-9102-138784c7ea7c");
    try {
        feed = source.getFeed();
        // WriteToXml does its work in the constructor, so the instance itself is not needed.
        new WriteToXml(feed);
    } catch (FeedException | IOException | IllegalArgumentException failure) {
        failure.printStackTrace();
    }
}
Write the xmlfile:
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedOutput;
/**
 * Serializes a feed to {@code newsfeed.xml}.
 */
public class WriteToXml {
    /**
     * Appends the XML form of the given feed to {@code newsfeed.xml}.
     *
     * <p>The file is written in the feed's own encoding, or UTF-8 when the feed
     * declares none. A {@code FileWriter} would use the platform default charset
     * instead, so the bytes could disagree with the encoding named in the XML
     * declaration and make later parsing fail with "Invalid byte ... of UTF-8 sequence".
     *
     * @param feed the feed to serialize
     * @throws IOException if the file cannot be opened or written
     * @throws FeedException if the feed cannot be serialized
     */
    public WriteToXml(SyndFeed feed) throws IOException, FeedException {
        String declaredEncoding = feed.getEncoding();
        java.nio.charset.Charset charset = (declaredEncoding != null)
                ? java.nio.charset.Charset.forName(declaredEncoding)
                : java.nio.charset.StandardCharsets.UTF_8;
        // NOTE(review): append mode is kept from the original; writing more than one feed
        // puts several XML documents into one file, which is not well-formed XML — confirm intent.
        try (Writer writer = new java.io.OutputStreamWriter(
                new java.io.FileOutputStream("newsfeed.xml", true), charset)) {
            SyndFeedOutput output = new SyndFeedOutput();
            output.output(feed, writer);
        }
    }
}
Create pdf
/**
 * Renders {@code newsfeed.xml} to {@code newsfeed.pdf} through an XSLT
 * stylesheet and Apache FOP.
 */
public class CreatePdf {
    /**
     * Transforms {@code newsfeed.xml} with {@code template.xsl} and feeds the
     * result into FOP, which writes {@code newsfeed.pdf}.
     *
     * @throws IOException if the output file cannot be opened, written or closed
     * @throws FOPException if FOP cannot be set up or fails while rendering
     * @throws TransformerException if the stylesheet cannot be compiled or the transformation fails
     */
    public void convertToPDF() throws IOException, FOPException, TransformerException {
        File xsltFile = new File("template.xsl");
        StreamSource xmlSource = new StreamSource(new File("newsfeed.xml"));
        FopFactory fopFactory = FopFactory.newInstance(new File(".").toURI());
        FOUserAgent foUserAgent = fopFactory.newFOUserAgent();
        // try-with-resources keeps a failure in close() from hiding the real error;
        // the buffer avoids many small writes straight to the file.
        try (OutputStream out = new java.io.BufferedOutputStream(
                new java.io.FileOutputStream("newsfeed.pdf"))) {
            Fop fop = fopFactory.newFop(MimeConstants.MIME_PDF, foUserAgent, out);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer transformer = factory.newTransformer(new StreamSource(xsltFile));
            // FOP consumes the transformation result as SAX events.
            Result res = new SAXResult(fop.getDefaultHandler());
            transformer.transform(xmlSource, res);
        }
    }
}
The mainapp:
import java.io.IOException;
import javax.xml.transform.TransformerException;
import org.apache.fop.apps.FOPException;
/**
 * Application entry point: creates the feeds object, then converts the result to PDF.
 */
public class NewsfeedApp {
    /**
     * Instantiates {@code CreateFeeds} and then runs the PDF conversion;
     * conversion failures are printed, not rethrown.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        // Only the construction matters here; the instance is never used afterwards.
        // NOTE(review): presumably the constructor produces newsfeed.xml — verify.
        new CreateFeeds();
        CreatePdf converter = new CreatePdf();
        try {
            converter.convertToPDF();
        } catch (TransformerException | IOException | FOPException problem) {
            problem.printStackTrace();
        }
    }
}
Any help would be greatly appreciated.
Sorry for the crappy English, I'm not a native speaker.
I created a simple class that uses the Tika library to extract metadata from files such as PDF, HTML, XLS, DOC, ...
Files can have custom metadata. As a first step, I need to detect such metadata and ignore it.
But I can't see how to do that with Tika!
This is my simple code to extract all metadata:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.PrintWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
public class TikaParse {
public static String resPFldMeta = new String();
public static String resPFldMetaValue = new String();
#SuppressWarnings("deprecation")
public static String ParseFieldMetadata(String filename) throws Exception {
int j;
FileInputStream is = null;
File f = new File(filename);
is = new FileInputStream(f);
ContentHandler contenthandler = new BodyContentHandler(-1);
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
AutoDetectParser parser = new AutoDetectParser();
parser.parse(is, contenthandler, metadata,new ParseContext());
String[] metadataNames = metadata.names();
// get field name of all metadata
for(j=0;j<metadataNames.length-1; j++){
resPFldMeta += "\""+(metadataNames[j]).trim()+"\",";
}
resPFldMeta += "\""+(metadataNames[j]).trim()+"\"";
return resPFldMeta;
}
//.....
}
So, my question is: how can I check whether a detected metadata field is custom metadata or normalized (standard) metadata?