I'm merging two PDF files (test1.pdf & test2.pdf) using iText and writing the output to test_result.pdf. But the output pages don't come out like the input pages: each page is cropped to half of its actual size. How can I overcome this? Here is my code:
public class MergePDF {
public static void main(String[] args) {
try {
List<InputStream> pdfs = new ArrayList<InputStream>();
pdfs.add(new FileInputStream("test1.pdf"));
pdfs.add(new FileInputStream("test2.pdf/"));
// pdfs.add(new FileInputStream("test_result.pdf/"));
OutputStream output = new FileOutputStream("/home/ant000112/merge_result.pdf");
MergePDF.concatPDFs(pdfs, output, true);
} catch (Exception e) {
e.printStackTrace();
}
}
public static void concatPDFs(List<InputStream> streamOfPDFFiles,
OutputStream outputStream, boolean paginate) {
Document document = new Document();
try {
List<InputStream> pdfs = streamOfPDFFiles;
List<PdfReader> readers = new ArrayList<PdfReader>();
int totalPages = 0;
Iterator<InputStream> iteratorPDFs = pdfs.iterator();
// Create Readers for the pdfs.
while (iteratorPDFs.hasNext()) {
InputStream pdf = iteratorPDFs.next();
PdfReader pdfReader = new PdfReader(pdf);
readers.add(pdfReader);
totalPages += pdfReader.getNumberOfPages();
}
// Create a writer for the outputstream
PdfWriter writer = PdfWriter.getInstance(document, outputStream);
document.open();
//BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA,BaseFont.CP1252, BaseFont.NOT_EMBEDDED);
BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA,BaseFont.CP1257, BaseFont.NOT_EMBEDDED);
PdfContentByte cb = writer.getDirectContent(); // Holds the PDF data
PdfImportedPage page;
int currentPageNumber = 0;
int pageOfCurrentReaderPDF = 0;
Iterator<PdfReader> iteratorPDFReader = readers.iterator();
// Loop through the PDF files and add to the output.
while (iteratorPDFReader.hasNext()) {
PdfReader pdfReader = iteratorPDFReader.next();
// Create a new page in the target for each source page.
while (pageOfCurrentReaderPDF < pdfReader.getNumberOfPages()) {
document.newPage();
pageOfCurrentReaderPDF++;
currentPageNumber++;
page = writer.getImportedPage(pdfReader,
pageOfCurrentReaderPDF);
cb.addTemplate(page, 0, 0);
// Code for pagination.
if (paginate) {
cb.beginText();
cb.setFontAndSize(bf, 9);
cb.showTextAligned(PdfContentByte.ALIGN_CENTER, ""+ currentPageNumber + " of " + totalPages, 520,5, 0);
cb.endText();
}
}
pageOfCurrentReaderPDF = 0;
}
outputStream.flush();
document.close();
outputStream.close();
} catch (Exception e) {
e.printStackTrace();
} finally {
if (document.isOpen())
document.close();
System.out.println("ghghklh");
try {
if (outputStream != null)
outputStream.close();
} catch (IOException ioe) {
ioe.printStackTrace();
}
}
}
}
Did you try reading the examples on the iText site? It appears you're doing this in a different way; for instance, you're not using PdfCopy.
/*
* This class is part of the book "iText in Action - 2nd Edition"
* written by Bruno Lowagie (ISBN: 9781935182610)
* For more info, go to: http://itextpdf.com/examples/
* This example only works with the AGPL version of iText.
*/
package part2.chapter06;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.SQLException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfReader;
import part1.chapter02.MovieHistory;
import part1.chapter02.MovieLinks1;
public class Concatenate {
/** The resulting PDF file. */
public static final String RESULT
= "results/part2/chapter06/concatenated.pdf";
/**
* Main method.
* @param args no arguments needed
* @throws DocumentException
* @throws IOException
* @throws SQLException
*/
public static void main(String[] args)
throws IOException, DocumentException, SQLException {
// using previous examples to create PDFs
MovieLinks1.main(args);
MovieHistory.main(args);
String[] files = { MovieLinks1.RESULT, MovieHistory.RESULT };
// step 1
Document document = new Document();
// step 2
PdfCopy copy = new PdfCopy(document, new FileOutputStream(RESULT));
// step 3
document.open();
// step 4
PdfReader reader;
int n;
// loop over the documents you want to concatenate
for (int i = 0; i < files.length; i++) {
reader = new PdfReader(files[i]);
// loop over the pages in that document
n = reader.getNumberOfPages();
for (int page = 0; page < n; ) {
copy.addPage(copy.getImportedPage(reader, ++page));
}
copy.freeReader(reader);
}
// step 5
document.close();
}
}
http://itextpdf.com/examples/iia.php?id=123
EDIT: Just to be fair, I downloaded the library and tried the example myself. It works like a charm.
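For reference, here is a minimal sketch of the same PdfCopy approach applied directly to the two files from the question (assuming iText 5.x and that both input files are in the working directory). PdfCopy keeps every imported page at its original size, which is why it avoids the cropping you get when stamping imported pages onto a default-sized Document with addTemplate:
import java.io.FileOutputStream;
import com.itextpdf.text.Document;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfReader;
public class SimpleConcat {
    public static void main(String[] args) throws Exception {
        Document document = new Document();
        PdfCopy copy = new PdfCopy(document, new FileOutputStream("test_result.pdf"));
        document.open();
        for (String file : new String[] { "test1.pdf", "test2.pdf" }) {
            PdfReader reader = new PdfReader(file);
            // addPage copies each page at its original size, so nothing is cropped.
            for (int page = 1; page <= reader.getNumberOfPages(); page++) {
                copy.addPage(copy.getImportedPage(reader, page));
            }
            copy.freeReader(reader);
            reader.close();
        }
        document.close();
    }
}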
Related
I'm a rookie, really. I'm building my first project (if I can finish it).
I want to extract PDF text along with its formatting and position, and then write it to a .docx file. I checked the PDFBox API documentation, but I'm not sure whether, to get the position of the text, I should traverse the lines or the characters. I studied these three posts carefully:
Text coordinates when stripping from PDFBox
Get font of each line using PDFBox
How to extract font styles of text contents using pdfbox?
And here is my DEMO:
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.util.List;
public class PDFTextExtractor extends PDFTextStripper {
/**
* Instantiate a new PDFTextStripper object.
*
* @throws IOException If there is an error loading the properties.
*/
public PDFTextExtractor() throws IOException {
}
String prevFont = "";
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
StringBuilder sb = new StringBuilder();
for (TextPosition position : textPositions){
String font = position.getFont().getName();
float x = position.getX();
float y = position.getY();
float fontSize = position.getFontSizeInPt();
if (font != null && !font.equals(prevFont)){
sb.append("[").append(font.split("-")[0]).append("+").append(font.split("-")[1]).append("+").append(fontSize).append("]");
prevFont = font;
}
sb.append(position.getUnicode());
}
writeString(sb.toString());
}
@Override
public String getText(PDDocument doc) throws IOException {
return super.getText(doc);
}
}
And I call it like this:
FileOutputStream outputStream = new FileOutputStream(EXPORT_PATH + file.getName().split("\\.")[0] + ".docx");
try (PDDocument originalPDF = PDDocument.load(file);
XWPFDocument doc = new XWPFDocument()) {
//get All pages
PDPageTree pageList = originalPDF.getDocumentCatalog().getPages();
for (PDPage page : pageList){
//Parse Content
PDFTextStripper stripper = new PDFTextExtractor();
stripper.setSortByPosition(true);
String ss = stripper.getText(originalPDF);
System.out.println(ss);
//Write Content
XWPFParagraph paragraph = doc.createParagraph();
XWPFRun run = paragraph.createRun();
run.setText(ss);
run.addBreak(BreakType.PAGE);
}
doc.write(outputStream);
originalPDF.close();
outputStream.close();
}
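For what it's worth, here is a minimal sketch (not part of the original code) of how the same PDFTextExtractor can be limited to one page at a time with setStartPage/setEndPage, so each Word page only receives that PDF page's text; as written above, getText(originalPDF) extracts the whole document on every loop iteration. The output file name is hypothetical:
try (PDDocument pdf = PDDocument.load(file);
     XWPFDocument doc = new XWPFDocument();
     FileOutputStream out = new FileOutputStream("out.docx")) {
    PDFTextStripper stripper = new PDFTextExtractor();
    stripper.setSortByPosition(true);
    for (int page = 1; page <= pdf.getNumberOfPages(); page++) {
        // Restrict extraction to a single page.
        stripper.setStartPage(page);
        stripper.setEndPage(page);
        String pageText = stripper.getText(pdf);
        XWPFParagraph paragraph = doc.createParagraph();
        // setText does not interpret '\n', so use one run per line with an explicit break.
        for (String line : pageText.split("\\r?\\n")) {
            XWPFRun run = paragraph.createRun();
            run.setText(line);
            run.addBreak();
        }
        paragraph.createRun().addBreak(BreakType.PAGE);
    }
    doc.write(out);
}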
I’m currently using PDFBox to read the text of a set of pdfs that I’ve inherited.
I’m only interested in reading the text, not making any changes to the file.
The code that works for most of the files is:
File pdfFile = myPath.toFile();
PDDocument document = PDDocument.load(pdfFile );
Writer sw = new StringWriter();
PDFTextStripper stripper = new PDFTextStripper();
stripper.setStartPage( 1 );
stripper.writeText( document, sw );
String documentText = sw.toString();
For most files, I wind up with the text in the documentText field.
But, for 3 of the 24 files, the documentText content is just line breaks: “\r\n” for the first file, “\r\n\r\n” for the second, and “\r\n\r\n\r\n” for the third. The three files are not consecutive; multiple good files sit between each of them.
The File is derived from a java.nio.Path. The WindowsFileAttribute that is part of the Path has a size of 279K, so the file is not empty on disk.
I can open the file and view the data, and it looks like the other files that my code reads.
I’m using Java 8.0.121 and PDFBox 2.0.4 (the latest version, I believe).
Any suggestions? Is there a better way to read the text? (I’m not interested in the formatting, or fonts used, just the text.)
Thanks.
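One thing worth checking is whether any individual page of the three problem files yields text at all; a scanned, image-only PDF has no text layer and comes back as little more than line separators. A minimal per-page sketch (reusing pdfFile from above, PDFBox 2.0.x):
try (PDDocument document = PDDocument.load(pdfFile)) {
    PDFTextStripper stripper = new PDFTextStripper();
    for (int page = 1; page <= document.getNumberOfPages(); page++) {
        // Extract one page at a time to see which pages contain extractable text.
        stripper.setStartPage(page);
        stripper.setEndPage(page);
        String pageText = stripper.getText(document);
        System.out.println("page " + page + ": " + pageText.trim().length() + " characters");
    }
}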
Reading multiple PDF docs using pdfbox in java
package readwordfile;
import java.io.BufferedReader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
/**
* This is an example of how to extract words from a PDF document.
*
* @author saravanan
*/
public class GetWordsFromPDF extends PDFTextStripper {
static List<String> words = new ArrayList<String>();
public GetWordsFromPDF() throws IOException {
}
/**
* @param args
* @throws IOException If there is an error parsing the document.
*/
public static void main(String[] args) throws IOException {
String files;
// FileWriter fs = new FileWriter("C:\\Users\\saravanan\\Desktop\\New Text Document (2).txt");
// FileInputStream fstream1 = new FileInputStream("C:\\Users\\saravanan\\Desktop\\New Text Document (2).txt");
// DataInputStream in1 = new DataInputStream(fstream1);
// BufferedReader br1 = new BufferedReader(new InputStreamReader(in1));
String path = "C:\\Users\\saravanan\\Desktop\\New folder\\"; //local folder path name
File folder = new File(path);
File[] listOfFiles = folder.listFiles();
for (int i = 0; i < listOfFiles.length; i++) {
if (listOfFiles[i].isFile()) {
files = listOfFiles[i].getName();
if (files.endsWith(".pdf") || files.endsWith(".PDF")) {
String nfiles = "C:\\Users\\saravanan\\Desktop\\New folder\\";
String fileName1 = nfiles + files;
System.out.print("\n\n" + files+"\n");
PDDocument document = null;
try {
document = PDDocument.load(new File(fileName1));
PDFTextStripper stripper = new GetWordsFromPDF();
stripper.setSortByPosition(true);
stripper.setStartPage(0);
stripper.setEndPage(document.getNumberOfPages());
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
int x = 0;
System.out.println("");
for (String word : words) {
if (word.startsWith("xxxxxx")) { //here you can give your pdf doc starting word
x = 1;
}
if (x == 1) {
if (!(word.endsWith("YYYYYY"))) { //here you can give your pdf doc ending word
System.out.print(word + " ");
// fs.write(word);
} else {
x = 0;
break;
}
}
}
} finally {
if (document != null) {
document.close();
words.clear();
}
}
}
}
}
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*
* @param str
* @param textPositions
* @throws java.io.IOException
*/
@Override
protected void writeString(String str, List<TextPosition> textPositions) throws IOException {
String[] wordsInStream = str.split(getWordSeparator());
if (wordsInStream != null) {
for (String word : wordsInStream) {
words.add(word); //store the pdf content into the List
}
}
}
}
I am trying to merge two documents, let's say:
Document 1: Merger1.doc
Document 2: Merger2.doc
I would like to store the result in a new file, doc2.docx.
I have used this piece of code to do it, but it throws an error.
CODE:
import java.io.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
public class MergerFiles {
public static void main (String[] args) throws Exception {
// POI apparently can't create a document from scratch,
// so we need an existing empty dummy document
HWPFDocument doc = new HWPFDocument(new FileInputStream("C:\\Users\\pallavi123\\Desktop\\Merger1.docx"));
Range range = doc.getRange();
//I can get the entire Document and insert it in the tmp.doc
//However any formatting in my word document is lost.
HWPFDocument doc2 = new HWPFDocument(new FileInputStream("C:\\Users\\pallavi123\\Desktop\\Merger2.docx"));
Range range2 = doc2.getRange();
range.insertAfter(range2.text());
//I can get the information (text only) for each character run/paragraph or section.
//Again any formatting in my word document is lost.
HWPFDocument doc3 = new HWPFDocument(new FileInputStream("D:\\doc2.docx"));
Range range3 = doc3.getRange();
for(int i=0;i<range3.numCharacterRuns();i++){
CharacterRun run3 = range3.getCharacterRun(i);
range.insertAfter(run3.text());
}
OutputStream out = new FileOutputStream("D:\\result.doc");
doc.write(out);
out.flush();
out.close();
}
}
ERROR CODE:
Exception in thread "main" org.apache.poi.poifs.filesystem.OfficeXmlFileException: The supplied data appears to be in the Office 2007+ XML. You are calling the part of POI that deals with OLE2 Office Documents. You need to call a different part of POI to process this data (eg XSSF instead of HSSF)
at org.apache.poi.poifs.storage.HeaderBlockReader.<init>(HeaderBlockReader.java:108)
at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:151)
at org.apache.poi.hwpf.HWPFDocument.verifyAndBuildPOIFS(HWPFDocument.java:120)
at org.apache.poi.hwpf.HWPFDocument.<init>(HWPFDocument.java:133)
at MergerFiles.main(MergerFiles.java:11)
Am I missing a jar file, or is the way I'm using the code wrong? I need your valuable suggestions.
Thanks in advance.
I've developed the following class:
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
public class WordMerge {
private final OutputStream result;
private final List<InputStream> inputs;
private XWPFDocument first;
public WordMerge(OutputStream result) {
this.result = result;
inputs = new ArrayList<>();
}
public void add(InputStream stream) throws Exception{
inputs.add(stream);
OPCPackage srcPackage = OPCPackage.open(stream);
XWPFDocument src1Document = new XWPFDocument(srcPackage);
if(inputs.size() == 1){
first = src1Document;
} else {
CTBody srcBody = src1Document.getDocument().getBody();
first.getDocument().addNewBody().set(srcBody);
}
}
public void doMerge() throws Exception{
first.write(result);
}
public void close() throws Exception{
result.flush();
result.close();
for (InputStream input : inputs) {
input.close();
}
}
}
And its use:
public static void main(String[] args) throws Exception {
FileOutputStream faos = new FileOutputStream("/home/victor/result.docx");
WordMerge wm = new WordMerge(faos);
wm.add( new FileInputStream("/home/victor/001.docx") );
wm.add( new FileInputStream("/home/victor/002.docx") );
wm.doMerge();
wm.close();
}
I have a suggestion!
First, the main method. The parameters are: test1 = the first docx file name, test2 = the second docx file name, dest = the destination file name; document is a global (class-level) field.
public void mergeDocx(String test1, String test2, String dest){
try {
XWPFDocument doc1 = new XWPFDocument(new FileInputStream(new File(test1)));
XWPFDocument doc2 = new XWPFDocument(new FileInputStream(new File(test2)));
document = new XWPFDocument();
passaElementi(doc1);
passaElementi(doc2);
passaStili(doc1,doc2);
OutputStream out = new FileOutputStream(new File(dest));
document.write(out);
out.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
The private method passaElementi copies the body elements from doc1 into the document object. I don't know what an XWPFSDT object is... (Note: I don't copy the whole document, only the body; headers, sections, and footers would be handled in a similar way. The integer variables i and j are global and start at 0.)
private void passaElementi(XWPFDocument doc1){
for(IBodyElement e : doc1.getBodyElements()){
if(e instanceof XWPFParagraph){
XWPFParagraph p = (XWPFParagraph) e;
if(p.getCTP().getPPr()!=null && p.getCTP().getPPr().getSectPr()!=null){
continue;
}else{
document.createParagraph();
document.setParagraph(p, i);
i++;
}
}else if(e instanceof XWPFTable){
XWPFTable t = (XWPFTable)e;
document.createTable();
document.setTable(j, t);
j++;
}else if(e instanceof XWPFSDT){
// boh!
}
}
}
The private method passaStili copies the styles from doc1 and doc2 into the document object:
private void passaStili(XWPFDocument doc1, XWPFDocument doc2){
try {
CTStyles c1 = doc1.getStyle();
CTStyles c2 = doc2.getStyle();
int size1 = c1.getStyleList().size();
int size2 = c2.getStyleList().size();
for(int i = 0; i<size2; i++ ){
c1.addNewStyle();
c1.setStyleArray(size1+i, c2.getStyleList().get(i));
}
document.createStyles().setStyles(c1);
} catch (XmlException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
To keep this quick, I don't handle the exceptions properly!
Leave a like if you liked it!
Best regards!
B.M.
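For completeness, here is a sketch of the class frame these snippets assume, since the answer only shows the methods. The class name is hypothetical; document, i and j are the global fields mentioned above, and the imports are the ones the three methods need:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
public class DocxMerger {            // hypothetical class name
    private XWPFDocument document;   // the merged result ("document is a global variable")
    private int i = 0;               // paragraph index used by passaElementi
    private int j = 0;               // table index used by passaElementi

    // mergeDocx(...), passaElementi(...) and passaStili(...) from above go here unchanged.

    public static void main(String[] args) {
        new DocxMerger().mergeDocx("Merger1.docx", "Merger2.docx", "merged.docx");
    }
}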
You should use XWPFDocument instead of HWPFDocument.
The documentation states:
The partner to HWPF for the new Word 2007 .docx format is XWPF. Whilst HWPF and XWPF provide similar features, there is not a common interface across the two of them at this time.
Change your code to:
XWPFDocument doc = new XWPFDocument(new FileInputStream("..."));
XWPFDocument doc2 = new XWPFDocument(new FileInputStream("..."));
XWPFDocument doc3 = new XWPFDocument(new FileInputStream("..."));
When you use HWPFDocument, you should use a .doc file (not .docx).
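A minimal, text-only sketch of the same merge done with XWPF (like the HWPF attempt above, this keeps only the text, not the formatting; the paths mirror the ones in the question):
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
public class MergeDocxText {
    public static void main(String[] args) throws Exception {
        try (XWPFDocument doc = new XWPFDocument(new FileInputStream("C:\\Users\\pallavi123\\Desktop\\Merger1.docx"));
             XWPFDocument doc2 = new XWPFDocument(new FileInputStream("C:\\Users\\pallavi123\\Desktop\\Merger2.docx"));
             FileOutputStream out = new FileOutputStream("D:\\doc2.docx")) {
            // Append the text of every paragraph of the second document to the first.
            for (XWPFParagraph source : doc2.getParagraphs()) {
                XWPFParagraph paragraph = doc.createParagraph();
                XWPFRun run = paragraph.createRun();
                run.setText(source.getText());
            }
            doc.write(out);
        }
    }
}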
I am trying to use the iText framework to convert a PDF file into a CSV for import into Excel.
The output is garbled, and I presume I am missing a step with regard to format conversion; however, I can't find the information on the iText site and am looking for assistance.
My current code is below.
package com.pdf.convert;
import java.io.FileOutputStream;
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfWriter;
public class ThirdPDF {
private static String INPUTFILE = "/location/test.pdf";
private static String OUTPUTFILE = "/location/test.csv";
public static void main(String[] args) throws DocumentException,
IOException {
Document document = new Document();
PdfWriter writer = PdfWriter.getInstance(document,
new FileOutputStream(OUTPUTFILE));
document.open();
PdfReader reader = new PdfReader(INPUTFILE);
int n = reader.getNumberOfPages();
PdfImportedPage page;
// Go through all pages
for (int i = 1; i <= n; i++) {
// Only page number 2 will be included
if (i == 2) {
page = writer.getImportedPage(reader, i);
Image instance = Image.getInstance(page);
document.add(instance);
}
}
document.close();
}
}
Converting a PDF file to a CSV file.
The directory and file creation here are based on the Android framework.
Change the path and directory handling to suit your framework accordingly.
private void convertPDFToCSV(String pdfFilePath) {
String myfolder = Environment.getExternalStorageDirectory() + "/Mycsv";
if (createFolder(myfolder)) {
try {
Document document = new Document();
document.open();
FileOutputStream fos=new FileOutputStream(myfolder + "/MyCSVFile.csv");
StringBuilder parsedText=new StringBuilder();
PdfReader reader1 = new PdfReader(pdfFilePath);
int n = reader1.getNumberOfPages();
for (int i = 0; i <n ; i++) {
parsedText.append(PdfTextExtractor.getTextFromPage(reader1, i + 1).trim()).append("\n");
// Extracting the content from the different pages
}
StringReader stReader = new StringReader(parsedText.toString());
int t;
while ((t = stReader.read()) != -1)
fos.write(t);
document.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
private boolean createFolder(String myfolder) {
File f = new File(myfolder);
if (!f.exists()) {
if (!f.mkdir()) {
return false;
} else {
return true;
}
}else{
return true;
}
}
I'm trying to solve a problem using Java, iText, and the Java Advanced Imaging library. My software system uses Ghostscript to create JPG thumbnail images etc. from PDF files. However, on CentOS 5.x the highest available version of Ghostscript is 8.7, which has a known issue of not being able to handle PDF files containing JPEG 2000 images. My plan is to scan the file first and see if it contains JPEG 2000 images (I've already got this part figured out); if so, use iText and the Java Advanced Imaging library (which contains the JPEG 2000 read & write codecs) to convert the contained JPEG 2000 images into regular JPEG images, and then pass the new PDF file to Ghostscript. The code below attempts this, but it results in another file that still contains JPEG 2000 images. Any help with this would be much appreciated.
public class ImageReplacer{
public static void main(String [] args){
try{
String RESULT = "";
PdfReader reader = new PdfReader("pdf_containing_jpeg2000_images.pdf");
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
MyImageRenderListener listener = new MyImageRenderListener(RESULT);
MyImageConverterListener clistener = new MyImageConverterListener(RESULT);
clistener.setReader(reader);
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
parser.processContent(i, clistener);
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream("out.pdf"));
stamper.close();
}catch(Exception e){
e.printStackTrace();
}
}
}
class MyImageConverterListener implements RenderListener {
protected String path = "";
protected PdfReader reader;
public MyImageConverterListener(String path) {
this.path = path;
}
public void beginTextBlock() { }
public void endTextBlock() { }
public void renderImage(ImageRenderInfo renderInfo) {
try {
PdfImageObject image = renderInfo.getImage();
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
if(image.getDictionary().isStream()){
BufferedImage bi = image.getBufferedImage();
if (bi == null) return;
int width = (int)bi.getWidth();
int height = (int)bi.getHeight();
ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
ImageIO.write(bi, "JPG", imgBytes);
PRStream stream = new PRStream(reader,imgBytes.toByteArray());
stream.clear();
stream.setData(imgBytes.toByteArray(), false, PRStream.NO_COMPRESSION);
stream.put(PdfName.TYPE, PdfName.XOBJECT);
stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
stream.put(new PdfName("foo"+Math.random()), new PdfName("bar"+Math.random()));
stream.put(PdfName.FILTER, PdfName.DCTDECODE);
stream.put(PdfName.WIDTH, new PdfNumber(width));
stream.put(PdfName.HEIGHT, new PdfNumber(height));
stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
public void renderText(TextRenderInfo renderInfo) { }
public void setReader(PdfReader r){
reader = r;
}
}
So I managed to solve this one on my own (with a little help from iText in Action by Bruno Lowagie - great book). Just to reiterate, my intention is to scan a PDF using iText to see if it contains any JPEG 2000 images and, if it does, output the same PDF with the embedded JPEG 2000 images replaced by regular JPEG images. This solves the fatal Ghostscript 8.7 'Unable to process JPXDecode data' error, but it would also prove useful for making PDFs iOS compatible.
So, without further ado, kids, here goes...
Step 1) Download the iText 5.x .jar file and jai_imageio-1.1.jar (the Java Advanced Imaging library, which provides the JPEG 2000 codecs).
Step 2) Create a file called PDFConverter.java and put this code in it:
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.PdfNumber;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import com.itextpdf.text.pdf.PdfStamper;
import java.io.*;
public class PDFConverter{
public static void main(String [] args){
if(args.length==1){
if(hasJpeg2000(args[0])){
System.out.println("Contains JPEG2000 images: Converting them to JPEG...");
convertPDF(args[0]);
System.out.println("Done...");
}else{
System.out.println("Doesn't contain any JPEG2000 images: Nothing to be done...");
}
}else{
System.out.println("Please specify a PDF filename as a command line argument!");
}
}
public static boolean hasJpeg2000(String s){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfImageObject image = new PdfImageObject(stream);
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
return true;
}
}
}catch(Exception e){
e.printStackTrace();
}
return false;
}
public static void convertPDF(String s){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfImageObject image = new PdfImageObject(stream);
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
BufferedImage bi = image.getBufferedImage();
if (bi == null) continue;
int width = (int)(bi.getWidth());
int height = (int)(bi.getHeight());
ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
ImageIO.write(bi, "JPG", imgBytes);
stream.clear();
stream.setData(imgBytes.toByteArray(),false, PRStream.NO_COMPRESSION);
stream.put(PdfName.TYPE, PdfName.XOBJECT);
stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
stream.put(new PdfName("foo"+Math.random()), new PdfName("bar"+Math.random()));
stream.put(PdfName.FILTER, PdfName.DCTDECODE);
stream.put(PdfName.WIDTH, new PdfNumber(width));
stream.put(PdfName.HEIGHT, new PdfNumber(height));
stream.put(PdfName.BITSPERCOMPONENT,new PdfNumber(8));
stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
}
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream("out.pdf")); stamper.close();
}catch(Exception e){
e.printStackTrace();
}
}
}
Step 3) Compile the above file in the following manner:
javac -cp .:iText-5.0.4.jar:jai_imageio-1.1.jar PDFConverter.java
Step 4) Run the program with a PDF...
java -cp .:iText-5.0.4.jar:jai_imageio-1.1.jar PDFConverter PDFFileName.pdf
Booyah...
Works great, but I had some issues with GlassFish v3.1. GlassFish acted as if there was no jai_imageio-1.1.jar on the classpath. I fixed this by putting jai_imageio.jar in my "/path/to/glassfish/domains/domain1/lib/ext/" folder.
I had some NullPointerException problems with the PDFConverter from Reece, because my PDF had different types of embedded elements (see "GhostScript on CentOS 5.3 - Unable to process JPXDecode data"). So I added some object/type checks and added the output filename to the command line.
Everything else is great and worked perfectly for the JPEG 2000 image problem. Thanks to Reece :)
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.PdfNumber;
import com.itextpdf.text.pdf.PdfStamper;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
public class PDFConverter{
public static void main(String [] args){
if(args.length==2){
if(hasJpeg2000(args[0])){
System.out.println("Contains JPEG2000 images: Converting them to JPEG...");
convertPDF(args[0], args[1]);
System.out.println("Done...");
}else{
System.out.println("Doesn't contain any JPEG2000 images: Nothing to be done...");
}
}else{
System.out.println("Please specify a PDF filename and a output filename as a command line arguments!");
}
}
public static boolean hasJpeg2000(String s){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
System.out.println(pdfsubtype);
if (pdfsubtype != null && pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
PdfImageObject image = new PdfImageObject(stream);
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
return true;
}
}
}
}catch(Exception e){
e.printStackTrace();
}
return false;
}
private static void filterObject(PdfImageObject image,PdfName filter,PRStream stream) throws java.io.IOException {
if (PdfName.JPXDECODE.equals(filter)) {
BufferedImage bi = image.getBufferedImage();
if (bi == null) return;
int width = (int)(bi.getWidth());
int height = (int)(bi.getHeight());
ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
ImageIO.write(bi, "JPG", imgBytes);
stream.clear();
stream.setData(imgBytes.toByteArray(),false, PRStream.NO_COMPRESSION);
stream.put(PdfName.TYPE, PdfName.XOBJECT);
stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
stream.put(new PdfName("foo"+Math.random()), new PdfName("bar"+Math.random()));
stream.put(PdfName.FILTER, PdfName.DCTDECODE);
stream.put(PdfName.WIDTH, new PdfNumber(width));
stream.put(PdfName.HEIGHT, new PdfNumber(height));
stream.put(PdfName.BITSPERCOMPONENT,new PdfNumber(8));
stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
}
}
public static void convertPDF(String s, String out){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
if (pdfsubtype != null && pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
PdfImageObject image = new PdfImageObject(stream);
Object listOrName = image.get(PdfName.FILTER);
if (listOrName instanceof PdfName) {
PdfName filter = (PdfName)image.get(PdfName.FILTER);
filterObject(image, filter, stream);
}
else if (listOrName instanceof PdfArray) {
PdfArray list = (PdfArray)image.get(PdfName.FILTER);
for (int j = 0; j < list.size(); j++) {
PdfName filter = list.getAsName(j);
filterObject(image, filter, stream);
}
}
else {
System.err.println("Unknown Obejcttype: " + listOrName);
}
}
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(out)); stamper.close();
} catch(Exception e){
e.printStackTrace();
}
}
}