iText PDF; howto convert jpeg2000 to jpg using Java - java

I'm trying to solve a problem using java, iText, and the Java advanced imaging library. My software system uses ghostscript to create jpg thumbnail images etc... from PDF files. However on CentOS 5.x the highest version of ghostscript is 8.7 which has a known issue of not being able to handle PDF files containing JPEG 2000 images in them. My plan is to scan the file first and see if it contains jpeg2000 images (I've already got this part figured out); if so, then use iText and the Java Advanced Imaging library (contains the jpeg2000 read & write codecs) to convert the contained jpeg2000 files into regular jpeg files & then pass the new PDF file to ghostscript. The code below attempts this, but results in another file containing jpeg2000 files. Any help with this would be much appreciated.
/**
 * Command-line driver: feeds every page of a fixed input PDF through a
 * {@link MyImageConverterListener} and then writes the reader's document to {@code out.pdf}.
 */
public class ImageReplacer {

    /**
     * Parses each page so the listener sees every rendered image, then saves the result.
     *
     * @param args ignored; input and output file names are fixed
     */
    public static void main(String[] args) {
        PdfReader reader = null;
        try {
            String RESULT = "";
            reader = new PdfReader("pdf_containing_jpeg2000_images.pdf");
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            MyImageConverterListener clistener = new MyImageConverterListener(RESULT);
            clistener.setReader(reader);
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                parser.processContent(i, clistener);
            }
            // The stream is owned by try-with-resources so it is released even when the
            // stamper cannot be created or closed.
            try (FileOutputStream out = new FileOutputStream("out.pdf")) {
                PdfStamper stamper = new PdfStamper(reader, out);
                stamper.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
}
/**
 * Render listener that re-encodes JPEG 2000 (JPXDecode) image XObjects as baseline JPEG
 * (DCTDecode) while a page's content is being parsed.
 *
 * <p>The replacement is applied to the stream object the image was read from, so the change
 * is part of the document that is later saved from the same {@link PdfReader}.
 */
class MyImageConverterListener implements RenderListener {
    protected String path = "";
    protected PdfReader reader;

    /**
     * @param path stored in {@link #path}; not otherwise used by this class
     */
    public MyImageConverterListener(String path) {
        this.path = path;
    }

    public void beginTextBlock() { }

    public void endTextBlock() { }

    /**
     * Converts the rendered image in place when its filter is JPXDecode; all other images
     * are left untouched. Failures are printed and do not abort parsing.
     *
     * @param renderInfo the image being rendered
     */
    public void renderImage(ImageRenderInfo renderInfo) {
        try {
            PdfImageObject image = renderInfo.getImage();
            if (image == null) {
                return;
            }
            // equals() is false for a filter array or a missing filter, so no cast is needed.
            if (!PdfName.JPXDECODE.equals(image.get(PdfName.FILTER))) {
                return;
            }
            // Work on the image's own stream. Creating a new PRStream here would produce an
            // object nothing in the document refers to, leaving the saved file unchanged.
            // NOTE(review): relies on getDictionary() returning the reader's stream instance
            // - confirm for the iText version in use.
            if (!(image.getDictionary() instanceof PRStream)) {
                return;
            }
            PRStream stream = (PRStream) image.getDictionary();

            BufferedImage bi = image.getBufferedImage();
            if (bi == null) {
                return;
            }
            int width = bi.getWidth();
            int height = bi.getHeight();
            ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
            // ImageIO.write returns false when no writer accepts the image; in that case the
            // buffer is empty and must not replace the original data.
            if (!ImageIO.write(bi, "JPG", imgBytes)) {
                return;
            }

            stream.clear();
            stream.setData(imgBytes.toByteArray(), false, PRStream.NO_COMPRESSION);
            stream.put(PdfName.TYPE, PdfName.XOBJECT);
            stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
            // Kept from the original: adds one randomly named entry; its purpose is not
            // evident from this code.
            stream.put(new PdfName("foo" + Math.random()), new PdfName("bar" + Math.random()));
            stream.put(PdfName.FILTER, PdfName.DCTDECODE);
            stream.put(PdfName.WIDTH, new PdfNumber(width));
            stream.put(PdfName.HEIGHT, new PdfNumber(height));
            stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
            stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void renderText(TextRenderInfo renderInfo) { }

    /**
     * @param r reader whose document is being processed
     */
    public void setReader(PdfReader r) {
        reader = r;
    }
}

So I managed to solve this one on my own (with a little help from "iText in Action" by Bruno Lowagie - great book). Just to reiterate, my intention is to scan a PDF using iText to see if it contains any JPEG2000 images and, if it does, output the same PDF with the inner JPEG2000 images replaced by regular JPEG images. This solves the fatal ghostscript 8.7 'Unable to process JPXDecode data' error, and would also prove useful for making PDFs iOS compatible.
So without further ado, kids; here goes...
Step 1) Download iText 5.x .jar file, and download jai_imageio-1.1.jar (the Java advanced imaging library that allows you to convert JPEG2000 files)
Step 2) Create a file called PDFConverter.java and put this code in it:
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.PdfNumber;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import com.itextpdf.text.pdf.PdfStamper;
import java.io.*;
/**
 * Command-line tool that rewrites a PDF so that every JPEG 2000 (JPXDecode) image stream is
 * replaced by a baseline JPEG (DCTDecode) stream.
 */
public class PDFConverter {

    /**
     * @param args exactly one element: the input PDF file name
     */
    public static void main(String[] args) {
        if (args.length == 1) {
            if (hasJpeg2000(args[0])) {
                System.out.println("Contains JPEG2000 images: Converting them to JPEG...");
                convertPDF(args[0]);
                System.out.println("Done...");
            } else {
                System.out.println("Doesn't contain any JPEG2000 images: Nothing to be done...");
            }
        } else {
            System.out.println("Please specify a PDF filename as a command line argument!");
        }
    }

    /**
     * Reports whether any stream object in the PDF uses the JPXDecode filter.
     *
     * @param s input PDF file name
     * @return {@code true} when a JPXDecode stream is found; {@code false} otherwise, including
     *         when the file cannot be read (the exception is printed)
     */
    public static boolean hasJpeg2000(String s) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(s);
            int n = reader.getXrefSize();
            for (int i = 0; i < n; i++) {
                if (isJpeg2000Stream(reader.getPdfObject(i))) {
                    return true;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        return false;
    }

    /**
     * Converts the JPEG 2000 images of {@code s} and writes the result to {@code out.pdf}.
     *
     * @param s input PDF file name
     */
    public static void convertPDF(String s) {
        convertPDF(s, "out.pdf");
    }

    /**
     * Converts the JPEG 2000 images of {@code s} and writes the result to {@code out}.
     * Any exception is printed and ends the conversion.
     *
     * @param s   input PDF file name
     * @param out output PDF file name
     */
    public static void convertPDF(String s, String out) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(s);
            int n = reader.getXrefSize();
            for (int i = 0; i < n; i++) {
                PdfObject object = reader.getPdfObject(i);
                if (!isJpeg2000Stream(object)) {
                    continue;
                }
                PRStream stream = (PRStream) object;
                // Decode before the stream is cleared below.
                PdfImageObject image = new PdfImageObject(stream);
                BufferedImage bi = image.getBufferedImage();
                if (bi == null) {
                    continue;
                }
                int width = bi.getWidth();
                int height = bi.getHeight();
                ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
                // ImageIO.write returns false when no writer accepts the image; the empty
                // buffer must not replace the original image data.
                if (!ImageIO.write(bi, "JPG", imgBytes)) {
                    System.err.println("No JPEG writer accepted the image in object " + i
                            + "; leaving it unchanged");
                    continue;
                }
                stream.clear();
                stream.setData(imgBytes.toByteArray(), false, PRStream.NO_COMPRESSION);
                stream.put(PdfName.TYPE, PdfName.XOBJECT);
                stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
                // Kept from the original: adds one randomly named entry; its purpose is not
                // evident from this code.
                stream.put(new PdfName("foo" + Math.random()), new PdfName("bar" + Math.random()));
                stream.put(PdfName.FILTER, PdfName.DCTDECODE);
                stream.put(PdfName.WIDTH, new PdfNumber(width));
                stream.put(PdfName.HEIGHT, new PdfNumber(height));
                stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
                stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
            }
            try (FileOutputStream target = new FileOutputStream(out)) {
                PdfStamper stamper = new PdfStamper(reader, target);
                stamper.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Tests the filter entry directly on the stream dictionary, so no image object has to be
     * constructed for streams that are not JPEG 2000 images (fonts, content streams, ...).
     * A filter that is an array or is absent yields {@code false} rather than a cast failure.
     */
    private static boolean isJpeg2000Stream(PdfObject object) {
        if (object == null || !object.isStream()) {
            return false;
        }
        PRStream stream = (PRStream) object;
        return PdfName.JPXDECODE.equals(stream.get(PdfName.FILTER));
    }
}
Step 3)Compile the above file in the following manner:
javac -cp .:iText-5.0.4.jar:jai_imageio-1.1.jar PDFConverter.java
Step 4)Run the program with a PDF...
java -cp .:iText-5.0.4.jar:jai_imageio-1.1.jar PDFConverter PDFFileName.pdf
Booyah...

Works great, but I had some issues with GlassFish v3.1. Glassfish acted as if there was no jai_imageio-1.1.jar in Classpath. I fixed this putting jai_imageio.jar in my "/path/to/glassfish/domains/domain1/lib/ext/" folder.

I had some NullPointerException problems with Reece's PDFConverter, because my PDF (the one that triggered the "GhostScript on CentOS 5.3 - Unable to process JPXDecode data" error) contained several different types of embedded objects. So I added some object/type checks and made the output filename a command-line argument.
Everything else is great and worked perfect to the image jpeg2000 problem. Thanks to Reece :)
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.*;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.PdfNumber;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import com.itextpdf.text.pdf.PdfStamper;
import java.io.*;
/**
 * Command-line tool that rewrites a PDF so that every JPEG 2000 (JPXDecode) image XObject is
 * replaced by a baseline JPEG (DCTDecode) stream. Handles filter entries that are either a
 * single name or an array of names.
 */
public class PDFConverter {

    /**
     * @param args two elements: the input PDF file name and the output PDF file name
     */
    public static void main(String[] args) {
        if (args.length == 2) {
            if (hasJpeg2000(args[0])) {
                System.out.println("Contains JPEG2000 images: Converting them to JPEG...");
                convertPDF(args[0], args[1]);
                System.out.println("Done...");
            } else {
                System.out.println("Doesn't contain any JPEG2000 images: Nothing to be done...");
            }
        } else {
            System.out.println("Please specify a PDF filename and a output filename as a command line arguments!");
        }
    }

    /**
     * Reports whether any image XObject in the PDF lists JPXDecode among its filters.
     *
     * @param s input PDF file name
     * @return {@code true} when such an image is found; {@code false} otherwise, including when
     *         the file cannot be read (the exception is printed)
     */
    public static boolean hasJpeg2000(String s) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(s);
            int n = reader.getXrefSize();
            for (int i = 0; i < n; i++) {
                PdfObject object = reader.getPdfObject(i);
                if (!isImageStream(object)) {
                    continue;
                }
                if (hasJpxFilter(((PRStream) object).get(PdfName.FILTER))) {
                    return true;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        return false;
    }

    /**
     * Re-encodes {@code stream} as JPEG when {@code filter} is JPXDecode.
     *
     * @return {@code true} when the stream was rewritten
     */
    private static boolean filterObject(PdfName filter, PRStream stream) throws java.io.IOException {
        if (!PdfName.JPXDECODE.equals(filter)) {
            return false;
        }
        // The image object is built only for JPX streams, and before the stream is cleared.
        PdfImageObject image = new PdfImageObject(stream);
        BufferedImage bi = image.getBufferedImage();
        if (bi == null) {
            return false;
        }
        int width = bi.getWidth();
        int height = bi.getHeight();
        ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
        // ImageIO.write returns false when no writer accepts the image; the empty buffer
        // must not replace the original image data.
        if (!ImageIO.write(bi, "JPG", imgBytes)) {
            return false;
        }
        stream.clear();
        stream.setData(imgBytes.toByteArray(), false, PRStream.NO_COMPRESSION);
        stream.put(PdfName.TYPE, PdfName.XOBJECT);
        stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
        // Kept from the original: adds one randomly named entry; its purpose is not evident
        // from this code.
        stream.put(new PdfName("foo" + Math.random()), new PdfName("bar" + Math.random()));
        stream.put(PdfName.FILTER, PdfName.DCTDECODE);
        stream.put(PdfName.WIDTH, new PdfNumber(width));
        stream.put(PdfName.HEIGHT, new PdfNumber(height));
        stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
        stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
        return true;
    }

    /**
     * Converts the JPEG 2000 images of {@code s} and writes the result to {@code out}.
     * Any exception is printed and ends the conversion.
     *
     * @param s   input PDF file name
     * @param out output PDF file name
     */
    public static void convertPDF(String s, String out) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(s);
            int n = reader.getXrefSize();
            for (int i = 0; i < n; i++) {
                PdfObject object = reader.getPdfObject(i);
                if (!isImageStream(object)) {
                    continue;
                }
                PRStream stream = (PRStream) object;
                PdfObject listOrName = stream.get(PdfName.FILTER);
                if (listOrName == null) {
                    // An image without a filter entry is not JPEG 2000.
                    continue;
                }
                if (listOrName instanceof PdfName) {
                    filterObject((PdfName) listOrName, stream);
                } else if (listOrName instanceof PdfArray) {
                    PdfArray list = (PdfArray) listOrName;
                    for (int j = 0; j < list.size(); j++) {
                        // Stop after the first rewrite: the stream now holds JPEG data.
                        if (filterObject(list.getAsName(j), stream)) {
                            break;
                        }
                    }
                } else {
                    System.err.println("Unknown Obejcttype: " + listOrName);
                }
            }
            try (FileOutputStream target = new FileOutputStream(out)) {
                PdfStamper stamper = new PdfStamper(reader, target);
                stamper.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /** Returns whether {@code object} is a stream whose subtype entry is Image. */
    private static boolean isImageStream(PdfObject object) {
        if (object == null || !object.isStream()) {
            return false;
        }
        return PdfName.IMAGE.equals(((PRStream) object).get(PdfName.SUBTYPE));
    }

    /** Returns whether {@code filter} is JPXDecode or an array containing JPXDecode. */
    private static boolean hasJpxFilter(PdfObject filter) {
        if (filter instanceof PdfName) {
            return PdfName.JPXDECODE.equals(filter);
        }
        if (filter instanceof PdfArray) {
            PdfArray list = (PdfArray) filter;
            for (int j = 0; j < list.size(); j++) {
                if (PdfName.JPXDECODE.equals(list.getAsName(j))) {
                    return true;
                }
            }
        }
        return false;
    }
}

Related

Extract Only Images from PDF File in java using Apache Tika or PDFBox? [duplicate]

I'm trying to extract images from a PDF using PDFBox. The example PDF is here.
But i m getting blank images only.
The code i m trying:-
/**
 * Runs the image extraction once; an {@link IOException} is reported on standard output.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    PDFImageExtract extractor = new PDFImageExtract();
    try {
        extractor.read_pdf();
    } catch (IOException failure) {
        System.out.println("" + failure);
    }
}
/**
 * Writes every image listed in each page's resources to
 * {@code C:\Users\Pradyut\Documents\image<N>}, numbering from 1.
 *
 * @throws IOException if the PDF cannot be loaded or an image cannot be written
 */
void read_pdf() throws IOException {
    // A load failure now propagates; previously it was printed and the next line
    // dereferenced a null document.
    PDDocument document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
    try {
        List pages = document.getDocumentCatalog().getAllPages();
        int i = 1;
        for (Object entry : pages) {
            PDPage page = (PDPage) entry;
            PDResources resources = page.getResources();
            if (resources == null) {
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object value : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) value;
                image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
                i++;
            }
        }
    } finally {
        document.close();
    }
}
Thanks
Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. This is different than the other code in that it will recurse through the document instead of trying to get the images from the top level.
/**
 * Gathers the images reachable from the resources of every page in the document.
 *
 * @param document the document to scan
 * @return the images in page order
 * @throws IOException if a resource cannot be read
 */
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
    List<RenderedImage> collected = new ArrayList<>();
    for (PDPage current : document.getPages()) {
        PDResources pageResources = current.getResources();
        List<RenderedImage> found = getImagesFromResources(pageResources);
        collected.addAll(found);
    }
    return collected;
}
/**
 * Collects the images named in {@code resources}, descending into form XObjects.
 *
 * @param resources the resources to scan; {@code null} (a page or form without a resource
 *                  dictionary) yields an empty list
 * @return the images found, in the order the XObject names are returned
 * @throws IOException if an XObject cannot be read
 */
private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
    List<RenderedImage> images = new ArrayList<>();
    if (resources == null) {
        return images;
    }
    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);
        if (xObject instanceof PDFormXObject) {
            images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
        } else if (xObject instanceof PDImageXObject) {
            images.add(((PDImageXObject) xObject).getImage());
        }
    }
    return images;
}
The GetImagesFromPDF Java class below gets all images in the 04-Request-Headers.pdf file and saves them into the destination folder PDFCopy.
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
#SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
public static void main(String[] args) {
try {
String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
String destinationDir = "C:/PDFCopy/";
File oldFile = new File(sourceDir);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "_cover");
int totalImages = 1;
for (PDPage page : list) {
PDResources pdResources = page.getResources();
Map pageImages = pdResources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
pdxObjectImage.write2file(destinationDir + fileName+ "_" + totalImages);
totalImages++;
}
}
}
} else {
System.err.println("File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.
/**
 * Writes every image XObject found directly in the page resources of {@code D:/Temp/Test.pdf}
 * as a PNG file named after {@link System#nanoTime()} in {@code D:/Temp/}.
 *
 * @throws Exception if the document cannot be loaded or an image cannot be written
 */
public static void testPDFBoxExtractImages() throws Exception {
    // try-with-resources closes the document on every exit path.
    try (PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"))) {
        for (PDPage page : document.getPages()) {
            PDResources pdResources = page.getResources();
            if (pdResources == null) {
                // Page without a resource dictionary: nothing to extract.
                continue;
            }
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    File file = new File("D:/Temp/" + System.nanoTime() + ".png");
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", file);
                }
            }
        }
    }
}
Just add the .jpeg to the end of your path:
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");
That works for me.
You can use PDPage.convertToImage() function which can convert the PDF page into a BufferedImage. Next you can use the BufferedImage to create an Image.
Use the following reference for further detail:
All PDF-related classes in PDFBox can be found in the
Apache PDFBox 1.8.3 API
Here you can see PDPage related documentation.
And do not forget to look for PDPage.convertToImage() function in PDPage class.
This is a Kotlin version of @Matt's answer.
/**
 * Applies [block] to every image reachable from these resources, descending into form
 * XObjects, and returns the results in visiting order.
 */
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> {
    val results = mutableListOf<R>()
    for (name in this.xObjectNames) {
        val xObject = this.getXObject(name)
        if (xObject is PDFormXObject) {
            results.addAll(xObject.resources.onImageResources(block))
        } else if (xObject is PDImageXObject) {
            results.add(block(xObject.image))
        }
    }
    return results
}
You can use it on PDPage Resources like this:
// Writes each image to its own temp file; the lambda yields the file's path.
page.resources.onImageResources { image ->
    Files.createTempFile("image", "xxx").also { path ->
        // ImageIO.write returns false when no writer exists for the format; the exception
        // has to be thrown, not merely constructed.
        if (!ImageIO.write(image, "xxx", path.toFile()))
            throw IllegalStateException("Couldn't write image to file")
    }
}
Where "xxx" is the format you need (like "jpeg")
For someone who want just copy and paste this ready to use code
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
#Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
Instead of calling
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:
File outputFile = new File( "C:\\Users\\Pradyut\\Documents\\image" + i + ".png");
ImageIO.write( image.getRGBImage(), "png", outputFile);

How change pdf file in image (jpeg, jpg ) format and crop

How to convert pdf file to image(jpg ,jpeg..) format and how to crop the image as well. Any pointers(any existing API maybe)?
import java.io.File;
/** Renders each page of {@code Sample.pdf} to {@code output<index>.jpg}. */
public class PDFtoJPGConverter {

    /**
     * @param args ignored; any exception is printed
     */
    public static void main(String[] args) {
        try {
            PDFDocument pdf = new PDFDocument();
            pdf.loadPDF("Sample.pdf");
            int total = pdf.getPageCount();
            int pageIndex = 0;
            while (pageIndex < total) {
                BufferedImage rendered = pdf.toImage(pageIndex);
                File target = new File("output" + pageIndex + ".jpg");
                ImageIO.write(rendered, "jpg", target);
                pageIndex++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
You could try pdf-renderer it is a pure java solution.
Code Solution:
package com.pdf.pdfbox.examples;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
#SuppressWarnings("unchecked")
public class ConvertPDFPagesToImages {
public static void main(String[] args) {
try {
String sourceDir = "C:/Documents/04-Request-Headers.pdf"; // Pdf files are read from this folder
String destinationDir = "C:/Documents/Converted_PdfFiles_to_Image/"; // converted images from pdf document are saved here
File sourceFile = new File(sourceDir);
File destinationFile = new File(destinationDir);
if (!destinationFile.exists()) {
destinationFile.mkdir();
System.out.println("Folder Created -> "+ destinationFile.getAbsolutePath());
}
if (sourceFile.exists()) {
System.out.println("Images copied to Folder: "+ destinationFile.getName());
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
System.out.println("Total files to be converted -> "+ list.size());
String fileName = sourceFile.getName().replace(".pdf", "");
int pageNumber = 1;
for (PDPage page : list) {
BufferedImage image = page.convertToImage();
File outputfile = new File(destinationDir + fileName +"_"+ pageNumber +".png");
System.out.println("Image Created -> "+ outputfile.getName());
ImageIO.write(image, "png", outputfile);
pageNumber++;
}
document.close();
System.out.println("Converted Images are saved at -> "+ destinationFile.getAbsolutePath());
} else {
System.err.println(sourceFile.getName() +" File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}

Converting PDF to PDF/A with PDFBox

currently I'm trying to convert a PDF to PDF/A.
However somehow I don't know if I can convert the colorspace is there any way by doing so?
this is my code, yet:
PDDocumentInformation info = doc.getDocumentInformation();
System.out.println("Page Count=" + doc.getNumberOfPages());
System.out.println("Title=" + info.getTitle());
System.out.println("Author=" + info.getAuthor());
System.out.println("Subject=" + info.getSubject());
System.out.println("Keywords=" + info.getKeywords());
System.out.println("Creator=" + info.getCreator());
System.out.println("Producer=" + info.getProducer());
System.out.println("Creation Date=" + info.getCreationDate());
System.out.println("Modification Date=" + info.getModificationDate());
System.out.println("Trapped=" + info.getTrapped());
PDDocumentCatalog cat = doc.getDocumentCatalog();
XMPMetadata xmp = XMPMetadata.createXMPMetadata();
PDFAIdentificationSchema pdfaid = xmp.createAndAddPFAIdentificationSchema();
pdfaid.setConformance("A");
pdfaid.setPart(3);
pdfaid.setAboutAsSimple(null);
DublinCoreSchema dublinCoreSchema = xmp.createAndAddDublinCoreSchema();
dublinCoreSchema.setTitle(info.getTitle());
dublinCoreSchema.addCreator(info.getAuthor());
AdobePDFSchema adobePDFSchema = xmp.createAndAddAdobePDFSchema();
adobePDFSchema.setProducer(info.getProducer());
XMPBasicSchema xmpBasicSchema = xmp.createAndAddXMPBasicSchema();
xmpBasicSchema.setCreatorTool(info.getCreator());
xmpBasicSchema.setCreateDate(info.getCreationDate());
xmpBasicSchema.setModifyDate(info.getModificationDate());
xmp.addSchema(pdfaid);
XmpSerializer serializer = new XmpSerializer();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
serializer.serialize(xmp, baos, true);
InputStream colorProfile = PdfConverter.class.getResourceAsStream("/sRGBColorSpaceProfile.icm");
PDOutputIntent oi = new PDOutputIntent(doc, colorProfile);
oi.setInfo("sRGB IEC61966-2.1");
oi.setOutputCondition("sRGB IEC61966-2.1");
oi.setOutputConditionIdentifier("sRGB IEC61966-2.1");
oi.setRegistryName("http://www.color.org");
cat.addOutputIntent(oi);
PDMetadata metadata = new PDMetadata(doc);
metadata.importXMPMetadata(baos.toByteArray());
cat.setMetadata(metadata);
The colorspace gets added however on validation i get:
2.3.2 : Unexpected key in Graphic object definition, The ColorSpace is unknown
For every page/element whatever, it appears quite often.
Could I do anything about it? Like converting the ColorSpace, or using another library?
I have found this trick to convert pdf to pdfA.
Fill the PDF form
Convert it to image
Create a valid PDFA form as explained in PDFBox website
Fill the image created as the result
In this example, I used : OoPdfFormExample.pdf that can be found easily in internet.
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.preflight.Format;
import org.apache.pdfbox.preflight.PreflightDocument;
import org.apache.pdfbox.preflight.ValidationResult;
import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.xml.XmpSerializer;
import javax.xml.transform.TransformerException;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Random;
/**
 * Produces a PDF/A-1b file from a PDF form by filling and flattening the form, rendering it
 * to PNG, embedding the image in a new PDF with XMP metadata and an output intent, and
 * finally validating the result with PDFBox Preflight.
 */
public class CreatePDFAFile {
    private static final String OUTPUT_DIR = "tmp";
    static String separator = FileSystems.getDefault().getSeparator();

    /**
     * Runs the whole pipeline on {@code template/OoPdfFormExample.pdf} inside a freshly
     * created random sub-directory of {@code tmp}.
     */
    public static void main(String[] args) throws IOException {
        Path tmpDir = getRandomPath();
        String fileInput = fillForm("template/OoPdfFormExample.pdf", tmpDir);
        String image = PDF2Image(fileInput, tmpDir);
        String pdfa = createPDFA(image, tmpDir);
        checkPDFAValidation(pdfa);
    }

    /**
     * Sets the first form field to {@code "TEST"}, flattens the form and saves the document
     * as {@code FillForm.pdf} in {@code path}. A document without an AcroForm is saved as is.
     *
     * @return the path of the saved file
     */
    private static String fillForm(String formTemplate, Path path) throws IOException {
        String fileOut = path + separator + "FillForm.pdf";
        try (PDDocument pdfDocument = PDDocument.load(new File(formTemplate))) {
            PDAcroForm acroForm = pdfDocument.getDocumentCatalog().getAcroForm();
            if (acroForm != null) {
                if (!acroForm.getFields().isEmpty()) {
                    acroForm.getField(acroForm.getFields().get(0).getFullyQualifiedName()).setValue("TEST");
                }
                // Kept inside the null check: these calls previously ran even when the
                // document had no form.
                acroForm.refreshAppearances();
                acroForm.flatten();
            }
            pdfDocument.save(fileOut);
        }
        return fileOut;
    }

    /**
     * Renders every page at 300 DPI to {@code image-<page>.png} in {@code path}.
     *
     * @return the file name of the last page written, or an empty string when nothing was
     *         written; an {@link IOException} is reported on standard error
     */
    public static String PDF2Image(String fileInput, Path path) {
        String fileName = "";
        try (final PDDocument document = PDDocument.load(new File(fileInput))) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
                fileName = path + separator + "image-" + page + ".png";
                ImageIOUtil.writeImage(bim, fileName, 300);
            }
        } catch (IOException e) {
            System.err.println("Exception while trying to create pdf document - " + e);
        }
        return fileName;
    }

    /**
     * Creates {@code FinalPDFAFile.pdf} in {@code path}: a one-page document with an embedded
     * font, PDF/A-1b identification metadata, an output intent read from
     * {@code template/sRGB.icc}, and the given image drawn at one fifth of its pixel size.
     *
     * @return the path of the saved file
     */
    public static String createPDFA(String imagePath, Path path) throws IOException {
        try (PDDocument doc = new PDDocument()) {
            PDPage page = new PDPage();
            doc.addPage(page);
            PDFont font = PDType0Font.load(doc, new File("template" + separator + "LiberationSans-Regular.ttf"));
            if (!font.isEmbedded()) {
                throw new IllegalStateException("PDF/A compliance requires that all fonts used for"
                        + " text rendering in rendering modes other than rendering mode 3 are embedded.");
            }
            try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
                contents.beginText();
                contents.setFont(font, 12);
                contents.newLineAtOffset(100, 700);
                contents.showText("");
                contents.endText();
            }
            // add XMP metadata
            XMPMetadata xmp = XMPMetadata.createXMPMetadata();
            String fileName = path + separator + "FinalPDFAFile.pdf";
            try {
                DublinCoreSchema dc = xmp.createAndAddDublinCoreSchema();
                dc.setTitle(fileName);
                PDFAIdentificationSchema id = xmp.createAndAddPFAIdentificationSchema();
                id.setPart(1);
                id.setConformance("B");
                XmpSerializer serializer = new XmpSerializer();
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                serializer.serialize(xmp, baos, true);
                PDMetadata metadata = new PDMetadata(doc);
                metadata.importXMPMetadata(baos.toByteArray());
                doc.getDocumentCatalog().setMetadata(metadata);
            } catch (BadFieldValueException | TransformerException e) {
                throw new IllegalArgumentException(e);
            }
            // sRGB output intent; the profile stream is closed once it has been consumed.
            try (InputStream colorProfile = new FileInputStream(new File("template/sRGB.icc"))) {
                PDOutputIntent intent = new PDOutputIntent(doc, colorProfile);
                intent.setInfo("");
                intent.setOutputCondition("");
                intent.setOutputConditionIdentifier("");
                intent.setRegistryName("");
                doc.getDocumentCatalog().addOutputIntent(intent);
            }
            PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc);
            try (PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
                float scale = 1 / 5f;
                contentStream.drawImage(pdImage, 20, 20, pdImage.getWidth() * scale, pdImage.getHeight() * scale);
            }
            doc.save(fileName);
            return fileName;
        }
    }

    /**
     * Validates the file against PDF/A-1b and prints either a success line or every
     * validation error.
     */
    private static void checkPDFAValidation(String fileName) throws IOException {
        ValidationResult result = null;
        PreflightParser parser = new PreflightParser(fileName);
        try {
            parser.parse(Format.PDF_A1B);
            PreflightDocument document = parser.getPreflightDocument();
            document.validate();
            // Get validation result
            result = document.getResult();
            document.close();
        } catch (SyntaxValidationException e) {
            result = e.getResult();
        }
        if (result.isValid()) {
            System.out.println("The file " + fileName + " is a valid PDF/A-1b file");
        } else {
            System.out.println("The file " + fileName + " is not valid, error(s) :");
            for (ValidationResult.ValidationError error : result.getErrorsList()) {
                System.out.println(error.getErrorCode() + " : " + error.getDetails());
            }
        }
    }

    /**
     * Creates and returns a randomly named directory below {@code tmp}.
     */
    private static Path getRandomPath() throws IOException {
        String path = generateRandom();
        Path tmpDir = Paths.get(OUTPUT_DIR + separator + path + separator);
        // createDirectories also creates the parent "tmp" folder when it is missing.
        Files.createDirectories(tmpDir);
        return tmpDir;
    }

    /**
     * Returns a 17-character string drawn from upper-case letters and digits.
     */
    private static String generateRandom() {
        String aToZ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890";
        Random rand = new Random();
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < 17; i++) {
            int randIndex = rand.nextInt(aToZ.length());
            res.append(aToZ.charAt(randIndex));
        }
        return res.toString();
    }
}

itext Converting PDF to csv

I am trying to use itext framework to convert a pdf file into a csv for import into excel.
The output is garbled, and I presume I am missing a step with regard to format conversion; however, I can't seem to find the information on the iText site and am looking for assistance.
Current is as below.
package com.pdf.convert;
import java.io.FileOutputStream;
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfWriter;
/**
 * Copies page 2 of the input PDF, wrapped as an image, into a new document written to
 * {@link #OUTPUTFILE}.
 *
 * <p>Note: the bytes written are whatever {@code PdfWriter} produces, regardless of the
 * {@code .csv} file name; no text extraction takes place in this class.
 */
public class ThirdPDF {
    private static final String INPUTFILE = "/location/test.pdf";
    private static final String OUTPUTFILE = "/location/test.csv";

    /**
     * @param args ignored
     * @throws DocumentException if the output document cannot be built
     * @throws IOException       if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws DocumentException,
            IOException {
        // The stream is owned here so it is released even when document creation fails.
        try (FileOutputStream out = new FileOutputStream(OUTPUTFILE)) {
            Document document = new Document();
            PdfWriter writer = PdfWriter.getInstance(document, out);
            document.open();
            PdfReader reader = new PdfReader(INPUTFILE);
            try {
                int n = reader.getNumberOfPages();
                // Only page number 2 will be included
                if (n >= 2) {
                    PdfImportedPage page = writer.getImportedPage(reader, 2);
                    Image instance = Image.getInstance(page);
                    document.add(instance);
                }
                document.close();
            } finally {
                // Closed last: the reader stays open until the document has been closed.
                reader.close();
            }
        }
    }
}
Converting PDF file to CSV file.
Present Directory and File creation is based on Android Framework.
Change your path and Directory as per your Framework Accordingly.
/**
 * Extracts the text of every page of the PDF and writes it, one trimmed page per line block,
 * to {@code MyCSVFile.csv} in the {@code Mycsv} folder on external storage.
 *
 * <p>The text is written as extracted (UTF-8 encoded); no splitting into columns is done.
 * File and I/O errors are printed, not rethrown.
 *
 * @param pdfFilePath path of the PDF to read
 */
private void convertPDFToCSV(String pdfFilePath) {
    String myfolder = Environment.getExternalStorageDirectory() + "/Mycsv";
    if (!createFolder(myfolder)) {
        return;
    }
    try (FileOutputStream fos = new FileOutputStream(myfolder + "/MyCSVFile.csv")) {
        PdfReader reader1 = new PdfReader(pdfFilePath);
        try {
            StringBuilder parsedText = new StringBuilder();
            int n = reader1.getNumberOfPages();
            for (int i = 1; i <= n; i++) {
                // Append only the new page; appending the builder to itself repeated all
                // earlier pages on every iteration.
                parsedText.append(PdfTextExtractor.getTextFromPage(reader1, i).trim()).append("\n");
            }
            // Encode explicitly; writing each char with write(int) kept only its low byte.
            fos.write(parsedText.toString().getBytes("UTF-8"));
        } finally {
            reader1.close();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Ensures that a directory exists at the given path, creating it and any missing parent
 * directories when necessary.
 *
 * @param myfolder path of the directory that must exist
 * @return {@code true} if a directory exists at {@code myfolder} when this method returns;
 *     {@code false} if it could not be created or the path is occupied by a non-directory
 */
private boolean createFolder(String myfolder) {
    File dir = new File(myfolder);
    if (dir.isDirectory()) {
        return true;
    }
    // mkdirs() returns false when the directory was created concurrently by someone else,
    // so re-check before reporting failure.
    return dir.mkdirs() || dir.isDirectory();
}

How to solve this iText PDF error?

I'm merging two PDF files (test1.pdf & test2.pdf) using iText and writing the output to test_result.pdf. However, the output pages do not come out like the input pages: they are cropped to half of the actual size. How can I overcome this error? Here is my code:
public class MergePDF {
/**
 * Merges {@code test1.pdf} and {@code test2.pdf} into {@code /home/ant000112/merge_result.pdf},
 * with page numbering enabled.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<InputStream> pdfs = new ArrayList<InputStream>();
    try {
        pdfs.add(new FileInputStream("test1.pdf"));
        // No trailing slash: "test2.pdf/" names a directory and cannot be opened as a file.
        pdfs.add(new FileInputStream("test2.pdf"));
        OutputStream output = new FileOutputStream("/home/ant000112/merge_result.pdf");
        // concatPDFs closes the output stream itself.
        MergePDF.concatPDFs(pdfs, output, true);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // The input streams are opened here, so they are released here as well.
        for (InputStream pdf : pdfs) {
            try {
                pdf.close();
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
    }
}
/**
 * Concatenates the given PDF documents, in list order, into one PDF written to
 * {@code outputStream}.
 *
 * <p>Every output page takes the size of the source page it is copied from, so pages that are
 * larger than the {@code Document} default are no longer cropped. Failures are printed and
 * otherwise swallowed. The output stream is always closed before this method returns; the input
 * streams are left for the caller to close.
 *
 * <p>NOTE(review): the source page is placed unrotated at the origin, so a page rotation entry
 * or a page box that does not start at (0,0) is not compensated for — confirm against the
 * inputs you expect.
 *
 * @param streamOfPDFFiles the source PDFs, in the order they should appear in the result
 * @param outputStream destination of the merged PDF; closed by this method
 * @param paginate when {@code true}, stamps "<page> of <total>" near the bottom right of each page
 */
public static void concatPDFs(List<InputStream> streamOfPDFFiles,
        OutputStream outputStream, boolean paginate) {
    // Distance from the right page edge to the centre of the page-number text. On the default
    // 595pt-wide page this gives x = 520, the position used before page sizes became variable.
    final float pageNumberRightOffset = 75f;
    final float pageNumberY = 5f;

    Document document = new Document();
    List<PdfReader> readers = new ArrayList<PdfReader>();
    try {
        // Create readers for the PDFs and count the pages up front for the "x of y" footer.
        int totalPages = 0;
        for (InputStream pdf : streamOfPDFFiles) {
            PdfReader pdfReader = new PdfReader(pdf);
            readers.add(pdfReader);
            totalPages += pdfReader.getNumberOfPages();
        }

        // Create a writer for the output stream.
        PdfWriter writer = PdfWriter.getInstance(document, outputStream);
        // The first output page is set up when the document is opened, so its size has to be
        // chosen before open(); later pages are sized just before each newPage() call.
        for (PdfReader pdfReader : readers) {
            if (pdfReader.getNumberOfPages() > 0) {
                document.setPageSize(pdfReader.getPageSize(1));
                break;
            }
        }
        document.open();

        BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA, BaseFont.CP1257, BaseFont.NOT_EMBEDDED);
        PdfContentByte cb = writer.getDirectContent(); // Holds the PDF data
        int currentPageNumber = 0;

        // Loop through the PDF files and add each of their pages to the output.
        for (PdfReader pdfReader : readers) {
            int pageCount = pdfReader.getNumberOfPages();
            for (int sourcePage = 1; sourcePage <= pageCount; sourcePage++) {
                // Match the target page to the source page; keeping the default size crops
                // any source page that is larger.
                document.setPageSize(pdfReader.getPageSize(sourcePage));
                document.newPage();
                currentPageNumber++;
                PdfImportedPage page = writer.getImportedPage(pdfReader, sourcePage);
                cb.addTemplate(page, 0, 0);

                // Code for pagination.
                if (paginate) {
                    float pageWidth = pdfReader.getPageSize(sourcePage).getWidth();
                    cb.beginText();
                    cb.setFontAndSize(bf, 9);
                    cb.showTextAligned(PdfContentByte.ALIGN_CENTER,
                            "" + currentPageNumber + " of " + totalPages,
                            pageWidth - pageNumberRightOffset, pageNumberY, 0);
                    cb.endText();
                }
            }
        }
        document.close();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (document.isOpen()) {
                document.close();
            }
        } catch (RuntimeException e) {
            // Closing can fail again after an earlier error; still release the resources below.
            e.printStackTrace();
        }
        // Readers are closed only after the document, which is when imported pages are written.
        for (PdfReader pdfReader : readers) {
            pdfReader.close();
        }
        try {
            if (outputStream != null) {
                outputStream.close();
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}
Did you try reading the examples on the iText site? It appears you're doing it in a different way; for instance, you're not using PdfCopy.
/*
* This class is part of the book "iText in Action - 2nd Edition"
* written by Bruno Lowagie (ISBN: 9781935182610)
* For more info, go to: http://itextpdf.com/examples/
* This example only works with the AGPL version of iText.
*/
package part2.chapter06;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.SQLException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfReader;
import part1.chapter02.MovieHistory;
import part1.chapter02.MovieLinks1;
/**
 * Concatenates the PDFs produced by two earlier examples into a single file using
 * {@code PdfCopy}.
 */
public class Concatenate {
    /** The resulting PDF file. */
    public static final String RESULT
        = "results/part2/chapter06/concatenated.pdf";

    /**
     * Main method: creates the two source PDFs, then copies all of their pages, in order,
     * into {@link #RESULT}.
     *
     * @param args no arguments needed
     * @throws DocumentException propagated from the iText calls below
     * @throws IOException if a source file cannot be read or the result cannot be written
     * @throws SQLException presumably raised by the two source examples, which are not
     *     visible here
     */
    public static void main(String[] args)
        throws IOException, DocumentException, SQLException {
        // using previous examples to create PDFs
        MovieLinks1.main(args);
        MovieHistory.main(args);
        String[] files = { MovieLinks1.RESULT, MovieHistory.RESULT };
        // step 1
        Document document = new Document();
        // step 2
        PdfCopy copy = new PdfCopy(document, new FileOutputStream(RESULT));
        // step 3
        document.open();
        // step 4: loop over the documents you want to concatenate
        for (String file : files) {
            PdfReader reader = new PdfReader(file);
            try {
                // loop over the pages in that document (page numbers are 1-based)
                int n = reader.getNumberOfPages();
                for (int page = 1; page <= n; page++) {
                    copy.addPage(copy.getImportedPage(reader, page));
                }
                copy.freeReader(reader);
            } finally {
                // Release the source file once its pages have been handed to the copy.
                reader.close();
            }
        }
        // step 5
        document.close();
    }
}
http://itextpdf.com/examples/iia.php?id=123
EDIT: Just to be fair I downloaded the library and I tried the example. It works like a charm.

Categories

Resources