How change pdf file in image (jpeg, jpg ) format and crop - java

How to convert pdf file to image(jpg ,jpeg..) format and how to crop the image as well. Any pointers(any existing API maybe)?
import java.io.File;
public class PDFtoJPGConverter {
public static void main(String[] args)
{
try
{
PDFDocument doc = new PDFDocument();
doc.loadPDF("Sample.pdf");
int pageCount = doc.getPageCount();
for(int i = 0; i < pageCount; i++)
{
BufferedImage image = doc.toImage(i);
ImageIO.write(image,"jpg", new File("output"+ i +".jpg"));
}
}
catch (Exception e) {
e.printStackTrace();
}
}
}

You could try pdf-renderer it is a pure java solution.
Code Solution:
package com.pdf.pdfbox.examples;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
#SuppressWarnings("unchecked")
public class ConvertPDFPagesToImages {
public static void main(String[] args) {
try {
String sourceDir = "C:/Documents/04-Request-Headers.pdf"; // Pdf files are read from this folder
String destinationDir = "C:/Documents/Converted_PdfFiles_to_Image/"; // converted images from pdf document are saved here
File sourceFile = new File(sourceDir);
File destinationFile = new File(destinationDir);
if (!destinationFile.exists()) {
destinationFile.mkdir();
System.out.println("Folder Created -> "+ destinationFile.getAbsolutePath());
}
if (sourceFile.exists()) {
System.out.println("Images copied to Folder: "+ destinationFile.getName());
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
System.out.println("Total files to be converted -> "+ list.size());
String fileName = sourceFile.getName().replace(".pdf", "");
int pageNumber = 1;
for (PDPage page : list) {
BufferedImage image = page.convertToImage();
File outputfile = new File(destinationDir + fileName +"_"+ pageNumber +".png");
System.out.println("Image Created -> "+ outputfile.getName());
ImageIO.write(image, "png", outputfile);
pageNumber++;
}
document.close();
System.out.println("Converted Images are saved at -> "+ destinationFile.getAbsolutePath());
} else {
System.err.println(sourceFile.getName() +" File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}

Related

Extract Only Images from PDF File in java using Apache Tika or PDFBox? [duplicate]

I m trying to extract images from a pdf using pdfbox. The example pdf here
But i m getting blank images only.
The code i m trying:-
public static void main(String[] args) {
PDFImageExtract obj = new PDFImageExtract();
try {
obj.read_pdf();
} catch (IOException ex) {
System.out.println("" + ex);
}
}
void read_pdf() throws IOException {
PDDocument document = null;
try {
document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
} catch (IOException ex) {
System.out.println("" + ex);
}
List pages = document.getDocumentCatalog().getAllPages();
Iterator iter = pages.iterator();
int i =1;
String name = null;
while (iter.hasNext()) {
PDPage page = (PDPage) iter.next();
PDResources resources = page.getResources();
Map pageImages = resources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
i ++;
}
}
}
}
Thanks
Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. This is different than the other code in that it will recurse through the document instead of trying to get the images from the top level.
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
List<RenderedImage> images = new ArrayList<>();
for (PDPage page : document.getPages()) {
images.addAll(getImagesFromResources(page.getResources()));
}
return images;
}
private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
List<RenderedImage> images = new ArrayList<>();
for (COSName xObjectName : resources.getXObjectNames()) {
PDXObject xObject = resources.getXObject(xObjectName);
if (xObject instanceof PDFormXObject) {
images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
} else if (xObject instanceof PDImageXObject) {
images.add(((PDImageXObject) xObject).getImage());
}
}
return images;
}
The below GetImagesFromPDF java class get all images in 04-Request-Headers.pdf file and save those files into destination folder PDFCopy.
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
#SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
public static void main(String[] args) {
try {
String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
String destinationDir = "C:/PDFCopy/";
File oldFile = new File(sourceDir);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "_cover");
int totalImages = 1;
for (PDPage page : list) {
PDResources pdResources = page.getResources();
Map pageImages = pdResources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
pdxObjectImage.write2file(destinationDir + fileName+ "_" + totalImages);
totalImages++;
}
}
}
} else {
System.err.println("File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.
public static void testPDFBoxExtractImages() throws Exception {
PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"));
PDPageTree list = document.getPages();
for (PDPage page : list) {
PDResources pdResources = page.getResources();
for (COSName c : pdResources.getXObjectNames()) {
PDXObject o = pdResources.getXObject(c);
if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
File file = new File("D:/Temp/" + System.nanoTime() + ".png");
ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject)o).getImage(), "png", file);
}
}
}
}
Just add the .jpeg to the end of your path:
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");
That works for me.
You can use PDPage.convertToImage() function which can convert the PDF page into a BufferedImage. Next you can use the BufferedImage to create an Image.
Use the following reference for further detail:
All PDF realated classes in PDFBox you can get in
Apache PDFBox 1.8.3 API
Here you can see PDPage related documentation.
And do not forget to look for PDPage.convertToImage() function in PDPage class.
This is a kotlin version of #Matt's answer.
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> =
this.xObjectNames.flatMap {
when (val xObject = this.getXObject(it)) {
is PDFormXObject -> xObject.resources.onImageResources(block)
is PDImageXObject -> listOf(block(xObject.image))
else -> emptyList()
}
}
You can use it on PDPage Resources like this:
page.resources.onImageResources { image ->
Files.createTempFile("image", "xxx").also { path->
if(!ImageIO.write(it, "xxx", file.toFile()))
IllegalStateException("Couldn't write image to file")
}
}
Where "xxx" is the format you need (like "jpeg")
For someone who want just copy and paste this ready to use code
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
#Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}
Instead of calling
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:
File outputFile = new File( "C:\\Users\\Pradyut\\Documents\\image" + i + ".png");
ImageIO.write( image.getRGBImage(), "png", outputFile);

Converting PDF to PDF/A with PDFBox

currently I'm trying to convert a PDF to PDF/A.
However somehow I don't know if I can convert the colorspace is there any way by doing so?
this is my code, yet:
PDDocumentInformation info = doc.getDocumentInformation();
System.out.println("Page Count=" + doc.getNumberOfPages());
System.out.println("Title=" + info.getTitle());
System.out.println("Author=" + info.getAuthor());
System.out.println("Subject=" + info.getSubject());
System.out.println("Keywords=" + info.getKeywords());
System.out.println("Creator=" + info.getCreator());
System.out.println("Producer=" + info.getProducer());
System.out.println("Creation Date=" + info.getCreationDate());
System.out.println("Modification Date=" + info.getModificationDate());
System.out.println("Trapped=" + info.getTrapped());
PDDocumentCatalog cat = doc.getDocumentCatalog();
XMPMetadata xmp = XMPMetadata.createXMPMetadata();
PDFAIdentificationSchema pdfaid = xmp.createAndAddPFAIdentificationSchema();
pdfaid.setConformance("A");
pdfaid.setPart(3);
pdfaid.setAboutAsSimple(null);
DublinCoreSchema dublinCoreSchema = xmp.createAndAddDublinCoreSchema();
dublinCoreSchema.setTitle(info.getTitle());
dublinCoreSchema.addCreator(info.getAuthor());
AdobePDFSchema adobePDFSchema = xmp.createAndAddAdobePDFSchema();
adobePDFSchema.setProducer(info.getProducer());
XMPBasicSchema xmpBasicSchema = xmp.createAndAddXMPBasicSchema();
xmpBasicSchema.setCreatorTool(info.getCreator());
xmpBasicSchema.setCreateDate(info.getCreationDate());
xmpBasicSchema.setModifyDate(info.getModificationDate());
xmp.addSchema(pdfaid);
XmpSerializer serializer = new XmpSerializer();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
serializer.serialize(xmp, baos, true);
InputStream colorProfile = PdfConverter.class.getResourceAsStream("/sRGBColorSpaceProfile.icm");
PDOutputIntent oi = new PDOutputIntent(doc, colorProfile);
oi.setInfo("sRGB IEC61966-2.1");
oi.setOutputCondition("sRGB IEC61966-2.1");
oi.setOutputConditionIdentifier("sRGB IEC61966-2.1");
oi.setRegistryName("http://www.color.org");
cat.addOutputIntent(oi);
PDMetadata metadata = new PDMetadata(doc);
metadata.importXMPMetadata(baos.toByteArray());
cat.setMetadata(metadata);
The colorspace gets added however on validation i get:
2.3.2 : Unexpected key in Graphic object definition, The ColorSpace is unknown
For every page/element whatever, it appears quite often.
Could I do anything against it? Like converting the ColorsSpace? Using antoher library?
I have found this trick to convert pdf to pdfA.
Fill the PDF form
Convert it to image
Create a valid PDFA form as explained in PDFBox website
Fill the image created as the result
In this example, I used : OoPdfFormExample.pdf that can be found easily in internet.
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.preflight.Format;
import org.apache.pdfbox.preflight.PreflightDocument;
import org.apache.pdfbox.preflight.ValidationResult;
import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.xml.XmpSerializer;
import javax.xml.transform.TransformerException;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Random;
public class CreatePDFAFile {
private static final String OUTPUT_DIR = "tmp";
static String separator = FileSystems.getDefault().getSeparator();
public static void main(String[] args) throws IOException {
Path tmpDir = getRandomPath();
String fileInput = fillForm("template/OoPdfFormExample.pdf", tmpDir);
String image = PDF2Image(fileInput, tmpDir);
String pdfa = createPDFA(image, tmpDir);
checkPDFAValidation(pdfa);
}
private static String fillForm(String formTemplate, Path path) throws IOException {
String fileOut = path + separator + "FillForm.pdf";
try (PDDocument pdfDocument = PDDocument.load(new File(formTemplate))) {
PDAcroForm acroForm = pdfDocument.getDocumentCatalog().getAcroForm();
if (acroForm != null) {
acroForm.getField(acroForm.getFields().get(0).getFullyQualifiedName()).setValue("TEST");
}
acroForm.refreshAppearances();
acroForm.flatten();
pdfDocument.save(fileOut);
}
return fileOut;
}
public static String PDF2Image(String fileInput, Path path) {
String fileName = "";
try (final PDDocument document = PDDocument.load(new File(fileInput))) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
for (int page = 0; page < document.getNumberOfPages(); ++page) {
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
fileName = path + separator + "image-" + page + ".png";
ImageIOUtil.writeImage(bim, fileName, 300);
}
} catch (IOException e) {
System.err.println("Exception while trying to create pdf document - " + e);
}
return fileName;
}
public static String createPDFA(String imagePath, Path path) throws IOException {
try (PDDocument doc = new PDDocument()) {
PDPage page = new PDPage();
doc.addPage(page);
PDFont font = PDType0Font.load(doc, new File("template" + separator + "LiberationSans-Regular.ttf"));
if (!font.isEmbedded()) {
throw new IllegalStateException("PDF/A compliance requires that all fonts used for"
+ " text rendering in rendering modes other than rendering mode 3 are embedded.");
}
try (PDPageContentStream contents = new PDPageContentStream(doc, page)) {
contents.beginText();
contents.setFont(font, 12);
contents.newLineAtOffset(100, 700);
contents.showText("");
contents.endText();
}
// add XMP metadata
XMPMetadata xmp = XMPMetadata.createXMPMetadata();
String fileName = path + separator + "FinalPDFAFile.pdf";
try {
DublinCoreSchema dc = xmp.createAndAddDublinCoreSchema();
dc.setTitle(fileName);
PDFAIdentificationSchema id = xmp.createAndAddPFAIdentificationSchema();
id.setPart(1);
id.setConformance("B");
XmpSerializer serializer = new XmpSerializer();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
serializer.serialize(xmp, baos, true);
PDMetadata metadata = new PDMetadata(doc);
metadata.importXMPMetadata(baos.toByteArray());
doc.getDocumentCatalog().setMetadata(metadata);
} catch (BadFieldValueException | TransformerException e) {
throw new IllegalArgumentException(e);
}
// sRGB output intent
InputStream colorProfile = new FileInputStream(new File("template/sRGB.icc"));
PDOutputIntent intent = new PDOutputIntent(doc, colorProfile);
intent.setInfo("");
intent.setOutputCondition("");
intent.setOutputConditionIdentifier("");
intent.setRegistryName("");
doc.getDocumentCatalog().addOutputIntent(intent);
PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc);
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
float scale = 1 / 5f;
contentStream.drawImage(pdImage, 20, 20, pdImage.getWidth() * scale, pdImage.getHeight() * scale);
}
doc.save(fileName);
return fileName;
}
}
private static void checkPDFAValidation(String fileName) throws IOException {
ValidationResult result = null;
PreflightParser parser = new PreflightParser(fileName);
try {
parser.parse(Format.PDF_A1B);
PreflightDocument document = parser.getPreflightDocument();
document.validate();
// Get validation result
result = document.getResult();
document.close();
} catch (SyntaxValidationException e) {
result = e.getResult();
}
if (result.isValid()) {
System.out.println("The file " + fileName + " is a valid PDF/A-1b file");
} else {
System.out.println("The file" + fileName + " is not valid, error(s) :");
for (ValidationResult.ValidationError error : result.getErrorsList()) {
System.out.println(error.getErrorCode() + " : " + error.getDetails());
}
}
}
private static Path getRandomPath() throws IOException {
String path = generateRandom();
Path tmpDir = Paths.get(OUTPUT_DIR + separator + path + separator);
Files.createDirectory(tmpDir);
return tmpDir;
}
private static String generateRandom() {
String aToZ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890";
Random rand = new Random();
StringBuilder res = new StringBuilder();
for (int i = 0; i < 17; i++) {
int randIndex = rand.nextInt(aToZ.length());
res.append(aToZ.charAt(randIndex));
}
return res.toString();
}
}

How to download All the URLs one by one and keep in different folders

I have one html file where I have kept all the URLs(Download link for CSV files).I want a tool/program that has to go through each url one by one and download the file, Then keep the file in the specified folder which will be written in the same html file itself.
html file is a table with 3 columns
File name,File location and download URL
Url will download the CSV file after opening a new window (target=_blank).Also after download it will close the child window automatically if there is no error.
I have tried the automation(Selenium using java)
But there are some challenges as follows.
It should wait until the download completes
Sometimes the url may show error,in that case it should close the child window and return to parent window
I have resolved the 1st case by keeping a watcher which will check whether the file is downloaded or not each second(by counting the number of csv files in the folder)
I can switch to child window and check whether there is any error but if there is no error my driver is got stuck over there.
How to resolve this
Code to check whether error is there in child window
public boolean foundError(FirefoxDriver driver) {
System.out.println(browser.getWindowHandle() + "Parent" + parentHandle);
String child = "";
int numberOfWindows = 0;
//return true;
if (driver.getWindowHandles().size() > 1) {
for (String winHandle : driver.getWindowHandles()) {
numberOfWindows++;
if (!parentHandle.equals(winHandle)) {
child = winHandle;
System.out.println("Child" + winHandle);
}
}
}
if (numberOfWindows > 1) {
System.out.println("tostring1" + driver.toString());
if (!parentHandle.equals(child)) {
driver.switchTo().window(child);
}
System.out.println("Switched to child");
Set set = driver.getWindowHandles();
System.out.println("Number of windows=" + set.size());
// System.out.println("Number of windows="+set.size()+"driver url"+driver.getCurrentUrl());
// System.out.println("tostring2"+driver.toString());
try {
// WebDriverWait wait1 = new WebDriverWait(driver, 5);
System.out.println("Body text" + driver.findElementByTagName("body").getText());/////////////////////////////Here driver will get stuck
//System.out.println("text"+driver.findElementByClassName("body").toString());
// List<WebElement> elements=driver.findElementsByClassName("ErrorBody");elements.size()>0
if (!driver.findElementByTagName("body").getText().equals("")) {
driver.close();
driver.switchTo().window(parentHandle);
return true;
}
System.out.println("No error");
driver.switchTo().window(parentHandle);
System.out.println("Switched to parent");
} catch (Exception e) {
System.out.println("Error Catch block page time out:" + e);
driver.switchTo().window(parentHandle);
return false;
// driver.switchTo().window(parentHandle);
}
}
return false;
}
I used different method
using Jsoup to parse the html file and downloading
import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
/**
*
* #author nudanesh
*/
public class URLDownload {
private Document doc;
String url = "", folder, file;
private final File sourceFile;
int i = 1;
int r = 1, c = 1;
int anchorCol = 3;
Library lib;
URLDownload() {
lib = new Library();
sourceFile = new File("Download.html");
try {
doc = Jsoup.parse(sourceFile, "UTF-8");
} catch (IOException ex) {
Logger.getLogger(URLDownload.class.getName()).log(Level.SEVERE, null, ex);
}
//Elements links = doc.select("a[href]");
Elements rows = doc.select("tr");
System.out.println("Size=" + rows.size());
for (Element row : rows) {
Elements cols = row.getElementsByTag("td");
c = 1;
for (Element col : cols) {
System.out.println("Row"+r);
if (c == 1) {
file = col.text();//System.out.println("File in main"+file);
} else if (c == 2) {
folder = col.text();//System.out.println("Folder in main"+folder);
} else {
try {
url = col.getElementsByTag("a").attr("href");
} catch (Exception e) {
System.out.print("-");
}
}
c++;
}
if (!url.equals("")) {
lib.setLocation(file,folder);
lib.downloadFile(url);
}
url = "";
i++;
r++;
}
}
public static void main(String arg[]) {
new URLDownload();
}
}
and following is the Library class file
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
/**
*
* #author nudanesh
*/
public class Library {
boolean downloaded = false;
Thread t;
int waitTime = 0;
String baseLoc = "";
int size = 1024, ByteWritten = 0;
URL url;
URLConnection uCon = null;
String folderLoc = "", file = "firstFile.csv";
File loc;
private OutputStream outStream;
private InputStream is=null;
private byte[] buf;
private int ByteRead;
private int FolderInUrl = 4;
private boolean rootFolder = true;
private File resultFile;
private FileOutputStream fileResult;
private XSSFWorkbook workbookResult;
private XSSFSheet sheetResult;
private int updateExcelRowNum = -1;
private int updateExcelColNum = -1;
String date;
private int waitLimit = 900000;
Library() {
/*System.out.print(Calendar.getInstance().toString());
Date d=new Date();
String date=d.toString();
System.out.println(date);*/
//t = new Thread(this);
// t.start();
date = new SimpleDateFormat("yyyy_MM_dd_HH_mm_ss").format(Calendar.getInstance().getTime());
System.out.print(date);
baseLoc = date + "/";
WriteDataToExcel();
baseLoc += "Business Reports/";
createRowExcel(updateExcelRowNum);
updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Report Name");
updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Path");
updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Status");
updateExcel();
}
public void setLocation(String a, String b) {
file = a;
file += ".csv";
folderLoc = baseLoc + getFolderPath(b);
// System.out.println("File Name: "+file);
// System.out.println("Folder loc: "+folderLoc);
}
public String getFolderPath(String b) {
String path = "";
try {
System.out.println("path" + b);
path = b;
// path = java.net.URLDecoder.decode(b, "UTF-8");
String p[] = path.split("/");
path = "";
for (int i = FolderInUrl; i < p.length - 1; i++) {
rootFolder = false;
p[i] = removeSpacesAtEnd(p[i]);
path = path + p[i] + "/";
}
} catch (Exception ex) {
Logger.getLogger(Library.class.getName()).log(Level.SEVERE, null, ex);
}
return path;
}
public void downloadFile(String urlString) {
// System.out.println("Started");
try {
url = new URL(urlString);
} catch (MalformedURLException ex) {
Logger.getLogger(Library.class.getName()).log(Level.SEVERE, null, ex);
}
try {
loc = new File(folderLoc);
if (!loc.exists()) {
loc.mkdirs();
}
outStream = new BufferedOutputStream(new FileOutputStream(folderLoc + file));
uCon = url.openConnection();
uCon.setReadTimeout(waitLimit);
is = uCon.getInputStream();
downloaded=true;
buf = new byte[size];
while ((ByteRead = is.read(buf)) != -1) {
System.out.println("while executing" + ByteRead);
outStream.write(buf, 0, ByteRead);
ByteWritten += ByteRead;
}
//System.out.println("Downloaded" + ByteWritten);
resetCounters();
createRowExcel(updateExcelRowNum);
updateRowColExcel(updateExcelRowNum, updateExcelColNum, file);
updateRowColExcel(updateExcelRowNum, updateExcelColNum, folderLoc);
if (ByteWritten < 1000) {
updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Downloaded ");
} else {
updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Downloaded ");
}
updateExcel();
} catch (Exception e) {
System.out.println("error catch" + e);
resetCounters();
createRowExcel(updateExcelRowNum);
updateRowColExcel(updateExcelRowNum, updateExcelColNum, file);
updateRowColExcel(updateExcelRowNum, updateExcelColNum, folderLoc);
updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Rejected the Download after waiting " + (waitLimit / 60000) + " minutes");
updateExcel();
waitTime = 0;
} finally {
try {
System.out.println("Error in streams");
if(downloaded)
is.close();
outStream.close();
downloaded= false;
} catch (IOException e) {
e.printStackTrace();
}
}
}
public void moveToFolder(String reportName, String path) {
try {
File repo = new File(folderLoc + "/" + reportName + ".csv");
path = folderLoc + "/" + path;
File pathFolder = new File(path);
if (!pathFolder.exists()) {
pathFolder.mkdirs();
}
pathFolder = new File(path + reportName + ".csv");
System.out.println("Path=" + pathFolder.getAbsolutePath() + "\nReport path=" + repo.getAbsolutePath());
System.out.println("Source" + repo.getAbsolutePath());
//System.out.println("Status" + repo.renameTo(new File(pathFolder.getAbsolutePath())));
System.out.println("Status" + Files.move(repo.toPath(), new File(pathFolder.getAbsolutePath()).toPath(), REPLACE_EXISTING));
//Files.
} catch (Exception e) {
System.out.println("error while moving" + e);
}
}
public String changeSpecialCharacters(String report) {
report = report.replaceAll(":", "_");
return report;
}
public String removeSpacesAtEnd(String inputPath) {
for (int i = inputPath.length() - 1; i >= 0; i--) {
if (inputPath.charAt(i) != ' ') {
break;
} else {
System.out.println("Before string is" + inputPath);
inputPath = inputPath.substring(0, i);
System.out.println("AFter string is" + inputPath);
}
}
return inputPath;
}
public void WriteDataToExcel() {
try {
// file = new FileInputStream(new File("config.xlsx"));
// File resultFolder = new File("Results");
// if (resultFolder.exists()) {
// deleteDirectory(resultFolder);
// }
// resultFolder.mkdirs();
if (!new File(baseLoc).exists()) {
new File(baseLoc).mkdirs();
}
resultFile = new File(baseLoc + "Reports info " + date + ".xlsx");
System.out.println("Path" + resultFile.getAbsolutePath());
resultFile.createNewFile();
// rFilePath = resultFile.getAbsolutePath();
fileResult = new FileOutputStream(resultFile);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//Get the workbook instance for XLS file
// System.out.println("file success");
XSSFWorkbook workbook = null;
try {
workbookResult = new XSSFWorkbook();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("Opening the browser");
//Get first sheet from the workbook
sheetResult = workbookResult.createSheet();
//sheetResult.set
//Get iterator to all the rows in current sheet
//Get iterator to all cells of current row
//ar.add(folderLocation);
// ar.add(firefoxProfileLocation);
}
public void updateExcel() {
try {
//fileResult.close();
fileResult = new FileOutputStream(resultFile);
workbookResult.write(fileResult);
fileResult.close();
} catch (Exception e) {
System.out.println(e);
}
}
public void createRowExcel(int num) {
updateExcelRowNum++;
num = updateExcelRowNum;
sheetResult.createRow(num);
}
public void updateRowColExcel(int rnum, int cnum, String value) {
updateExcelColNum++;
cnum = updateExcelColNum;
sheetResult.getRow(rnum).createCell(cnum);
XSSFCell cell = sheetResult.getRow(rnum).getCell(cnum);
cell.setCellValue(value);
}
public void updateColumn(int rnum, int cnum, String value) {
XSSFCell cell = sheetResult.getRow(rnum).getCell(cnum);
cell.setCellValue(value);
}
public void resetCounters() {
updateExcelColNum = -1;
}
/* #Override
public void run() {
while (true) {
if (true) {
waitTime += 1000;
System.out.println(waitTime);
if (waitTime > waitLimit) {
try {
is.close();
outStream.close();
//downloaded=false;
// cancelDownload=true;
} catch (Exception ex) {
Logger.getLogger(Library.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
try {
Thread.sleep(1000);
} catch (Exception e) {
}
}
}*/
}

itext Converting PDF to csv

I am trying to use itext framework to convert a pdf file into a csv for import into excel.
The output is garbled and I pressume I am missing a step in regards to format conversion however I can't seem to find the information in the itext site and am looking for assistance.
Current is as below.
package com.pdf.convert;
import java.io.FileOutputStream;
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfWriter;
public class ThirdPDF {
private static String INPUTFILE = "/location/test.pdf";
private static String OUTPUTFILE = "/location/test.csv";
public static void main(String[] args) throws DocumentException,
IOException {
Document document = new Document();
PdfWriter writer = PdfWriter.getInstance(document,
new FileOutputStream(OUTPUTFILE));
document.open();
PdfReader reader = new PdfReader(INPUTFILE);
int n = reader.getNumberOfPages();
PdfImportedPage page;
// Go through all pages
for (int i = 1; i <= n; i++) {
// Only page number 2 will be included
if (i == 2) {
page = writer.getImportedPage(reader, i);
Image instance = Image.getInstance(page);
document.add(instance);
}
}
document.close();
}
}
Converting PDF file to CSV file.
Present Directory and File creation is based on Android Framework.
Change your path and Directory as per your Framework Accordingly.
private void convertPDFToCSV(String pdfFilePath) {
String myfolder = Environment.getExternalStorageDirectory() + "/Mycsv";
if (createFolder(myfolder)) {
try {
Document document = new Document();
document.open();
FileOutputStream fos=new FileOutputStream(myfolder + "/MyCSVFile.csv");
StringBuilder parsedText=new StringBuilder();
PdfReader reader1 = new PdfReader(pdfFilePath);
int n = reader1.getNumberOfPages();
for (int i = 0; i <n ; i++) {
parsedText.append(parsedText+PdfTextExtractor.getTextFromPage(reader1, i+1).trim()+"\n") ;
//Extracting the content fromx the different pages
}
StringReader stReader = new StringReader(parsedText.toString());
int t;
while((t=stReader.read())>0)
fos.write(t);
document.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
private boolean createFolder(String myfolder) {
File f = new File(myfolder);
if (!f.exists()) {
if (!f.mkdir()) {
return false;
} else {
return true;
}
}else{
return true;
}
}

iText PDF; howto convert jpeg2000 to jpg using Java

I'm trying to solve a problem using java, iText, and the Java advanced imaging library. My software system uses ghostscript to create jpg thumbnail images etc... from PDF files. However on CentOS 5.x the highest version of ghostscript is 8.7 which has a known issue of not being able to handle PDF files containing JPEG 2000 images in them. My plan is to scan the file first and see if it contains jpeg2000 images (I've already got this part figured out); if so, then use iText and the Java Advanced Imaging library (contains the jpeg2000 read & write codecs) to convert the contained jpeg2000 files into regular jpeg files & then pass the new PDF file to ghostscript. The code below attempts this, but results in another file containing jpeg2000 files. Any help with this would be much appreciated.
public class ImageReplacer{
public static void main(String [] args){
try{
String RESULT = "";
PdfReader reader = new PdfReader("pdf_containing_jpeg2000_images.pdf");
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
MyImageRenderListener listener = new MyImageRenderListener(RESULT);
MyImageConverterListener clistener = new MyImageConverterListener(RESULT);
clistener.setReader(reader);
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
parser.processContent(i, clistener);
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream("out.pdf"));
stamper.close();
}catch(Exception e){
e.printStackTrace();
}
}
}
class MyImageConverterListener implements RenderListener {
protected String path = "";
protected PdfReader reader;
public MyImageConverterListener(String path) {
this.path = path;
}
public void beginTextBlock() { }
public void endTextBlock() { }
public void renderImage(ImageRenderInfo renderInfo) {
try {
PdfImageObject image = renderInfo.getImage();
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
if(image.getDictionary().isStream()){
BufferedImage bi = image.getBufferedImage();
if (bi == null) return;
int width = (int)bi.getWidth();
int height = (int)bi.getHeight();
ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
ImageIO.write(bi, "JPG", imgBytes);
PRStream stream = new PRStream(reader,imgBytes.toByteArray());
stream.clear();
stream.setData(imgBytes.toByteArray(), false, PRStream.NO_COMPRESSION);
stream.put(PdfName.TYPE, PdfName.XOBJECT);
stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
stream.put(new PdfName("foo"+Math.random()), new PdfName("bar"+Math.random()));
stream.put(PdfName.FILTER, PdfName.DCTDECODE);
stream.put(PdfName.WIDTH, new PdfNumber(width));
stream.put(PdfName.HEIGHT, new PdfNumber(height));
stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
public void renderText(TextRenderInfo renderInfo) { }
public void setReader(PdfReader r){
reader = r;
}
}
So I managed to solve this one on my own (with a little help from iText in action by Bruno Lowagie - great book). Just to re-iterate, my intention is to scan a PDF using iText to see if it contains any JPEG2000 images and if it does output the same PDF but with the inner JPEG2000 images replaced with regular JPEG images. This solves the fatal ghostscript 8.7 'Unable to process JPXDecode data' error, but would also provide useful for making PDF's iOS compatible.
So without further a do kids; here goes...
Step 1) Download iText 5.x .jar file, and download jai_imageio-1.1.jar (the Java advanced imaging library that allows you to convert JPEG2000 files)
Step 2) Create a file called PDFConverter.java and put this code in it:
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.PdfNumber;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import com.itextpdf.text.pdf.PdfStamper;
import java.io.*;
public class PDFConverter{
public static void main(String [] args){
if(args.length==1){
if(hasJpeg2000(args[0])){
System.out.println("Contains JPEG2000 images: Converting them to JPEG...");
convertPDF(args[0]);
System.out.println("Done...");
}else{
System.out.println("Doesn't contain any JPEG2000 images: Nothing to be done...");
}
}else{
System.out.println("Please specify a PDF filename as a command line argument!");
}
}
public static boolean hasJpeg2000(String s){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfImageObject image = new PdfImageObject(stream);
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
return true;
}
}
}catch(Exception e){
e.printStackTrace();
}
return false;
}
public static void convertPDF(String s){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfImageObject image = new PdfImageObject(stream);
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
BufferedImage bi = image.getBufferedImage();
if (bi == null) continue;
int width = (int)(bi.getWidth());
int height = (int)(bi.getHeight());
ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
ImageIO.write(bi, "JPG", imgBytes);
stream.clear();
stream.setData(imgBytes.toByteArray(),false, PRStream.NO_COMPRESSION);
stream.put(PdfName.TYPE, PdfName.XOBJECT);
stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
stream.put(new PdfName("foo"+Math.random()), new PdfName("bar"+Math.random()));
stream.put(PdfName.FILTER, PdfName.DCTDECODE);
stream.put(PdfName.WIDTH, new PdfNumber(width));
stream.put(PdfName.HEIGHT, new PdfNumber(height));
stream.put(PdfName.BITSPERCOMPONENT,new PdfNumber(8));
stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
}
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream("out.pdf")); stamper.close();
}catch(Exception e){
e.printStackTrace();
}
}
}
Step 3)Compile the above file in the following manner:
javac -cp .:iText-5.0.4.jar:jai_imageio-1.1.jar PDFConverter.java
Step 4)Run the program with a PDF...
java -cp .:iText-5.0.4.jar:jai_imageio-1.1.jar PDFConverter PDFFileName.pdf
Booyah...
Works great, but I had some issues with GlassFish v3.1. Glassfish acted as if there was no jai_imageio-1.1.jar in Classpath. I fixed this putting jai_imageio.jar in my "/path/to/glassfish/domains/domain1/lib/ext/" folder.
I have had some NullPointer problems with the PDFConverter from Reece, because my PDF had different types of embedded elements inside GhostScript on CentOS 5.3 - Unable to process JPXDecode data. So I make some Object/Type checks and added the output filename to command line.
Everything else is great and worked perfect to the image jpeg2000 problem. Thanks to Reece :)
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.*;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.PdfNumber;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import com.itextpdf.text.pdf.PdfStamper;
import java.io.*;
public class PDFConverter{
public static void main(String [] args){
if(args.length==2){
if(hasJpeg2000(args[0])){
System.out.println("Contains JPEG2000 images: Converting them to JPEG...");
convertPDF(args[0], args[1]);
System.out.println("Done...");
}else{
System.out.println("Doesn't contain any JPEG2000 images: Nothing to be done...");
}
}else{
System.out.println("Please specify a PDF filename and a output filename as a command line arguments!");
}
}
public static boolean hasJpeg2000(String s){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
System.out.println(pdfsubtype);
if (pdfsubtype != null && pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
PdfImageObject image = new PdfImageObject(stream);
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.JPXDECODE.equals(filter)) {
return true;
}
}
}
}catch(Exception e){
e.printStackTrace();
}
return false;
}
private static void filterObject(PdfImageObject image,PdfName filter,PRStream stream) throws java.io.IOException {
if (PdfName.JPXDECODE.equals(filter)) {
BufferedImage bi = image.getBufferedImage();
if (bi == null) return;
int width = (int)(bi.getWidth());
int height = (int)(bi.getHeight());
ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
ImageIO.write(bi, "JPG", imgBytes);
stream.clear();
stream.setData(imgBytes.toByteArray(),false, PRStream.NO_COMPRESSION);
stream.put(PdfName.TYPE, PdfName.XOBJECT);
stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
stream.put(new PdfName("foo"+Math.random()), new PdfName("bar"+Math.random()));
stream.put(PdfName.FILTER, PdfName.DCTDECODE);
stream.put(PdfName.WIDTH, new PdfNumber(width));
stream.put(PdfName.HEIGHT, new PdfNumber(height));
stream.put(PdfName.BITSPERCOMPONENT,new PdfNumber(8));
stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
}
}
public static void convertPDF(String s, String out){
try{
PdfReader reader = new PdfReader(s);
int n = reader.getXrefSize();
PdfObject object;
PRStream stream;
for (int i = 0; i < n; i++) {
object = reader.getPdfObject(i);
if (object == null || !object.isStream())continue;
stream = (PRStream)object;
PdfObject pdfsubtype = stream.get(PdfName.SUBTYPE);
if (pdfsubtype != null && pdfsubtype.toString().equals(PdfName.IMAGE.toString())) {
PdfImageObject image = new PdfImageObject(stream);
Object listOrName = image.get(PdfName.FILTER);
if (listOrName instanceof PdfName) {
PdfName filter = (PdfName)image.get(PdfName.FILTER);
filterObject(image, filter, stream);
}
else if (listOrName instanceof PdfArray) {
PdfArray list = (PdfArray)image.get(PdfName.FILTER);
for (int j = 0; j < list.size(); j++) {
PdfName filter = list.getAsName(j);
filterObject(image, filter, stream);
}
}
else {
System.err.println("Unknown Obejcttype: " + listOrName);
}
}
}
PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(out)); stamper.close();
} catch(Exception e){
e.printStackTrace();
}
}
}

Categories

Resources