Issues with enclosing PDDoucment

Issues with enclosing PDDoucment - java

I have implemented a program that will print data into a pdf however I am facing this issue
java.io.IOException: COSStream has been closed and cannot be read. Perhaps its enclosing PDDocument has been closed?
I know there have been similar issues posted but I could not find the solution to my problem from them. What I am doing is that I initialized two documents one of them being the main document(doc) the other for the rest of the pages and then I add to the main one(activeDocument), the service calls three different functions to add a new page one for the first page one for any pages in between, one for before the last and finally the last page.a Here is my code most of the logic can be ignored its mainly just the things relating to PDDocuments since thats where the issue lies
private InputStream generateCardsPDF(String cardNumber,
String generationDate,
StatementSummaryResTypeStatementSummaryResBody resBody,
String pdf) throws IOException, ParseException {
List<StatementSummaryResTypeStatementSummaryResBodyTransactionsTransaction> transactionList = resBody.getTransactions().getTransaction().stream().limit(50).collect(Collectors.toList());
PDDocument doc = new PDDocument().load(getClass().getClassLoader().getResourceAsStream(pdf));
PDDocumentCatalog docCatalog = doc.getDocumentCatalog();
PDPage page = docCatalog.getPages().get(0);
try {
String fullName = resBody.getStatementHeader().getAddress().getTitle() +
" " + resBody.getStatementHeader().getAddress().getFirstName() + " ";
String fullAddress = "";
if (resBody.getStatementHeader().getAddress().getMiddleName() != null) {
fullName = fullName + resBody.getStatementHeader().getAddress().getMiddleName() + " ";
}
fullName = fullName + resBody.getStatementHeader().getAddress().getLastName();
String countryAddress = resBody.getStatementHeader().getAddress().getCity() != null ? resBody.getStatementHeader().getAddress().getCity() : "";
countryAddress += resBody.getStatementHeader().getAddress().getCountry() != null &&
countryAddress.length() > 0 ? ", " + resBody.getStatementHeader().getAddress().getCountry() : "";
countryAddress += resBody.getStatementHeader().getAddress().getCountry() != null &&
countryAddress.length() == 0 ? resBody.getStatementHeader().getAddress().getCountry() : "";
fullAddress += resBody.getStatementHeader().getAddress().getAddress1() + ONE_LINE;
fullAddress += resBody.getStatementHeader().getAddress().getAddress2() != null ? resBody.getStatementHeader().getAddress().getAddress2() + ONE_LINE : "";
fullAddress += resBody.getStatementHeader().getAddress().getAddress3() != null ? resBody.getStatementHeader().getAddress().getAddress3() + ONE_LINE : "";
fullAddress += countryAddress;
String header = fullName + ONE_LINE + fullAddress;
Integer currentIndex = 0;
Integer count = 0;
Integer cutOff = 0;
Integer pageNumber = 1;
Integer numberOfPages = statementsUtils.calculateNumberOfPages(transactionList, currentIndex, cutOff, count);
String pageOf;
PDDocument activeDocument = new PDDocument();
statementsUtils.setFirstPage(transactionList, resBody, doc, header, maskedCard(cardNumber), generationDate, numberOfPages);
try {
while (currentIndex < transactionList.size()) {
Boolean fourOrFive = false;
Boolean checkpoint = false;
Boolean finalPrint = false;
if (count == 4) {
checkpoint = statementsUtils.reachedCheckpoint(currentIndex, transactionList);
if (checkpoint) {
cutOff = currentIndex;
fourOrFive = true;
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
statementsUtils.addExtraPage(doc, activeDocument, cutOff, fourOrFive, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate);
count = 0;
}
} else if (count == 5) {
cutOff = currentIndex;
fourOrFive = false;
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
statementsUtils.addExtraPage(doc, activeDocument, cutOff, fourOrFive, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate);
count = 0;
} else if (transactionList.size() <= 2) {
finalPrint = true;
} else if (transactionList.size() - currentIndex + 1 <= 5) { // count <4? print last page or count >=4 print 4 and check homuch left
finalPrint = true;
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
if (transactionList.size() - currentIndex + 1 == 5) {
statementsUtils.setPreLastPage(doc, activeDocument, currentIndex, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate);
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
statementsUtils.setLastPage(doc, activeDocument, transactionList.size() - 1, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate);
} else {
statementsUtils.setLastPage(doc, activeDocument, currentIndex - count + 1, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate);
}
}
if (fourOrFive && !finalPrint)
currentIndex = cutOff + 1;
else if (finalPrint)
currentIndex = transactionList.size();
else
currentIndex++;
if (currentIndex >= 2) count++;
}
} catch (IOException e) {
e.printStackTrace();
} finally {
if (activeDocument != null) {
activeDocument.close();
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true);
contentStream.close();
ByteArrayOutputStream out = new ByteArrayOutputStream();
if (doc != null) {
doc.save(out);
doc.close();
}
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
return in;
}
}
public void setFirstPage( PDDocument doc,....){
PDAcroForm acroForm = doc.getDocumentCatalog().getAcroForm();
...
acroForm.flatten();
}
public void setPreLastPage( PDDocument doc, PDDocument activeDocument....){
activeDocument = new PDDocument().load(getClass().getClassLoader().getResourceAsStream(pdf));
PDDocumentCatalog docCatalog = activeDocument.getDocumentCatalog();
PDPage p1 = docCatalog.getPages().get(0);
PDAcroForm acroForm = docCatalog.getAcroForm();
...
acroForm.flatten();
PDPageContentStream cs = new PDPageContentStream(activeDocument, p1,PDPageContentStream.AppendMode.APPEND, true);
cs.close();
ByteArrayOutputStream out = new ByteArrayOutputStream();
activeDocument.save(out);
doc.addPage(p1);
}
public void setLastPage( PDDocument doc, PDDocument activeDocument....){
activeDocument = new PDDocument().load(getClass().getClassLoader().getResourceAsStream(pdf));
PDDocumentCatalog docCatalog = activeDocument.getDocumentCatalog();
PDPage p1 = docCatalog.getPages().get(0);
PDAcroForm acroForm = docCatalog.getAcroForm();
...
acroForm.flatten();
PDPageContentStream cs = new PDPageContentStream(activeDocument, p1,PDPageContentStream.AppendMode.APPEND, true);
cs.close();
ByteArrayOutputStream out = new ByteArrayOutputStream();
activeDocument.save(out);
doc.addPage(p1);
}
public void addExtraPage( PDDocument doc, PDDocument activeDocument....){
activeDocument = new PDDocument().load(getClass().getClassLoader().getResourceAsStream(pdf));
PDDocumentCatalog docCatalog = activeDocument.getDocumentCatalog();
PDPage p1 = docCatalog.getPages().get(0);
PDAcroForm acroForm = docCatalog.getAcroForm();
...
acroForm.flatten();
PDPageContentStream cs = new PDPageContentStream(activeDocument, p1,PDPageContentStream.AppendMode.APPEND, true);
cs.close();
ByteArrayOutputStream out = new ByteArrayOutputStream();
activeDocument.save(out);
doc.addPage(p1);
}
There is a lot more logic in these functions but I tried adding only what I thought is relevant.
I believe the issue is concerning either the content stream of one of the pdfs or the order in which I close, save etc.... but I couldn't figure out what the exact issue is so any advice would be appreciated.

Three errors are in setPreLastPage, setLastPage, and addExtraPage: in each of these methods you load a new PDDocument in the respective local variable activeDocument, take a page from it and add it to the same PDDocument doc, and then drop all references to that PDDocument in activeDocument when leaving the respective method.
Dropping all references allows the garbage collection to pick these PDDocument instances up and close and remove them. This pulls away the data underneath the pages copied into the PDDocument doc, resulting in the error you observe when trying to save doc.
To prevent this either clone the page objects into doc before adding them or keep all these temporary documents open until you save doc.

In case anyone is curious of the actual implementation of the solution I created an empty list.
List<PDDocument> activeDocuments = new ArrayList<PDDocument>();
And then everytime I called one of the methods I returned the document and stored it inside of the list and then closed them all at the end after I saved the original document.
while (currentIndex < transactionList.size()) {
Boolean fourOrFive = false;
Boolean checkpoint = false;
Boolean finalPrint = false;
if (count == 4) {
checkpoint = statementsUtils.reachedCheckpoint(currentIndex, transactionList);
if (checkpoint) {
cutOff = currentIndex;
fourOrFive = true;
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
activeDocuments.add(statementsUtils.addExtraPage(doc, activeDocument, cutOff, fourOrFive, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate));
count = 0;
}
} else if (count == 5) {
cutOff = currentIndex;
fourOrFive = false;
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
activeDocuments.add(statementsUtils.addExtraPage(doc, activeDocument, cutOff, fourOrFive, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate));
count = 0;
} else if (transactionList.size() <= 2) {
finalPrint = true;
} else if (transactionList.size() - currentIndex + 1 <= 5) { // count <4? print last page or count >=4 print 4 and check homuch left
finalPrint = true;
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
if (transactionList.size() - currentIndex + 1 == 5) {
activeDocuments.add(statementsUtils.setPreLastPage(doc, activeDocument, currentIndex, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate));
pageNumber++;
pageOf = "Page " + pageNumber.toString() + " of " + numberOfPages.toString();
activeDocuments.add(statementsUtils.setLastPage(doc, activeDocument, transactionList.size() - 1, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate));
} else {
activeDocuments.add(statementsUtils.setLastPage(doc, activeDocument, currentIndex - count + 1, transactionList, resBody, pageOf, pdf, header, maskedCard(cardNumber), generationDate));
}
}
if (fourOrFive && !finalPrint)
currentIndex = cutOff + 1;
else if (finalPrint)
currentIndex = transactionList.size();
else
currentIndex++;
if (currentIndex >= 2) count++;
}
} catch (IOException e) {
e.printStackTrace();
} finally {
PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true);
contentStream.close();
ByteArrayOutputStream out = new ByteArrayOutputStream();
if (doc != null) {
doc.save(out);
doc.close();
}
for (PDDocument a : activeDocuments)
a.close();
if (activeDocument != null) {
activeDocument.close();
}
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
return in;
}

Related

Size of the pdf file increases after removing qr code image using pdfbox library

Currently using pdfbox 2.0.24 library for removing and adding the qr code image after loading the pdf file from file system. Before removing the qr code image from the pdf file, the file size was 152mb. After removing the qr code, the file size increases to 523mb.
It was expected that after removing the qr code image, the file size should not increase, it should decrease or remain same. Also, in the removed qr image file, the qr code didn't get removed. Can you please help.
Below is the code for remove qr code image from the page:
pdDocument = PDDocument.load(new File(aBarcodeVO.getSourceFilePath()));
newDocument = new PDDocument();
for (int pageCount = 0; pageCount < pdDocument.getNumberOfPages(); pageCount++) {
PDPage pdPage = newDocument.importPage(pdDocument.getPage(pageCount));
String imgUniqueId = aBarcodeVO.getImgUniqueId().concat(String.valueOf(pageCount));
boolean hasQRCodeOnPage = removeQRCodeImage(newDocument, pdPage, imgUniqueId);
qRCodePageList.add(hasQRCodeOnPage);
}
if(qRCodePageList.contains(true)) {
newDocument.save(aBarcodeVO.getDestinationFilePath(true));
}
newDocument.close();
pdDocument.close();
public static boolean removeQRCodeImage(PDDocument document, PDPage page, String imgUniqueId) throws Exception {
String qrCodeCosName = null;
PDResources pdResources = page.getResources();
boolean hasQRCodeOnPage=false;
for (COSName propertyName : pdResources.getXObjectNames()) {
if (!pdResources.isImageXObject(propertyName)) {
continue;
}
PDXObject o;
try {
o = pdResources.getXObject(propertyName);
if (o instanceof PDImageXObject) {
PDImageXObject pdImageXObject = (PDImageXObject) o;
if (pdImageXObject.getMetadata() != null) {
DomXmpParser xmpParser = new DomXmpParser();
XMPMetadata xmpMetadata = xmpParser.parse(pdImageXObject.getMetadata().toByteArray());
if(xmpMetadata.getDublinCoreSchema()!=null && StringUtils.isNoneBlank(xmpMetadata.getDublinCoreSchema().getTitle())&&xmpMetadata.getDublinCoreSchema().getTitle().contains("_barcodeimg_")) {
((COSDictionary) pdResources.getCOSObject().getDictionaryObject(COSName.XOBJECT))
.removeItem(propertyName);
log.debug("propertyName REMOVED--"+propertyName.getName());
qrCodeCosName = propertyName.getName();
hasQRCodeOnPage=true;
}
}
}
} catch (IOException e) {
log.error("Exception in removeQRCodeImage() while extracting QR image:" + e, e);
}
}
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<Object> tokens = parser.getTokens();
log.debug("original tokens size" + tokens.size());
List<Object> newTokens = new ArrayList<Object>();
for (int j = 0; j < tokens.size(); j++) {
Object token = tokens.get(j);
if (token instanceof Operator) {
Operator op = (Operator) token;
// find image - remove it
if (op.getName().equals("Do")) {
COSName cosName = (COSName) tokens.get(j - 1);
if (cosName.getName().equals(qrCodeCosName)) {
newTokens.remove(newTokens.size() - 1);
continue;
}
}
}
newTokens.add(token);
}
log.debug("tokens size" + newTokens.size());
PDStream newContents = new PDStream(document);
OutputStream out = newContents.createOutputStream();
ContentStreamWriter writer = new ContentStreamWriter(out);
writer.writeTokens(newTokens);
out.close();
page.setContents(newContents);
return hasQRCodeOnPage;
}

PDFBox Error After Deleting Image - An error exists on this page. Acrobat may not display the page correctly

I am using the pdfbox library 2.0 version. I have to remove the selected image from the page and add another image. It works properly. But when I open that file it shows a warning message: An error exists on this page. Acrobat may not display the page correctly. and the screenshot is as below:
Herewith sharing the code to delete an image and add other image:(EDITTED with Fix)
public static void removeImages(String pdfFile) throws Exception {
PDDocument document = PDDocument.load(new File(pdfFile));
for (PDPage page : document.getPages()) {
PDResources pdResources = page.getResources();
String[] qrCodeCosName = new String[1];
pdResources.getXObjectNames().forEach(propertyName -> {
if (!pdResources.isImageXObject(propertyName)) {
return;
}
PDXObject o;
try {
o = pdResources.getXObject(propertyName);
if (o instanceof PDImageXObject) {
PDImageXObject pdImageXObject = (PDImageXObject) o;
if (pdImageXObject.getMetadata() != null) {
// TO REMOVE FROM RESOURCE
((COSDictionary) pdResources.getCOSObject().getDictionaryObject(COSName.XOBJECT))
.removeItem(propertyName);
qrCodeCosName[0] = propertyName.getName();
}
}
} catch (IOException e) {
e.printStackTrace();
}
});
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<Object> tokens = parser.getTokens();
System.out.println("Original tokens size" + tokens.size());
List<Object> newTokens = new ArrayList<Object>();
for (int j = 0; j < tokens.size(); j++) {
Object token = tokens.get(j);
if (token instanceof Operator) {
Operator op = (Operator) token;
// find image - remove it
if (op.getName().equals("Do")) {
COSName cosName = (COSName) tokens.get(j - 1);
if (cosName.getName().equals(qrCodeCosName[0])) {
newTokens.remove(newTokens.size() - 1);
continue;
}
}
newTokens.add(token);
}
System.out.println("tokens size" + newTokens.size());
PDStream newContents = new PDStream(document);
OutputStream out = newContents.createOutputStream();
ContentStreamWriter writer = new ContentStreamWriter(out);
writer.writeTokens(newTokens);
out.close();
page.setContents(newContents);
// ADD OTHER IMAGE
PDImageXObject pdImage = PDImageXObject.createFromFile("D:\\copy\\ind.png", document);
PDPageContentStream contents = new PDPageContentStream(document, page,
PDPageContentStream.AppendMode.PREPEND, true, true);
contents.saveGraphicsState();
// Drawing the image in the PDF document
contents.drawImage(pdImage, 0, 0, 50, 30);
contents.restoreGraphicsState();
System.out.println("Image inserted Successfully.");
// Closing the PDPageContentStream object
contents.close();
}
document.save("RemoveImage.pdf");
document.close();
}
}
Kindly help me with this.
Also, looking forward for other code review comments about required changes to make this operation properly. :)

As per #Tilman Hausherr suggestion below code works for me:
public static void removeImages(String pdfFile) throws Exception {
PDDocument document = PDDocument.load(new File(pdfFile));
for (PDPage page : document.getPages()) {
PDResources pdResources = page.getResources();
String[] qrCodeCosName = new String[1];
pdResources.getXObjectNames().forEach(propertyName -> {
if (!pdResources.isImageXObject(propertyName)) {
return;
}
PDXObject o;
try {
o = pdResources.getXObject(propertyName);
if (o instanceof PDImageXObject) {
PDImageXObject pdImageXObject = (PDImageXObject) o;
if (pdImageXObject.getMetadata() != null) {
// TO REMOVE FROM RESOURCE
((COSDictionary) pdResources.getCOSObject().getDictionaryObject(COSName.XOBJECT))
.removeItem(propertyName);
qrCodeCosName[0] = propertyName.getName();
}
}
} catch (IOException e) {
e.printStackTrace();
}
});
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<Object> tokens = parser.getTokens();
System.out.println("Original tokens size" + tokens.size());
List<Object> newTokens = new ArrayList<Object>();
for (int j = 0; j < tokens.size(); j++) {
Object token = tokens.get(j);
if (token instanceof Operator) {
Operator op = (Operator) token;
// find image - remove it
if (op.getName().equals("Do")) {
COSName cosName = (COSName) tokens.get(j - 1);
if (cosName.getName().equals(qrCodeCosName[0])) {
newTokens.remove(newTokens.size() - 1);
continue;
}
}
newTokens.add(token);
}
System.out.println("tokens size" + newTokens.size());
PDStream newContents = new PDStream(document);
OutputStream out = newContents.createOutputStream();
ContentStreamWriter writer = new ContentStreamWriter(out);
writer.writeTokens(newTokens);
out.close();
page.setContents(newContents);
// ADD OTHER IMAGE
PDImageXObject pdImage = PDImageXObject.createFromFile("D:\\copy\\ind.png", document);
PDPageContentStream contents = new PDPageContentStream(document, page,
PDPageContentStream.AppendMode.PREPEND, true, true);
contents.saveGraphicsState();
// Drawing the image in the PDF document
contents.drawImage(pdImage, 0, 0, 50, 30);
contents.restoreGraphicsState();
System.out.println("Image inserted Successfully.");
// Closing the PDPageContentStream object
contents.close();
}
document.save("RemoveImage.pdf");
document.close();
}
}

Warning: You did not close a PDF Document looping when renderImageWithDPI

i want to split pdf to image file by page, but i got Warning: You did not close a PDF Document looping when renderImageWithDPI
Still have warning
UPDATE CODE :
public void splitImage(PDDocument document, File checkFile, File theDirSplit, String fileExtension, File theDir, File watermarkDirectory, int numberOfPages)
throws InvalidPasswordException, IOException {
String fileName = checkFile.getName().replace(".pdf", "");
int dpi = 300;
if (theDirSplit.list().length < numberOfPages)
{
for (int i = 0; i < numberOfPages; ++i)
{
if (i == numberOfPages)
break;
if (theDirSplit.list().length != numberOfPages)
{
File outPutFile = new File(theDirSplit + Constan.simbol + fileName + "_" + (i + 1) + "." + fileExtension);
document = PDDocument.load(checkFile);
PDFRenderer pdfRenderer = new PDFRenderer(document);
BufferedImage bImage = pdfRenderer.renderImageWithDPI(i, dpi, ImageType.RGB);
ImageIO.write(bImage, fileExtension, outPutFile);
}
// splitService.watermark(outPutFile, (i + 1), watermarkDirectory, "pdf");
}
document.close();
//System.out.println("Converted Images are saved at -> " + theDirSplit.getAbsolutePath());
}
System.out.println("Done Partial SPlit");
/*
* int i = 1; while (iterator.hasNext()) { PDDocument pd = iterator.next();
* pd.save(theDirSplit + Constan.simbol + i++ + ".pdf"); }
* System.out.println("Multiple PDF’s created");
*/
}
error looping
total warning same with number of pages...
i already try to close but not work, this process make my server java.lang.OutOfMemoryError: Java heap space
update :
else if ("pdf".equalsIgnoreCase(typeFile)) {
System.out.println(
"target file " + downloadPath + R_OBJECT_ID + Constan.simbol + R_OBJECT_ID + "." + typeFile);
//get jumlah halaman
try(PDDocument document = PDDocument.load(checkFile)){
File theDirSplit = new File(theDir.getAbsolutePath() + Constan.simbol + "splitImage");
createFolder(theDirSplit);
String fileExtension = "jpeg";
File watermarkDirectory = new File(theDir.getAbsolutePath() + Constan.simbol + "watermarkImage");
createFolder(watermarkDirectory);
// split 2 page image
if (theDirSplit.list().length <= document.getNumberOfPages()) {
try {
splitImage(document,checkFile, theDirSplit, fileExtension, theDir, watermarkDirectory, document.getNumberOfPages()/2);
} catch (IOException e) {
System.out.println("ERROR SPLIT PDF " + e.getMessage());
e.printStackTrace();
}
}
res.setTotalPages(document.getNumberOfPages());
document.close();
return new ResponseEntity<>(res, HttpStatus.OK);
}
} else {
res.setTotalPages(1);
return new ResponseEntity<>(res, HttpStatus.OK);
}
this is code to call split method....

This is somewhat lost from the question, but the cause was failing to close the documents generated by splitter.split().

How can I get bookmarks page number in a PDF file with Apache PdfBox?

I've already obtained bookmarks but I need to know where these bookmarks are located in the PDF. (Bookmark 1 = page 1,..., Bookmark 54= page 72 etc..). Anyone can help me? Thanks for the support.
PDDocument doc = PDDocument.load( ... );
PDDocumentOutline root = doc.getDocumentCatalog().getDocumentOutline();
PDOutlineItem item = root.getFirstChild();
while( item != null )
{
System.out.println( "Item:" + item.getTitle() );
item = item.getNextSibling();
}

Excerpt from the PrintBookmarks.java example from the source code download:
if (item.getDestination() instanceof PDPageDestination)
{
PDPageDestination pd = (PDPageDestination) item.getDestination();
System.out.println("Destination page: " + (pd.retrievePageNumber() + 1));
}
else if (item.getDestination() instanceof PDNamedDestination)
{
PDPageDestination pd = document.getDocumentCatalog().findNamedDestinationPage((PDNamedDestination) item.getDestination());
if (pd != null)
{
System.out.println("Destination page: " + (pd.retrievePageNumber() + 1));
}
}
if (item.getAction() instanceof PDActionGoTo)
{
PDActionGoTo gta = (PDActionGoTo) item.getAction();
if (gta.getDestination() instanceof PDPageDestination)
{
PDPageDestination pd = (PDPageDestination) gta.getDestination();
System.out.println("Destination page: " + (pd.retrievePageNumber() + 1));
}
else if (gta.getDestination() instanceof PDNamedDestination)
{
PDPageDestination pd = document.getDocumentCatalog().findNamedDestinationPage((PDNamedDestination) gta.getDestination());
if (pd != null)
{
System.out.println("Destination page: " + (pd.retrievePageNumber() + 1));
}
}
}

Need to split docx file based on string using docx4j Java?

I am new to Docx4j ,Need help to split docx file based on string using docx4j Java,So that it writes output into multiple files.
I tried to do the same using Apache POI and got the output,however when tried to convert it into HTML, got issues on style missing,also added styles later, still facing the same issue.
Below is the code using apache poi:
public static int pos = 0;
public static int posc = 0;
public static String ind = "n";
final static int DEFAULT_FONT_SIZE = 10;
public static void main(String[] args) throws FileNotFoundException,
IOException, XmlException {
File file = null;
File outfilep = null;
File outfilec = null;
File dir = new File(PropertyUtils.getProperty("INPUT_DIR"));
String[] files = dir.list();
if (files.length == 0) {
System.out.println("The directory is empty");
} else {
for (String aFile : files) {
System.out.println(aFile);
file = new File(PropertyUtils.getProperty("INPUT_DIR") + aFile
+ "/" + aFile + ".docx");
outfilep = new File(PropertyUtils.getProperty("INPUT_DIR")
+ aFile + "/" + aFile + "-Product.docx");
outfilec = new File(PropertyUtils.getProperty("INPUT_DIR")
+ aFile + "/" + aFile + "-Component.docx");
// Write Soruce file
}
}
XWPFDocument doc = new XWPFDocument(new FileInputStream(file));
XWPFDocument destDoc = new XWPFDocument();
copyLayout(doc, destDoc);
XWPFDocument destDocc = new XWPFDocument();
OutputStream out = new FileOutputStream(outfilep);
OutputStream outc = new FileOutputStream(outfilec);
for (IBodyElement bodyElement : doc.getBodyElements()) {
BodyElementType elementType = bodyElement.getElementType();
if (elementType.name().equals("PARAGRAPH")) {
XWPFParagraph pr = (XWPFParagraph) bodyElement;
if (pr.getText().contains("CONSTRUCTION DETAILS:"))
{
ind = "y";
System.out.println("ind is Y++++++++++++");
}
if (ind == "n")
{
copyStyle(doc, destDoc,
doc.getStyles().getStyle(pr.getStyleID()));
XWPFParagraph dstPr = destDoc.createParagraph();
dstPr.createRun();
pos = destDoc.getParagraphs().size() - 1;
CTPPr ppr = pr.getCTP().getPPr();
if (ppr == null) ppr = pr.getCTP().addNewPPr();
CTSpacing spacing = ppr.isSetSpacing()? ppr.getSpacing() : ppr.addNewSpacing();
spacing.setAfter(BigInteger.valueOf(0));
spacing.setBefore(BigInteger.valueOf(0));
spacing.setLineRule(STLineSpacingRule.AUTO);
spacing.setLine(BigInteger.valueOf(240));
destDoc.setParagraph(pr, pos);
// System.out.println("prod "
// + destDoc.getParagraphArray(pos).getParagraphText());
}
else {
copyStyle(doc, destDocc,
doc.getStyles().getStyle(pr.getStyleID()));
XWPFParagraph dstPrr = destDocc.createParagraph();
dstPrr.createRun();
pos = destDocc.getParagraphs().size() - 1;
CTPPr ppr = pr.getCTP().getPPr();
if (ppr == null) ppr = pr.getCTP().addNewPPr();
CTSpacing spacing = ppr.isSetSpacing()? ppr.getSpacing() : ppr.addNewSpacing();
spacing.setAfter(BigInteger.valueOf(0));
spacing.setBefore(BigInteger.valueOf(0));
spacing.setLineRule(STLineSpacingRule.AUTO);
spacing.setLine(BigInteger.valueOf(240));
destDocc.setParagraph(pr, pos);
//// System.out.println("comp "
//// + destDoc.getParagraphArray(pos).getParagraphText());
}
} else if (elementType.name().equals("TABLE")) {
XWPFTable table = (XWPFTable) bodyElement;
if (ind == "n")
{
copyStyle(doc, destDoc,
doc.getStyles().getStyle(table.getStyleID()));
destDoc.createTable();
pos = destDoc.getTables().size() - 1;
destDoc.setTable(pos, table);
// System.out.println("prodtable " + destDoc.getParagraphArray(pos).getParagraphText());
}
else {
copyStyle(doc, destDocc,
doc.getStyles().getStyle(table.getStyleID()));
destDocc.createTable();
pos = destDocc.getTables().size() - 1;
destDocc.setTable(pos, table);
// System.out.println("comptable " + destDoc.getParagraphArray(pos).getParagraphText());
}
}
}
destDoc.write(out);
destDocc.write(outc);
}
// Copy Styles of Table and Paragraph.
private static void copyStyle(XWPFDocument srcDoc, XWPFDocument destDoc,
XWPFStyle style) {
if (destDoc == null || style == null)
return;
if (destDoc.getStyles() == null) {
destDoc.createStyles();
}
List<XWPFStyle> usedStyleList = srcDoc.getStyles().getUsedStyleList(
style);
for (XWPFStyle xwpfStyle : usedStyleList) {
destDoc.getStyles().addStyle(xwpfStyle);
}
}
private static void copyLayout(XWPFDocument srcDoc, XWPFDocument destDoc)
{
CTPageMar pgMar = srcDoc.getDocument().getBody().getSectPr().getPgMar();
BigInteger bottom = pgMar.getBottom();
BigInteger footer = pgMar.getFooter();
BigInteger gutter = pgMar.getGutter();
BigInteger header = pgMar.getHeader();
BigInteger left = pgMar.getLeft();
BigInteger right = pgMar.getRight();
BigInteger top = pgMar.getTop();
CTPageMar addNewPgMar = destDoc.getDocument().getBody().addNewSectPr().addNewPgMar();
addNewPgMar.setBottom(bottom);
addNewPgMar.setFooter(footer);
addNewPgMar.setGutter(gutter);
addNewPgMar.setHeader(header);
addNewPgMar.setLeft(left);
addNewPgMar.setRight(right);
addNewPgMar.setTop(top);
CTPageSz pgSzSrc = srcDoc.getDocument().getBody().getSectPr().getPgSz();
BigInteger code = pgSzSrc.getCode();
BigInteger h = pgSzSrc.getH();
Enum orient = pgSzSrc.getOrient();
BigInteger w = pgSzSrc.getW();
CTPageSz addNewPgSz = destDoc.getDocument().getBody().addNewSectPr().addNewPgSz();
addNewPgSz.setCode(code);
addNewPgSz.setH(h);
addNewPgSz.setOrient(orient);
addNewPgSz.setW(w);
}

Splitting a docx is easy enough to do in a brute force kind of a way: you can delete the content (paragraphs etc) you don't want, then save the result.
This way, the original relationships will stay intact, but your docx container may be bigger than necessary, since it might have images etc which are no longer used.
Done this way, there are still things you need to look out for:
splitting between a bookmark start and end tag (same for comments)
automatic numbering might give the wrong start number, unless you set start at
Obviously you could write code to address such issues.
Alternatively, with our commercial Enterprise edition of docx4j, you can use its "merge" code to say you want say paragraphs X to Y, and it'll give you a docx containing only that (ie no extraneous images in the docx container, split bookmarks taken care of etc).

I hope this will solve the issue.
public class SplitUsingDocx4j {
/**
* #param args
* #throws Docx4JException
* #throws FileNotFoundException
*/
public static void main(String[] args) throws Docx4JException,
FileNotFoundException {
File dir = new File(PropertyUtils.getProperty("INPUT_DIR"));
String[] files = dir.list();
File file = null;
if (files.length == 0) {
System.out.println("The directory is empty");
} else {
for (String aFile : files) {
System.out.println(aFile);
file = new File(PropertyUtils.getProperty("INPUT_DIR") + aFile
+ "/" + aFile + ".docx");
}
}
// Creating new documents
WordprocessingMLPackage doc1 = WordprocessingMLPackage.createPackage();
WordprocessingMLPackage doc2 = WordprocessingMLPackage.createPackage();
// loading existing document
WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage
.load(new java.io.File(file.getPath()));
MainDocumentPart tempDocPart = wordMLPackage.getMainDocumentPart();
List<Object> obj = wordMLPackage.getMainDocumentPart().getContent();
// for copying styles from existing doc to new docs
StyleDefinitionsPart sdp = tempDocPart.getStyleDefinitionsPart();
Styles tempStyle = sdp.getJaxbElement();
doc1.getMainDocumentPart().getStyleDefinitionsPart()
.setJaxbElement(tempStyle);
doc2.getMainDocumentPart().getStyleDefinitionsPart()
.setJaxbElement(tempStyle);
boolean flag = false;
for (Object object : obj) {
if (!flag) {
if (object.toString().equalsIgnoreCase("CONSTRUCTION DETAILS:")) {
flag = true;
}
doc1.getMainDocumentPart().addObject(object);
} else {
doc2.getMainDocumentPart().addObject(object);
}
}
String fileName = file.getName().toString().replace(".docx", "");
doc1.save(new File(fileName + "-1.docx"));
doc2.save(new File(fileName + "-2.docx"));
}}

Develop Reference

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Issues with enclosing PDDoucment - java

Related

Size of the pdf file increases after removing qr code image using pdfbox library

PDFBox Error After Deleting Image - An error exists on this page. Acrobat may not display the page correctly

Warning: You did not close a PDF Document looping when renderImageWithDPI

How can I get bookmarks page number in a PDF file with Apache PdfBox?

Need to split docx file based on string using docx4j Java?

Categories

Resources