PDF Clown not highlighting specific search keyword - Java

I am using PDF Clown with pdfclown-0.2.0-HEAD.jar. I have written the code below to highlight a search keyword in a Chinese-language PDF file; the same code works fine with an English PDF file.
import java.awt.geom.Rectangle2D;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
public class pdfclown2 {

    private static int count;

    public static void main(String[] args) throws IOException {
        highlight("ebook.pdf", "C:\\Users\\Downloads\\6.pdf");
        System.out.println("OK");
    }

    private static void highlight(String inputPath, String outputPath) throws IOException {
        URL url = new URL(inputPath);
        InputStream in = new BufferedInputStream(url.openStream());
        org.pdfclown.files.File file = null;
        try {
            file = new org.pdfclown.files.File("C:\\Users\\Desktop\\pdf\\test123.pdf");
            Map<String, String> m = new HashMap<String, String>();
            m.put("亿元或", "hi");
            m.put("收入亿来", "hi");
            System.out.println("map size: " + m.size());
            long startTime = System.currentTimeMillis();

            // 2. Iterating through the document pages...
            TextExtractor textExtractor = new TextExtractor(true, true);
            for (final Page page : file.getDocument().getPages()) {
                // 2.1. Extract the page text!
                Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
                for (Map.Entry<String, String> entry : m.entrySet()) {
                    Pattern pattern;
                    String searchKey = entry.getKey();
                    final String translationKeyword = entry.getValue();
                    /*
                    if ((searchKey.contains(")") && searchKey.contains("("))
                            || (searchKey.contains("(") && !searchKey.contains(")"))
                            || (searchKey.contains(")") && !searchKey.contains("("))
                            || searchKey.contains("?") || searchKey.contains("*") || searchKey.contains("+")) {
                        pattern = Pattern.compile(Pattern.quote(searchKey), Pattern.CASE_INSENSITIVE);
                    } else
                    */
                    pattern = Pattern.compile(searchKey, Pattern.CASE_INSENSITIVE);

                    // 2.2. Find the text pattern matches!
                    final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));

                    // 2.3. Highlight the text pattern matches!
                    textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                        public boolean hasNext() {
                            return matcher.find();
                        }

                        public Interval<Integer> next() {
                            return new Interval<Integer>(matcher.start(), matcher.end());
                        }

                        public void process(Interval<Integer> interval, ITextString match) {
                            // Defining the highlight box of the text pattern match...
                            System.out.println(match);
                            /*
                            List<Quad> highlightQuads = new ArrayList<Quad>();
                            {
                                Rectangle2D textBox = null;
                                for (TextChar textChar : match.getTextChars()) {
                                    Rectangle2D textCharBox = textChar.getBox();
                                    if (textBox == null) {
                                        textBox = (Rectangle2D) textCharBox.clone();
                                    } else {
                                        if (textCharBox.getY() > textBox.getMaxY()) {
                                            highlightQuads.add(Quad.get(textBox));
                                            textBox = (Rectangle2D) textCharBox.clone();
                                        } else {
                                            textBox.add(textCharBox);
                                        }
                                    }
                                }
                                highlightQuads.add(Quad.get(textBox));
                            }
                            */
                            List<Quad> highlightQuads = new ArrayList<Quad>();
                            List<TextChar> textChars = match.getTextChars();
                            Rectangle2D firstRect = textChars.get(0).getBox();
                            Rectangle2D lastRect = textChars.get(textChars.size() - 1).getBox();
                            Rectangle2D rect = firstRect.createUnion(lastRect);
                            highlightQuads.add(Quad.get(rect));
                            // subtype can be Highlight, Underline, StrikeOut, Squiggly
                            new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);
                        }

                        public void remove() {
                            throw new UnsupportedOperationException();
                        }
                    });
                }
            }

            SerializationModeEnum serializationMode = SerializationModeEnum.Standard;
            file.save(new java.io.File(outputPath), serializationMode);
            System.out.println("file created");
            long endTime = System.currentTimeMillis();
            System.out.println("seconds taken for execution: " + (endTime - startTime) / 1000);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            in.close();
        }
    }
}
Kindly provide your inputs on highlighting a specific search keyword in non-English PDF files.
I am searching for the keyword in the text below, which is in Chinese:
普双套习近平修宪普京利用双套车绕开宪法装班要走普京

Your PDF Clown version
The PDF Clown version you retrieved here from Tymate's Maven repository on GitHub was pushed there on April 23rd, 2015. The final (as of now) check-in to the PDF Clown Subversion source code repository TRUNK on SourceForge, on the other hand, is from May 27th, 2015, and there actually are some 30 check-ins after April 23rd, 2015. Thus, you definitely are not using the most current version of this apparently dead PDF library project.
Using the current 0.2.0 snapshot
I tested your code with the 0.2.0 development version compiled from that trunk, and the result indeed is different: it is better insofar as the highlights have the width of the sought characters and are located nearer to the actual character positions. There still is a bug, though, as the second and third match highlights are somewhat off.
Fixing the bug
The remaining problem actually is not related to the language of the text. It simply is a bug in the processing of one type of PDF text-drawing command, so it can be observed in documents with text in arbitrary languages. Because these commands are very seldom used nowadays, though, the bug is hardly ever observed, let alone reported. Your PDF, on the other hand, does make use of that kind of text-drawing command.
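For illustration: the operators in question are ' ("move to next line and show text") and its variant ". A minimal hand-written content stream fragment (a sketch, not taken from your file) looks like this:

BT
/F1 12 Tf                % select the font
14 TL                    % set the leading
100 700 Td               % position the first line
(first line) Tj          % plain show-text
(second line) '          % move to the next line, then show the text
(third line) '           % move to the next line, then show the text
ET

Each ' must first move the text line matrix down by the leading (14 here) and only then draw the string; as described below, PDF Clown updated the text line matrix only after drawing, which is what displaces the highlights of later matches.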
The bug is in the ShowText class (package org.pdfclown.documents.contents.objects). At the end of the scan method the text line matrix in the graphics state is updated like this if the ShowText instance actually is a ShowTextToNextLine instance derived from it:
if (textScanner == null)
{
    state.setTm(tm);
    if (this instanceof ShowTextToNextLine)
    {state.setTlm((AffineTransform)tm.clone());}
}
The text line matrix here is set to the text matrix after both the move to the next line and the drawing of the text. This is wrong: it must instead be set to the text matrix right after the move to the next line, before the drawing of the text.
This can be fixed e.g. like this:
if (textScanner == null)
{
    state.setTm(tm);
    if (this instanceof ShowTextToNextLine)
        state.getTlm().concatenate(new AffineTransform(1, 0, 0, 1, 0, -state.getLead()));
}
With this change in place the result looks like this:

Related

Using PdfCleanUpTool or PdfAutoSweep causes some text to change to bold, line weights increase and double points change to hearts

I have a weird problem when I try to use iText 7: some parts of the PDF are modified (text changes to bold, line weights increase, and double points change to hearts). In iText version 5.4.4 this didn't happen, but every version since then that I have tried causes this same problem (5 or 7).
Does anyone have a clue why this is happening, and is there anything I could do to bypass this problem? Any help would be appreciated!
If more information is needed, I will try to provide it.
Below is the simple code that I used to test iText 7.
Example PDF Files
package javaapplication1;

import com.itextpdf.kernel.colors.ColorConstants;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.pdfcleanup.PdfCleanUpLocation;
import com.itextpdf.pdfcleanup.PdfCleanUpTool;
import com.itextpdf.pdfcleanup.autosweep.ICleanupStrategy;
import com.itextpdf.pdfcleanup.autosweep.PdfAutoSweep;
import com.itextpdf.pdfcleanup.autosweep.RegexBasedCleanupStrategy;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

public class JavaApplication1 {

    public static final String DEST = "D:/TEMP/TEMP/PDF/result/orientation_result.pdf";
    public static final String DEST2 = "D:/TEMP/TEMP/PDF/result/orientation_result2.pdf";
    public static final String DEST3 = "D:/TEMP/TEMP/PDF/result/orientation_result3.pdf";
    public static final String SRC = "D:/TEMP/TEMP/PDF/TEST_PDF.pdf";

    public static void main(String[] args) throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new JavaApplication1().manipulatePdf(DEST);
        new JavaApplication1().manipulatePdf2(DEST2);
        try (PdfDocument pdf = new PdfDocument(new PdfReader(SRC), new PdfWriter(DEST3))) {
            final ICleanupStrategy cleanupStrategy = new RegexBasedCleanupStrategy(Pattern.compile("2019", Pattern.CASE_INSENSITIVE)).setRedactionColor(ColorConstants.PINK);
            final PdfAutoSweep autoSweep = new PdfAutoSweep(cleanupStrategy);
            autoSweep.cleanUp(pdf);
        } catch (Exception e) {
            System.out.println(e.toString());
        }
    }

    protected void manipulatePdf(String dest) throws IOException {
        PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC), new PdfWriter(dest));
        List<PdfCleanUpLocation> cleanUpLocations = new ArrayList<PdfCleanUpLocation>();
        // The arguments of the PdfCleanUpLocation constructor: the number of the page to be cleaned up,
        // a Rectangle defining the area on the page we want to clean up,
        // and a color which will be used while filling the cleaned area.
        PdfCleanUpLocation location = new PdfCleanUpLocation(1, new Rectangle(97, 405, 383, 40),
                ColorConstants.GRAY);
        cleanUpLocations.add(location);
        PdfCleanUpTool cleaner = new PdfCleanUpTool(pdfDoc, cleanUpLocations);
        cleaner.cleanUp();
        pdfDoc.close();
    }

    protected void manipulatePdf2(String dest) throws IOException {
        PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC), new PdfWriter(dest));
        // If the second argument is true, then regions to be erased are extracted from the redact annotations
        // contained inside the given document. If the second argument is false (the default behavior),
        // use the PdfCleanUpTool.addCleanupLocation(PdfCleanUpLocation)
        // method to set regions to be erased from the document.
        PdfCleanUpTool cleaner = new PdfCleanUpTool(pdfDoc, true);
        cleaner.cleanUp();
        pdfDoc.close();
    }
}
Sorry for the late response. I have now verified that updating to the latest versions corrects this problem. Thanks to @mkl for pointing this out.
Problem solved

How to detect if a PDF page has an image in it

I am working with PDF/A-style PDF documents that have a mixture of scanned, full-page-size images and then a page or two after the image pages that have text in a ColumnText object.
Using Java, how do I detect which pages have an image?
The intent of detecting which pages have either images or text is to determine where the first page with text appears. I need to either edit the text or replace the page(s) with text with updated text. The pages with images would remain untouched.
I'm using iText 5 and don't currently have the option of upgrading to iText 7.
Here's the solution I implemented, based on the answer provided by @mkl:
ImageDetector.java
package org.test.pdf;

import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;

public class ImageDetector implements RenderListener {

    boolean textFound = false;
    boolean imageFound = false;

    public void beginTextBlock() { }

    public void endTextBlock() { }

    public void renderText(TextRenderInfo renderInfo) {
        textFound = true;
    }

    public void renderImage(ImageRenderInfo renderInfo) {
        imageFound = true;
    }
}
PdfDocumentServiceTest.java
package org.test.pdf;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.test.PdfService;
import java.io.IOException;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.annotation.DirtiesContext;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.junit4.SpringRunner;
import org.springframework.transaction.annotation.Transactional;

@ActiveProfiles({"local", "testing"})
@DirtiesContext
@Transactional
@RunWith(SpringRunner.class)
@SpringBootTest
public class PdfDocumentServiceTest {

    @Autowired
    private PdfService pdfService;

    @Test
    public void testFindImagesInPdf() throws IOException {
        final Long pdfId = 1L; // id of a test document; JUnit 4 test methods cannot take parameters
        final byte[] resource = pdfService.getPdf(pdfId);
        int imagePageCount = 0;
        int textPageCount = 0;
        if (resource != null && resource.length > 0) {
            PdfReader reader = new PdfReader(resource);
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            for (int pageNumber = 1; pageNumber <= reader.getNumberOfPages(); pageNumber++) {
                ImageDetector imageDetector = new ImageDetector();
                parser.processContent(pageNumber, imageDetector);
                if (imageDetector.imageFound) {
                    imagePageCount++;
                }
                if (imageDetector.textFound) {
                    textPageCount++;
                }
            }
            Assert.assertTrue(imagePageCount > 0);
            Assert.assertTrue(textPageCount > 0);
        }
    }
}
Using iText 5 you can find out whether images actually are shown on a page by parsing the page content into a custom RenderListener implementation, e.g.
class ImageDetector implements RenderListener {

    boolean imageFound = false;

    public void beginTextBlock() { }

    public void endTextBlock() { }

    public void renderText(TextRenderInfo renderInfo) { }

    public void renderImage(ImageRenderInfo renderInfo) {
        imageFound = true;
    }
}
used like this:
PdfReader reader = new PdfReader(resource);
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
for (int pageNumber = 1; pageNumber <= reader.getNumberOfPages(); pageNumber++)
{
    ImageDetector imageDetector = new ImageDetector();
    parser.processContent(pageNumber, imageDetector);
    if (imageDetector.imageFound) {
        // There is at least one image rendered on this page;
        // thus, handle it as an image page.
    } else {
        // There is no image rendered on this page;
        // thus, handle it as a no-image page.
    }
}
As a possible improvement: In a comment you mention full-page-size images. Thus, in the ImageDetector method renderImage you might want to check the image size before setting imageFound to true. Via the ImageRenderInfo parameter you can retrieve both information on how large the image is displayed on the page and how large it actually is.
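For example, a sketch of such a size check (assuming iText 5.5.x, where ImageRenderInfo exposes the image's current transformation matrix via getImageCTM(); the extra imports are com.itextpdf.text.pdf.PdfDictionary, com.itextpdf.text.pdf.PdfName, com.itextpdf.text.pdf.parser.Matrix and java.io.IOException, and the 400-point threshold is just a placeholder):

public void renderImage(ImageRenderInfo renderInfo) {
    try {
        // The CTM maps the unit square onto the page area the image covers, so
        // (ignoring rotation and skew) its scale entries are the displayed size in points.
        Matrix ctm = renderInfo.getImageCTM();
        float displayedWidth = ctm.get(Matrix.I11);
        float displayedHeight = ctm.get(Matrix.I22);
        // The intrinsic pixel size comes from the image XObject dictionary;
        // comparing it with the displayed size would give the effective resolution.
        PdfDictionary imageDict = renderInfo.getImage().getDictionary();
        float pixelWidth = imageDict.getAsNumber(PdfName.WIDTH).floatValue();
        float pixelHeight = imageDict.getAsNumber(PdfName.HEIGHT).floatValue();
        // Placeholder rule: only count images covering at least 400 x 400 points.
        if (displayedWidth >= 400 && displayedHeight >= 400) {
            imageFound = true;
        }
    } catch (IOException e) {
        // getImage() may fail for unsupported stream filters; skip such images here.
    }
}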
Try the code example below (the Spire.PDF for Java library is needed); hopefully it works for you.
PdfDocument doc = new PdfDocument();
doc.loadFromFile("sample.pdf");
for (int i = 0; i < doc.getPages().getCount(); i++) {
    PdfPageBase page = doc.getPages().get(i);
    PdfImageInfo[] imageInfo = page.getImagesInfo();
    if (imageInfo != null && imageInfo.length > 0) {
        System.out.println("Page " + i + " contains images");
    } else {
        System.out.println("Page " + i + " doesn't contain images");
    }
}
Disclaimer: I work for Spire.

Highlevel interface to PDF-Generation

I need to generate PDF documents with Java. I've tried iText and Apache PDFBox, and I have the impression these are libraries for doing all possible things with PDF.
But I only have a few requirements:
create a PDF document from scratch
add chapters having a title
add plain text to a chapter
add text from HTML to a chapter
add an image to the chapter, scaled to fit if needed
add a table to the chapter
create a header and a footer
have a background image
There is quite a learning curve to do these things with the libraries mentioned above, so I'm dreaming of a high-level API that makes my life much easier, having these few methods:
createChapter(String title)
addPlainText(Chapter chapter,String text)
addHtml(Chapter chapter,String html)
addImage(Chapter chapter,byte[] bytes) (because it's coming out of the DB)
addTable(Chapter chapter, TableModel table)
addHeader(HeaderFooterModel header)
addFooter(HeaderFooterModel footer)
addBackGroundImage(Chapter chapter,byte[] bytes)
Is there something like that already available? This would be cool and save some time.
The following class is a great start at a very high-level API for iText.
I implemented most of the methods you requested; the remainder is left as a challenge to the reader (a sketch of addTable follows the usage example below).
package stackoverflow;

import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.io.image.ImageDataFactory;
import com.itextpdf.kernel.color.Color;
import com.itextpdf.kernel.color.DeviceRgb;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.IBlockElement;
import com.itextpdf.layout.element.IElement;
import com.itextpdf.layout.element.Image;
import com.itextpdf.layout.element.Paragraph;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ITextForDummiesFacade
{
    // iText IO
    private PdfDocument pdfDocument;
    private Document layoutDocument;

    // font sizes
    private float regularFontSize = 12f;
    private float chapterTitleFontSize = 14f;

    // font colors
    private Color chapterFontColor = new DeviceRgb(249, 157, 37);
    private Color regularFontColor = new DeviceRgb(100, 100, 100);

    // structure
    private Map<String, Integer> chapterNames = new HashMap<>();
    private Map<Integer, List<IElement>> elementsPerChapter = new HashMap<>();

    public ITextForDummiesFacade(OutputStream os) throws IOException
    {
        this.pdfDocument = new PdfDocument(new PdfWriter(os));
        this.layoutDocument = new Document(pdfDocument);
    }

    public ITextForDummiesFacade(File outputFile) throws IOException
    {
        this.pdfDocument = new PdfDocument(new PdfWriter(outputFile));
        this.layoutDocument = new Document(pdfDocument);
    }

    public boolean createChapter(String title)
    {
        if (chapterNames.containsKey(title))
            return false;
        int nextID = chapterNames.size();
        chapterNames.put(title, nextID);
        elementsPerChapter.put(nextID, new ArrayList<IElement>());
        elementsPerChapter.get(nextID).add(new Paragraph(title)
                .setFontSize(chapterTitleFontSize)
                .setFontColor(chapterFontColor));
        return true;
    }

    public boolean addPlainText(String chapter, String text)
    {
        if (!chapterNames.containsKey(chapter))
            return false;
        int ID = chapterNames.get(chapter);
        elementsPerChapter.get(ID).add(new Paragraph(text)
                .setFontSize(regularFontSize)
                .setFontColor(regularFontColor));
        return true;
    }

    public boolean addHTML(String chapter, String HTML)
    {
        if (!chapterNames.containsKey(chapter))
            return false;
        int ID = chapterNames.get(chapter);
        try
        {
            elementsPerChapter.get(ID).addAll(HtmlConverter.convertToElements(HTML));
        } catch (IOException e)
        {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    public boolean addImage(String chapter, byte[] image)
    {
        if (!chapterNames.containsKey(chapter))
            return false;
        int ID = chapterNames.get(chapter);
        elementsPerChapter.get(ID).add(new Image(ImageDataFactory.create(image)));
        return true;
    }

    private void write()
    {
        for (int i = 0; i < chapterNames.size(); i++)
        {
            for (IElement e : elementsPerChapter.get(i))
                if (e instanceof IBlockElement)
                    layoutDocument.add((IBlockElement) e);
        }
    }

    public void close()
    {
        write();
        layoutDocument.flush();
        layoutDocument.close();
    }
}
You can then easily call this facade to do the work for you.
File outputFile = new File(System.getProperty("user.home"), "output.pdf");
ITextForDummiesFacade facade = new ITextForDummiesFacade(outputFile);
facade.createChapter("Chapter 1");
facade.addPlainText("Chapter 1","Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.");
facade.close();
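The missing addTable can follow the same pattern. Here is a hedged sketch (it uses a plain String[][] instead of the TableModel from the question to stay self-contained; Table and Cell are the com.itextpdf.layout.element classes, and since Table implements IBlockElement, the existing write() method picks it up unchanged):

public boolean addTable(String chapter, String[][] rows)
{
    if (!chapterNames.containsKey(chapter))
        return false;
    int ID = chapterNames.get(chapter);
    // one column per cell of the first row
    Table table = new Table(rows[0].length);
    for (String[] row : rows)
        for (String cell : row)
            table.addCell(new Cell().add(new Paragraph(cell)
                    .setFontSize(regularFontSize)
                    .setFontColor(regularFontColor)));
    elementsPerChapter.get(ID).add(table);
    return true;
}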

Getting sub links of a URL using jsoup

Consider a URL www.example.com; it may have plenty of links, some internal and some external. I want to get a list of all the sub-links, not the sub-sub-links, only the direct sub-links.
E.g. if there are four links as follows:
1) www.example.com/images/main
2) www.example.com/data
3) www.example.com/users
4) www.example.com/admin/data
then out of the four, only 2) and 3) are of use, as they are sub-links, not sub-sub (and so on) links. Is there a way to achieve this through jsoup? If this cannot be achieved through jsoup, can someone introduce me to some other Java API?
Also note that it should be a link of the parent URL which is initially sent (i.e. www.example.com).
If I understand correctly that a sub-link can contain only one slash, you can try counting the number of slashes, for example:
List<String> list = new ArrayList<>();
list.add("www.example.com/images/main");
list.add("www.example.com/data");
list.add("www.example.com/users");
list.add("www.example.com/admin/data");

for (String link : list) {
    if ((link.length() - link.replaceAll("[/]", "").length()) == 1) {
        System.out.println(link);
    }
}
link.length() counts the characters of the link; link.replaceAll("[/]", "").length() counts them without slashes, so the difference is the number of slashes. If the difference equals one, the link is a direct sub-link; otherwise it is not.
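Note that plain slash counting breaks as soon as the links carry a scheme, because http:// alone contributes two slashes. A slightly more robust sketch (my own, assuming absolute URLs) parses the link with java.net.URI and counts path segments instead:

import java.net.URI;

public class SubLinkFilter {

    // true if the link points exactly one level below the host root,
    // e.g. http://www.example.com/data but not http://www.example.com/admin/data
    static boolean isDirectSubLink(String link) {
        String path = URI.create(link).getPath();            // e.g. "/admin/data"
        if (path == null || path.isEmpty())
            return false;
        if (path.startsWith("/"))
            path = path.substring(1);                        // drop the leading slash
        if (path.endsWith("/"))
            path = path.substring(0, path.length() - 1);     // normalize a trailing slash
        return !path.isEmpty() && !path.contains("/");       // exactly one path segment
    }

    public static void main(String[] args) {
        System.out.println(isDirectSubLink("http://www.example.com/data"));       // true
        System.out.println(isDirectSubLink("http://www.example.com/admin/data")); // false
    }
}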
EDIT: How will I scan the whole website for sub-links?
One approach is the robots.txt file (Robots Exclusion Standard): it often lists the top-level paths of a website, for example https://stackoverflow.com/robots.txt. So the idea is to read this file and extract the sub-links from it. Here is a piece of code that can help you:
// Needs: java.io.BufferedReader, java.io.InputStreamReader, java.net.URL,
// java.util.ArrayList, java.util.List
public static void main(String[] args) throws Exception {
    // Your web site
    String website = "http://stackoverflow.com";
    // We will read the URL https://stackoverflow.com/robots.txt
    URL url = new URL(website + "/robots.txt");
    // List of your sub-links
    List<String> list;
    // Read the file with BufferedReader
    try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()))) {
        String subLink;
        list = new ArrayList<>();
        // Loop through the file
        while ((subLink = in.readLine()) != null) {
            // Check if the line matches this regex; if yes, add it to the list
            if (subLink.matches("Disallow: \\/\\w+\\/")) {
                list.add(website + "/" + subLink.replace("Disallow: /", ""));
            } else {
                System.out.println("not match");
            }
        }
    }
    // Print the result
    System.out.println(list);
}
This will show you :
[https://stackoverflow.com/posts/, https://stackoverflow.com/posts?,
https://stackoverflow.com/search/, https://stackoverflow.com/search?,
https://stackoverflow.com/feeds/, https://stackoverflow.com/feeds?,
https://stackoverflow.com/unanswered/,
https://stackoverflow.com/unanswered?, https://stackoverflow.com/u/,
https://stackoverflow.com/messages/, https://stackoverflow.com/ajax/,
https://stackoverflow.com/plugins/]
Here is a demo of the regex I used.
Hope this can help you.
To scan the links on a web page you can use the jsoup library.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

class read_data {

    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect("**your_url**").get();
            Elements links = doc.select("a");
            List<String> list = new ArrayList<>();
            for (Element link : links) {
                list.add(link.attr("abs:href"));
            }
        } catch (IOException ex) {
            // handle or log the exception
        }
    }
}
The list can then be filtered as suggested in the previous answer.
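For example (a sketch; the parent URL is a placeholder), keeping only the direct sub-links of the parent:

String parent = "http://www.example.com/";
for (String link : list) {
    // a direct sub-link starts with the parent URL and has no further slash after it
    if (link.startsWith(parent) && !link.substring(parent.length()).contains("/")) {
        System.out.println(link);
    }
}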
The code for reading all the links on a website is given below. I have used http://stackoverflow.com/ for illustration. I would recommend you go through the company's terms of use before scraping its website.
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class readAllLinks {

    public static Set<String> uniqueURL = new HashSet<String>();
    public static String my_site;

    public static void main(String[] args) {
        readAllLinks obj = new readAllLinks();
        my_site = "stackoverflow.com";
        obj.get_links("http://stackoverflow.com/");
    }

    private void get_links(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            Elements links = doc.select("a");
            links.stream().map((link) -> link.attr("abs:href")).forEachOrdered((this_url) -> {
                boolean add = uniqueURL.add(this_url);
                if (add && this_url.contains(my_site)) {
                    System.out.println(this_url);
                    get_links(this_url);
                }
            });
        } catch (IOException ex) {
            // handle or log the exception
        }
    }
}
You will get the list of all the links in the uniqueURL field.

Merging MS Word documents with Java

I'm looking for Java libraries that read and write MS Word documents.
What I have to do is:
read a template file, .dot or .doc, and fill it with data read from a DB
take data from another Word document and merge it with the file described above, preserving paragraph formats
users may make updates to the file.
I've searched and found Apache POI and OpenOffice UNO.
The first one can easily read a template and replace any placeholders with my own data from the DB. I didn't find anything about merging two or more documents.
OpenOffice UNO looks more stable but also more complex; furthermore, I'm not sure it has the ability to merge documents.
Are we looking in the right direction?
Another solution I've thought of is to convert the .doc files to .docx; that way I found more libraries that can help with merging documents.
But how can I do that?
Thanks!
You could take a look at Docmosis since it provides the four features you have mentioned (data population, template/document merging, DOC format and java interface). It has a couple of flavours (download, online service), but you could sign up for a free trial of the cloud service to see if Docmosis can do what you want (then you don't have to install anything) or read the online documentation.
It uses OpenOffice under the hood (you can see from the developer guide installation instructions) which does pretty decent conversions between documents. The UNO API has some complications - I would suggest either Docmosis or JODReports to isolate your project from UNO directly.
Hope that helps.
import java.io.File;
import java.util.List;
import javax.xml.bind.JAXBException;
import org.docx4j.dml.CTBlip;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.Part;
import org.docx4j.openpackaging.parts.PartName;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageBmpPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageEpsPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageGifPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageJpegPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImagePngPart;
import org.docx4j.openpackaging.parts.WordprocessingML.ImageTiffPart;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart.AddPartBehaviour;
import org.docx4j.relationships.Relationship;

public class MultipleDocMerge {

    public static void main(String[] args) throws Docx4JException, JAXBException {
        File first = new File("D:\\Mreg.docx");
        File second = new File("D:\\Mreg1.docx");
        File third = new File("D:\\Mreg4&19.docx");
        File fourth = new File("D:\\test12.docx");

        WordprocessingMLPackage f = WordprocessingMLPackage.load(first);
        WordprocessingMLPackage s = WordprocessingMLPackage.load(second);
        WordprocessingMLPackage a = WordprocessingMLPackage.load(third);
        WordprocessingMLPackage e = WordprocessingMLPackage.load(fourth);

        List body = s.getMainDocumentPart().getJAXBNodesViaXPath("//w:body", false);
        for (Object b : body) {
            List filhos = ((org.docx4j.wml.Body) b).getContent();
            for (Object k : filhos)
                f.getMainDocumentPart().addObject(k);
        }

        List body1 = a.getMainDocumentPart().getJAXBNodesViaXPath("//w:body", false);
        for (Object b : body1) {
            List filhos = ((org.docx4j.wml.Body) b).getContent();
            for (Object k : filhos)
                f.getMainDocumentPart().addObject(k);
        }

        List body2 = e.getMainDocumentPart().getJAXBNodesViaXPath("//w:body", false);
        for (Object b : body2) {
            List filhos = ((org.docx4j.wml.Body) b).getContent();
            for (Object k : filhos)
                f.getMainDocumentPart().addObject(k);
        }

        List<Object> blips = e.getMainDocumentPart().getJAXBNodesViaXPath("//a:blip", false);
        for (Object el : blips) {
            try {
                CTBlip blip = (CTBlip) el;
                RelationshipsPart parts = e.getMainDocumentPart().getRelationshipsPart();
                Relationship rel = parts.getRelationshipByID(blip.getEmbed());
                Part part = parts.getPart(rel);
                if (part instanceof ImagePngPart)
                    System.out.println(((ImagePngPart) part).getBytes());
                if (part instanceof ImageJpegPart)
                    System.out.println(((ImageJpegPart) part).getBytes());
                if (part instanceof ImageBmpPart)
                    System.out.println(((ImageBmpPart) part).getBytes());
                if (part instanceof ImageGifPart)
                    System.out.println(((ImageGifPart) part).getBytes());
                if (part instanceof ImageEpsPart)
                    System.out.println(((ImageEpsPart) part).getBytes());
                if (part instanceof ImageTiffPart)
                    System.out.println(((ImageTiffPart) part).getBytes());
                Relationship newrel = f.getMainDocumentPart().addTargetPart(part, AddPartBehaviour.RENAME_IF_NAME_EXISTS);
                blip.setEmbed(newrel.getId());
                f.getMainDocumentPart().addTargetPart(e.getParts().getParts().get(new PartName("/word/" + rel.getTarget())));
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        File saved = new File("D:\\saved1.docx");
        f.save(saved);
    }
}
I've developed the following class (using Apache POI):
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;

public class WordMerge {

    private final OutputStream result;
    private final List<InputStream> inputs;
    private XWPFDocument first;

    public WordMerge(OutputStream result) {
        this.result = result;
        inputs = new ArrayList<>();
    }

    public void add(InputStream stream) throws Exception {
        inputs.add(stream);
        OPCPackage srcPackage = OPCPackage.open(stream);
        XWPFDocument src1Document = new XWPFDocument(srcPackage);
        if (inputs.size() == 1) {
            first = src1Document;
        } else {
            CTBody srcBody = src1Document.getDocument().getBody();
            first.getDocument().addNewBody().set(srcBody);
        }
    }

    public void doMerge() throws Exception {
        first.write(result);
    }

    public void close() throws Exception {
        result.flush();
        result.close();
        for (InputStream input : inputs) {
            input.close();
        }
    }
}
And its use:
public static void main(String[] args) throws Exception {
    FileOutputStream faos = new FileOutputStream("/home/victor/result.docx");
    WordMerge wm = new WordMerge(faos);
    wm.add(new FileInputStream("/home/victor/001.docx"));
    wm.add(new FileInputStream("/home/victor/002.docx"));
    wm.doMerge();
    wm.close();
}
Note that the Apache POI code does not work for images: addNewBody().set(srcBody) copies only the body XML, so pictures referenced from the appended body (their binary parts and relationships) are not carried over.
