convert html text into pdf without losing formatting

convert html text into pdf without losing formatting - java

I used xmlworker-5.5.9.jar and itextpdf-5.5.13.jar,
In my web application I use CKEditor.
At submit button i want to convert the CK content in .pdf format,
I used this code and it is working properly:
public void createPDF(String text) throws DocumentException, IOException
{
String fileName="f:\\test.pdf";
Document document=new Document();
PdfWriter pdfWriter=PdfWriter.getInstance(document, new FileOutputStream(fileName));
document.open();
String finall=text;
InputStream is = new ByteArrayInputStream(finall.getBytes());
XMLWorkerHelper.getInstance().parseXHtml(pdfWriter,document, is);
document.close();
}
but this code does not work with arabic text
I try with this solution without sucsess :
public void createPdf(String htmlContentAr)
{
Charset CHARSET_UTF8 = Charset.forName("UTF-8");
try {
Document pdfDoc = new Document();
PdfWriter writer = PdfWriter.getInstance(pdfDoc, new FileOutputStream("f:\\test.pdf"));
writer.setRgbTransparencyBlending(true);
pdfDoc.open();
StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver();
ElementsCollector elementsHandler = new ElementsCollector();
HtmlPipelineContext htmlContext = new HtmlPipelineContext(new CssAppliersImpl(
new UnicodeFontProvider()));
htmlContext.charSet(CHARSET_UTF8);
htmlContext.setAcceptUnknown(true).autoBookmark(true)
.setTagFactory(Tags.getHtmlTagProcessorFactory());
CssResolverPipeline pipeline = new CssResolverPipeline(cssResolver, new HtmlPipeline(htmlContext,
new ElementHandlerPipeline(elementsHandler, null)));
XMLWorker worker = new XMLWorker(pipeline, true);
XMLParser parser = new XMLParser();
parser.addListener(worker);
parser.parse(new StringReader(htmlContentAr));
PdfPTable mainTable = new PdfPTable(1);
mainTable.setWidthPercentage(100);
PdfPCell cell = new PdfPCell();
cell.setBorder(0);
cell.setHorizontalAlignment(Element.ALIGN_LEFT);
cell.addElement(elementsHandler.getParagraph());
mainTable.addCell(cell);
pdfDoc.add(mainTable);
pdfDoc.close();
} catch (Exception e) {
e.printStackTrace();
}
}
and this is the code of ElementsCollector.java :
import java.util.Iterator;
import java.util.List;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.Font;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfPCell;
import com.itextpdf.text.pdf.PdfPRow;
import com.itextpdf.text.pdf.PdfPTable;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.ElementHandler;
import com.itextpdf.tool.xml.Writable;
import com.itextpdf.tool.xml.html.pdfelement.NoNewLineParagraph;
import com.itextpdf.tool.xml.pipeline.WritableElement;
public class ElementsCollector implements ElementHandler {
private Paragraph _paragraph;
public ElementsCollector() {
_paragraph = new Paragraph();
_paragraph.setAlignment(Element.ALIGN_LEFT);
}
public Paragraph getParagraph() {
return _paragraph;
}
#Override
public void add(Writable htmlElement) {
WritableElement writableElement = (WritableElement) htmlElement;
if (writableElement == null) {
return;
}
for (Element element : writableElement.elements()) {
if (element instanceof NoNewLineParagraph) {
NoNewLineParagraph para = (NoNewLineParagraph) element;
Iterator<Element> it = para.iterator();
while (it.hasNext()) {
Element divChildElement = (Element) it.next();
fixNestedTablesRunDirection(divChildElement);
_paragraph.add(divChildElement);
}
} else {
fixNestedTablesRunDirection(element);
_paragraph.add(element);
}
}
}
private void fixNestedTablesRunDirection(Element element) {
if (element == null) {
return;
}
if (element instanceof PdfPTable) {
PdfPTable table = (PdfPTable) element;
for (PdfPRow row : table.getRows()) {
for (PdfPCell cell : row.getCells()) {
if (cell.getCompositeElements() != null) {
for (Element item : cell.getCompositeElements()) {
List<Chunk> chunks = item.getChunks();
if (chunks != null) {
for (Chunk chunk : chunks) {
Font font = chunk.getFont();
if (font != null) {
String name = font.getFamilyname() != null ? font.getFamilyname()
.toLowerCase() : null;
if (name != null && !name.isEmpty() && name.contains("arabic")) {
cell.setRunDirection(PdfWriter.RUN_DIRECTION_RTL);
if (item instanceof Paragraph
&& ((Paragraph) item).getAlignment() == 2) {
((Paragraph) item).setAlignment(0);
}
continue;
}
}
}
}
}
}
}
}
}
}
}
and this is the code of UnicodeFontProvider.java
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Paths;
import com.itextpdf.text.BaseColor;
import com.itextpdf.text.Font;
import com.itextpdf.text.FontFactory;
import com.itextpdf.text.FontFactoryImp;
import com.itextpdf.text.pdf.BaseFont;
public class UnicodeFontProvider extends FontFactoryImp {
public UnicodeFontProvider() {
String root = System.getenv("SystemRoot");
FileSystems.getDefault();
Path path = Paths.get(root, "fonts");
FontFactory.getFontImp().registerDirectory(path.toString());
// TODO test, works on windows so far
}
public Font getFont(String fontname, String encoding, boolean embedded, float size, int style,
BaseColor color, boolean cached) {
if (fontname!= null && !fontname.isEmpty()) {
return new Font(Font.FontFamily.UNDEFINED, size, style, color);
}
return FontFactory.getFont(fontname, BaseFont.IDENTITY_H, BaseFont.EMBEDDED, size, style, color);
}
}
but nothing has been displayed in pdf file
I think the problem in this line :
parser.parse(new StringReader(htmlContentAr));
updated :
I try with this code :
import java.io.File;
import java.io.IOException;
import com.itextpdf.text.FontProvider;
import com.itextpdf.html2pdf.ConverterProperties;
import com.itextpdf.html2pdf.HtmlConverter;
import com.itextpdf.html2pdf.resolver.font.DefaultFontProvider;
import com.itextpdf.io.font.FontProgram;
import com.itextpdf.io.font.FontProgramFactory;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.font.PdfFontFactory;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
public class TestHTML {
public static final String[] FONTS = {
"src/main/resources/fonts/noto/NotoSans-Regular.ttf",
"src/main/resources/fonts/noto/NotoNaskhArabic-Regular.ttf",
"src/main/resources/fonts/noto/NotoSansHebrew-Regular.ttf"
};
/**
* #param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
}
public void createPdf(String src, String[] fonts, String dest) throws IOException {
ConverterProperties properties = new ConverterProperties();
FontProvider fontProvider = (FontProvider) new DefaultFontProvider(false, false, false);
for (String font : fonts) {
FontProgram fontProgram = FontProgramFactory.createFont(font);
fontProvider.addFont(fontProgram);
}
properties.setFontProvider(fontProvider);
HtmlConverter.convertToPdf(new File(src), new File(dest), properties);
}
}
but I have errors which are related to jar :
error in this line :
fontProvider.addFont(fontProgram);
The method addFont(FontProgram) is undefined for the type FontProvider
and error in :
properties.setFontProvider(fontProvider);
The method addFont(FontProgram) is undefined for the type FontProvider
also error in :
Multiple markers at this line
- The type com.itextpdf.layout.font.FontProvider cannot be resolved. It is indirectly referenced from
required .class files
- The type com.itextpdf.layout.font.FontProvider cannot be resolved. It is indirectly referenced from
required .class files
I used this jar :
kernel-7.0.0.jar ,io-7.0.0.jar ,html2pdf-1.0.2.jar ,itextpdf-5.5.13.jar,xmlworker-5.5.9.jar

Related

gettting exception while reading duplicate column name excel file using sparkexcel library. How to overcome this issue

I am using spark-excel(com.crealytics.spark.excel) library to read excel file. If no duplicate column in excel file, the library working fine. If any duplicate column name occurs in excel file, throwing below exception.
How to overcome this error?
Is there any workaround solution to overcome this?
Exception in thread "main" org.apache.spark.sql.AnalysisException: Found duplicate column(s) in the data schema: `net territory`;
at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:85)

Using spark excel API getting exception .
StructType schema = DataTypes.createStructType(new StructField[]{DataTypes.createStructField("CGISAI", DataTypes.StringType, true), DataTypes.createStructField("SALES TERRITORY", DataTypes.StringType, true)});
SQLContext sqlcxt = new SQLContext(jsc);
Dataset<Row> df = sqlcxt.read()
.format("com.crealytics.spark.excel")
.option("path", "file:///"+siteinfofile)
.option("useHeader", "true")
.option("spark.read.simpleMode", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("inferSchema", "false")
.option("addColorColumns", "False")
.option("sheetName", "sheet1")
.option("startColumn", 22)
.option("endColumn", 23)
//.schema(schema)
.load();
return df;
This is the code I am using. I am using sparkexcel library from com.crealytics.spark.excel.
I want the solution to identify whether excel file has duplicate columns or not. if have duplicate columns, how to rename/eliminate the duplicate columns.
WorkAround is as below:
convert .xlsx file into .csv . using spark default csv api that can handle duplicate column names by renaming them automatically.
Below is the code to convert from xlsx to csv file.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.huawei.java.tools;
/**
*
* #author Nanaji Jonnadula
*/
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import static org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
public class ExcelXlsx2Csv {
private static class SheetToCSV implements SheetContentsHandler {
private boolean firstCellOfRow = false;
private int currentRow = -1;
private int currentCol = -1;
private StringBuffer lineBuffer = new StringBuffer();
/** * Destination for data */
private FileOutputStream outputStream;
public SheetToCSV(FileOutputStream outputStream) {
this.outputStream = outputStream;
}
#Override
public void startRow(int rowNum) {
/** * If there were gaps, output the missing rows: * outputMissingRows(rowNum - currentRow - 1); */
// Prepare for this row
firstCellOfRow = true;
currentRow = rowNum;
currentCol = -1;
lineBuffer.delete(0, lineBuffer.length()); //clear lineBuffer
}
#Override
public void endRow(int rowNum) {
lineBuffer.append('\n');
try {
outputStream.write(lineBuffer.substring(0).getBytes());
} catch (IOException e) {
System.out.println("save date to file error at row number: {}"+ currentCol);
throw new RuntimeException("save date to file error at row number: " + currentCol);
}
}
#Override
public void cell(String cellReference, String formattedValue, XSSFComment comment) {
if (firstCellOfRow) {
firstCellOfRow = false;
} else {
lineBuffer.append(',');
}
// gracefully handle missing CellRef here in a similar way as XSSFCell does
if (cellReference == null) {
cellReference = new CellAddress(currentRow, currentCol).formatAsString();
}
int thisCol = (new CellReference(cellReference)).getCol();
int missedCols = thisCol - currentCol - 1;
if (missedCols > 1) {
lineBuffer.append(',');
}
currentCol = thisCol;
if (formattedValue.contains("\n")) {
formattedValue = formattedValue.replace("\n", "");
}
formattedValue = "\"" + formattedValue + "\"";
lineBuffer.append(formattedValue);
}
#Override
public void headerFooter(String text, boolean isHeader, String tagName) {
// Skip, no headers or footers in CSV
}
}
private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings,
SheetContentsHandler sheetHandler, InputStream sheetInputStream) throws Exception {
DataFormatter formatter = new DataFormatter();
InputSource sheetSource = new InputSource(sheetInputStream);
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(
styles, null, strings, sheetHandler, formatter, false);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch (ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
public static void process(String srcFile, String destFile,String sheetname_) throws Exception {
File xlsxFile = new File(srcFile);
OPCPackage xlsxPackage = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(xlsxPackage);
XSSFReader xssfReader = new XSSFReader(xlsxPackage);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
int index = 0;
while (iter.hasNext()) {
InputStream stream = iter.next();
String sheetName = iter.getSheetName();
System.out.println(sheetName + " [index=" + index + "]");
if(sheetName.equals(sheetname_)){
FileOutputStream fileOutputStream = new FileOutputStream(destFile);
processSheet(styles, strings, new SheetToCSV(fileOutputStream), stream);
fileOutputStream.flush();
fileOutputStream.close();
}
stream.close();
++index;
}
xlsxPackage.close();
}
public static void main(String[] args) throws Exception {
ExcelXlsx2Csv.process("D:\\data\\latest.xlsx", "D:\\data\\latest.csv","sheet1"); //source , destination, sheetname
}
}

Extract Only Images from PDF File in java using Apache Tika or PDFBox? [duplicate]

I m trying to extract images from a pdf using pdfbox. The example pdf here
But i m getting blank images only.
The code i m trying:-
public static void main(String[] args) {
PDFImageExtract obj = new PDFImageExtract();
try {
obj.read_pdf();
} catch (IOException ex) {
System.out.println("" + ex);
}
}
void read_pdf() throws IOException {
PDDocument document = null;
try {
document = PDDocument.load("C:\\Users\\Pradyut\\Documents\\MCS-034.pdf");
} catch (IOException ex) {
System.out.println("" + ex);
}
List pages = document.getDocumentCatalog().getAllPages();
Iterator iter = pages.iterator();
int i =1;
String name = null;
while (iter.hasNext()) {
PDPage page = (PDPage) iter.next();
PDResources resources = page.getResources();
Map pageImages = resources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
i ++;
}
}
}
}
Thanks

Here is code using PDFBox 2.0.1 that will get a list of all images from the PDF. This is different than the other code in that it will recurse through the document instead of trying to get the images from the top level.
public List<RenderedImage> getImagesFromPDF(PDDocument document) throws IOException {
List<RenderedImage> images = new ArrayList<>();
for (PDPage page : document.getPages()) {
images.addAll(getImagesFromResources(page.getResources()));
}
return images;
}
private List<RenderedImage> getImagesFromResources(PDResources resources) throws IOException {
List<RenderedImage> images = new ArrayList<>();
for (COSName xObjectName : resources.getXObjectNames()) {
PDXObject xObject = resources.getXObject(xObjectName);
if (xObject instanceof PDFormXObject) {
images.addAll(getImagesFromResources(((PDFormXObject) xObject).getResources()));
} else if (xObject instanceof PDImageXObject) {
images.add(((PDImageXObject) xObject).getImage());
}
}
return images;
}

The below GetImagesFromPDF java class get all images in 04-Request-Headers.pdf file and save those files into destination folder PDFCopy.
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
#SuppressWarnings({ "unchecked", "rawtypes", "deprecation" })
public class GetImagesFromPDF {
public static void main(String[] args) {
try {
String sourceDir = "C:/PDFCopy/04-Request-Headers.pdf";// Paste pdf files in PDFCopy folder to read
String destinationDir = "C:/PDFCopy/";
File oldFile = new File(sourceDir);
if (oldFile.exists()) {
PDDocument document = PDDocument.load(sourceDir);
List<PDPage> list = document.getDocumentCatalog().getAllPages();
String fileName = oldFile.getName().replace(".pdf", "_cover");
int totalImages = 1;
for (PDPage page : list) {
PDResources pdResources = page.getResources();
Map pageImages = pdResources.getImages();
if (pageImages != null) {
Iterator imageIter = pageImages.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
pdxObjectImage.write2file(destinationDir + fileName+ "_" + totalImages);
totalImages++;
}
}
}
} else {
System.err.println("File not exists");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}

For PDFBox 2.0.1, pudaykiran's answer must be slightly modified since some APIs have been changed.
public static void testPDFBoxExtractImages() throws Exception {
PDDocument document = PDDocument.load(new File("D:/Temp/Test.pdf"));
PDPageTree list = document.getPages();
for (PDPage page : list) {
PDResources pdResources = page.getResources();
for (COSName c : pdResources.getXObjectNames()) {
PDXObject o = pdResources.getXObject(c);
if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
File file = new File("D:/Temp/" + System.nanoTime() + ".png");
ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject)o).getImage(), "png", file);
}
}
}
}

Just add the .jpeg to the end of your path:
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i + ".jpeg");
That works for me.

You can use PDPage.convertToImage() function which can convert the PDF page into a BufferedImage. Next you can use the BufferedImage to create an Image.
Use the following reference for further detail:
All PDF realated classes in PDFBox you can get in
Apache PDFBox 1.8.3 API
Here you can see PDPage related documentation.
And do not forget to look for PDPage.convertToImage() function in PDPage class.

This is a kotlin version of #Matt's answer.
fun <R> PDResources.onImageResources(block: (RenderedImage) -> (R)): List<R> =
this.xObjectNames.flatMap {
when (val xObject = this.getXObject(it)) {
is PDFormXObject -> xObject.resources.onImageResources(block)
is PDImageXObject -> listOf(block(xObject.image))
else -> emptyList()
}
}
You can use it on PDPage Resources like this:
page.resources.onImageResources { image ->
Files.createTempFile("image", "xxx").also { path->
if(!ImageIO.write(it, "xxx", file.toFile()))
IllegalStateException("Couldn't write image to file")
}
}
Where "xxx" is the format you need (like "jpeg")

For someone who want just copy and paste this ready to use code
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.UUID;
public class ExtractImagesUseCase extends PDFStreamEngine{
private final String filePath;
private final String outputDir;
// Constructor
public ExtractImagesUseCase(String filePath,
String outputDir){
this.filePath = filePath;
this.outputDir = outputDir;
}
// Execute
public void execute(){
try{
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
for(PDPage page : document.getPages()){
processPage(page);
}
}catch(IOException e){
e.printStackTrace();
}
}
#Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException{
String operation = operator.getName();
if("Do".equals(operation)){
COSName objectName = (COSName) operands.get(0);
PDXObject pdxObject = getResources().getXObject(objectName);
if(pdxObject instanceof PDImageXObject){
// Image
PDImageXObject image = (PDImageXObject) pdxObject;
BufferedImage bImage = image.getImage();
// File
String randomName = UUID.randomUUID().toString();
File outputFile = new File(outputDir,randomName + ".png");
// Write image to file
ImageIO.write(bImage, "PNG", outputFile);
}else if(pdxObject instanceof PDFormXObject){
PDFormXObject form = (PDFormXObject) pdxObject;
showForm(form);
}
}
else super.processOperator(operator, operands);
}
}
Demo
public class ExtractImageDemo{
public static void main(String[] args){
String filePath = "C:\\Users\\John\\Downloads\\Documents\\sample-file.pdf";
String outputDir = "C:\\Users\\John\\Downloads\\Documents\\Output";
ExtractImagesUseCase useCase = new ExtractImagesUseCase(
filePath,
outputDir
);
useCase.execute();
}
}

Instead of calling
image.write2file("C:\\Users\\Pradyut\\Documents\\image" + i);
You can use the ImageIO.write() static method to write the RGB image out in whatever format you need. Here I've used PNG:
File outputFile = new File( "C:\\Users\\Pradyut\\Documents\\image" + i + ".png");
ImageIO.write( image.getRGBImage(), "png", outputFile);

Extended Html Unit Driver with screenshot capabilities

I am working on code (java) that will open up a selenium headless browser with HTML Unit webdriver and then take a screenshot. Unfortunately, HTML Unit does not support screenshots on its own, so I had to download an extended version:
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.openqa.selenium.Capabilities;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.internal.Base64Encoder;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebWindow;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class ScreenCaptureHtmlUnitDriver extends HtmlUnitDriver implements TakesScreenshot {
private static Map<String, byte[]> imagesCache = Collections.synchronizedMap(new HashMap<String, byte[]>());
private static Map<String, String> cssjsCache = Collections.synchronizedMap(new HashMap<String, String>());
// http://stackoverflow.com/questions/4652777/java-regex-to-get-the-urls-from-css
private final static Pattern cssUrlPattern = Pattern.compile("background(-image)?[\\s]*:[^url]*url[\\s]*\\([\\s]*([^\\)]*)[\\s]*\\)[\\s]*");// ?<url>
public ScreenCaptureHtmlUnitDriver() {
super();
}
public ScreenCaptureHtmlUnitDriver(boolean enableJavascript) {
super(enableJavascript);
}
public ScreenCaptureHtmlUnitDriver(Capabilities capabilities) {
super(capabilities);
}
public ScreenCaptureHtmlUnitDriver(BrowserVersion version) {
super(version);
DesiredCapabilities var = ((DesiredCapabilities) getCapabilities());
var.setCapability(CapabilityType.TAKES_SCREENSHOT, true);
}
//#Override
#SuppressWarnings("unchecked")
public <X> X getScreenshotAs(OutputType<X> target) throws WebDriverException {
byte[] archive = new byte[0];
try {
archive = downloadCssAndImages(getWebClient(), (HtmlPage) getCurrentWindow().getEnclosedPage());
} catch (Exception e) {
}
if(target.equals(OutputType.BASE64)){
return target.convertFromBase64Png(new Base64Encoder().encode(archive));
}
if(target.equals(OutputType.BYTES)){
return (X) archive;
}
return (X) archive;
}
// http://stackoverflow.com/questions/2244272/how-can-i-tell-htmlunits-webclient-to-download-images-and-css
protected byte[] downloadCssAndImages(WebClient webClient, HtmlPage page) throws Exception {
WebWindow currentWindow = webClient.getCurrentWindow();
Map<String, String> urlMapping = new HashMap<String, String>();
Map<String, byte[]> files = new HashMap<String, byte[]>();
WebWindow window = null;
try {
window = webClient.getWebWindowByName(page.getUrl().toString()+"_screenshot");
webClient.getPage(window, new WebRequest(page.getUrl()));
} catch (Exception e) {
window = webClient.openWindow(page.getUrl(), page.getUrl().toString()+"_screenshot");
}
String xPathExpression = "//*[name() = 'img' or name() = 'link' and (#type = 'text/css' or #type = 'image/x-icon') or #type = 'text/javascript']";
List<?> resultList = page.getByXPath(xPathExpression);
Iterator<?> i = resultList.iterator();
while (i.hasNext()) {
try {
HtmlElement el = (HtmlElement) i.next();
String resourceSourcePath = el.getAttribute("src").equals("") ? el.getAttribute("href") : el
.getAttribute("src");
if (resourceSourcePath == null || resourceSourcePath.equals(""))
continue;
URL resourceRemoteLink = page.getFullyQualifiedUrl(resourceSourcePath);
String resourceLocalPath = mapLocalUrl(page, resourceRemoteLink, resourceSourcePath, urlMapping);
urlMapping.put(resourceSourcePath, resourceLocalPath);
if (!resourceRemoteLink.toString().endsWith(".css")) {
byte[] image = downloadImage(webClient, window, resourceRemoteLink);
files.put(resourceLocalPath, image);
} else {
String css = downloadCss(webClient, window, resourceRemoteLink);
for (String cssImagePath : getLinksFromCss(css)) {
URL cssImagelink = page.getFullyQualifiedUrl(cssImagePath.replace("\"", "").replace("\'", "")
.replace(" ", ""));
String cssImageLocalPath = mapLocalUrl(page, cssImagelink, cssImagePath, urlMapping);
files.put(cssImageLocalPath, downloadImage(webClient, window, cssImagelink));
}
files.put(resourceLocalPath, replaceRemoteUrlsWithLocal(css, urlMapping)
.replace("resources/", "./").getBytes());
}
} catch (Exception e) {
}
}
String pagesrc = replaceRemoteUrlsWithLocal(page.getWebResponse().getContentAsString(), urlMapping);
files.put("page.html", pagesrc.getBytes());
webClient.setCurrentWindow(currentWindow);
return createZip(files);
}
String downloadCss(WebClient webClient, WebWindow window, URL resourceUrl) throws Exception {
if (cssjsCache.get(resourceUrl.toString()) == null) {
cssjsCache.put(resourceUrl.toString(), webClient.getPage(window, new WebRequest(resourceUrl))
.getWebResponse().getContentAsString());
}
return cssjsCache.get(resourceUrl.toString());
}
byte[] downloadImage(WebClient webClient, WebWindow window, URL resourceUrl) throws Exception {
if (imagesCache.get(resourceUrl.toString()) == null) {
imagesCache.put(
resourceUrl.toString(),
IOUtils.toByteArray(webClient.getPage(window, new WebRequest(resourceUrl)).getWebResponse()
.getContentAsStream()));
}
return imagesCache.get(resourceUrl.toString());
}
public static byte[] createZip(Map<String, byte[]> files) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
ZipOutputStream zipfile = new ZipOutputStream(bos);
Iterator<String> i = files.keySet().iterator();
String fileName = null;
ZipEntry zipentry = null;
while (i.hasNext()) {
fileName = i.next();
zipentry = new ZipEntry(fileName);
zipfile.putNextEntry(zipentry);
zipfile.write(files.get(fileName));
}
zipfile.close();
return bos.toByteArray();
}
List<String> getLinksFromCss(String css) {
List<String> result = new LinkedList<String>();
Matcher m = cssUrlPattern.matcher(css);
while (m.find()) { // find next match
result.add( m.group(2));
}
return result;
}
String replaceRemoteUrlsWithLocal(String source, Map<String, String> replacement) {
for (String object : replacement.keySet()) {
// background:url(http://org.com/images/image.gif)
source = source.replace(object, replacement.get(object));
}
return source;
}
String mapLocalUrl(HtmlPage page, URL link, String path, Map<String, String> replacementToAdd) throws Exception {
String resultingFileName = "resources/" + FilenameUtils.getName(link.getFile());
replacementToAdd.put(path, resultingFileName);
return resultingFileName;
}
}
Anyway, I found this code, but I am unsure how to use this code to actually take a screenshot.
I would like the screenshot to be a .jpg and to be able to be stored in a certain folder.
In the long haul, I am also going to need to run the screenshot code repeatedly.
Any help would be appreciated.

Reading multiple tabs of a huge excel file in Java using XSS and Event

I am using this code from (by author: lchen) which reads contents from excel file based on number of rows I provide into method 'readRow()'.
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.InputSource;
public class TestLargeFileRead {
private int rowNum = 0;
private OPCPackage opcPkg;
private ReadOnlySharedStringsTable stringsTable;
private XMLStreamReader xmlReader;
public void XExcelFileReader(String excelPath) throws Exception {
opcPkg = OPCPackage.open(excelPath, PackageAccess.READ);
this.stringsTable = new ReadOnlySharedStringsTable(opcPkg);
XSSFReader xssfReader = new XSSFReader(opcPkg);
XMLInputFactory factory = XMLInputFactory.newInstance();
InputStream inputStream = xssfReader.getSheetsData().next();
xmlReader = factory.createXMLStreamReader(inputStream);
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals("sheetData"))
break;
}
}
}
public int rowNum() {
return rowNum;
}
public List<String[]> readRows(int batchSize) throws XMLStreamException {
String elementName = "row";
List<String[]> dataRows = new ArrayList<String[]>();
if (batchSize > 0) {
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals(elementName)) {
rowNum++;
dataRows.add(getDataRow());
if (dataRows.size() == batchSize)
break;
}
}
}
}
return dataRows;
}
private String[] getDataRow() throws XMLStreamException {
List<String> rowValues = new ArrayList<String>();
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals("c")) {
CellReference cellReference = new CellReference(
xmlReader.getAttributeValue(null, "r"));
// Fill in the possible blank cells!
while (rowValues.size() < cellReference.getCol()) {
rowValues.add("");
}
String cellType = xmlReader.getAttributeValue(null, "t");
rowValues.add(getCellValue(cellType));
}
} else if (xmlReader.isEndElement()
&& xmlReader.getLocalName().equals("row")) {
break;
}
}
return rowValues.toArray(new String[rowValues.size()]);
}
private String getCellValue(String cellType) throws XMLStreamException {
String value = ""; // by default
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals("v")) {
if (cellType != null && cellType.equals("s")) {
int idx = Integer.parseInt(xmlReader.getElementText());
return new XSSFRichTextString(
stringsTable.getEntryAt(idx)).toString();
} else {
return xmlReader.getElementText();
}
}
} else if (xmlReader.isEndElement()
&& xmlReader.getLocalName().equals("c")) {
break;
}
}
return value;
}
#Override
protected void finalize() throws Throwable {
if (opcPkg != null)
opcPkg.close();
super.finalize();
}
public static void main(String[] args) {
try {
TestLargeFileRead howto = new TestLargeFileRead();
howto.XExcelFileReader("D:\\TEMP_CATALOG\\H1.xlsx");
} catch (Exception e) {
e.printStackTrace();
}
}
}
But it reads only First SHEET's contents and discards other subsequent SHEETS. My requirement is to read SHEET name; and based on name read that SHEET's contents. Can anyone help me to customize this above code fetch SHEET NAME and their contents ? please ?

The key class you need to work with, and tweak your use of, is XSSFReader. If you take a look at the Javadocs for it, you'll see it provides an Iterator of InputStreams of all the sheets, and a way to get at the root Workbook stream.
If you want to access all the sheets, you need to change these lines:
InputStream inputStream = xssfReader.getSheetsData().next();
xmlReader = factory.createXMLStreamReader(inputStream);
Into something more like:
Iterator<InputStream> sheetsData = xssfReader.getSheetsData();
while (sheetsData.hasNext()) {
InputStream inputStream = sheetsData.next();
xmlReader = factory.createXMLStreamReader(inputStream);
....
}
If you also want to get the sheet name as well, you'll want to do something like what is shown in the Apache POI XLSX event-based text extractor
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
while (sheetsData.hasNext()) {
InputStream inputStream = sheetsData.next();
String sheetName = iter.getSheetName();
if (sheetName.equalsIgnoreCase("TheSheetIWant")) {
xmlReader = factory.createXMLStreamReader(inputStream);
....
}
}
If you want to know more about doing this stuff, then one of the best examples, that's easy to read and follow, is XSSFEventBasedExcelExtractor that comes with Apache POI - read the code for that and learn!

itext Converting PDF to csv

I am trying to use itext framework to convert a pdf file into a csv for import into excel.
The output is garbled and I pressume I am missing a step in regards to format conversion however I can't seem to find the information in the itext site and am looking for assistance.
Current is as below.
package com.pdf.convert;
import java.io.FileOutputStream;
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfWriter;
public class ThirdPDF {
private static String INPUTFILE = "/location/test.pdf";
private static String OUTPUTFILE = "/location/test.csv";
public static void main(String[] args) throws DocumentException,
IOException {
Document document = new Document();
PdfWriter writer = PdfWriter.getInstance(document,
new FileOutputStream(OUTPUTFILE));
document.open();
PdfReader reader = new PdfReader(INPUTFILE);
int n = reader.getNumberOfPages();
PdfImportedPage page;
// Go through all pages
for (int i = 1; i <= n; i++) {
// Only page number 2 will be included
if (i == 2) {
page = writer.getImportedPage(reader, i);
Image instance = Image.getInstance(page);
document.add(instance);
}
}
document.close();
}
}

Converting PDF file to CSV file.
Present Directory and File creation is based on Android Framework.
Change your path and Directory as per your Framework Accordingly.
private void convertPDFToCSV(String pdfFilePath) {
String myfolder = Environment.getExternalStorageDirectory() + "/Mycsv";
if (createFolder(myfolder)) {
try {
Document document = new Document();
document.open();
FileOutputStream fos=new FileOutputStream(myfolder + "/MyCSVFile.csv");
StringBuilder parsedText=new StringBuilder();
PdfReader reader1 = new PdfReader(pdfFilePath);
int n = reader1.getNumberOfPages();
for (int i = 0; i <n ; i++) {
parsedText.append(parsedText+PdfTextExtractor.getTextFromPage(reader1, i+1).trim()+"\n") ;
//Extracting the content fromx the different pages
}
StringReader stReader = new StringReader(parsedText.toString());
int t;
while((t=stReader.read())>0)
fos.write(t);
document.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
private boolean createFolder(String myfolder) {
File f = new File(myfolder);
if (!f.exists()) {
if (!f.mkdir()) {
return false;
} else {
return true;
}
}else{
return true;
}
}

Develop Reference

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

convert html text into pdf without losing formatting - java

Related

gettting exception while reading duplicate column name excel file using sparkexcel library. How to overcome this issue

Extract Only Images from PDF File in java using Apache Tika or PDFBox? [duplicate]

Extended Html Unit Driver with screenshot capabilities

Reading multiple tabs of a huge excel file in Java using XSS and Event

itext Converting PDF to csv

Categories

Resources