I have a class which ingests .xlsx-files. I took it from this example and modified it for my needs:
https://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java
Now the application processes some files just fine, others not at all. If I change a single field or even a single character in one of the non-working files and save it again, the whole content is processed correctly. Does anyone have an idea what the reason might be? (IMHO it lies somewhere within the original Excel files.)
To whom it may help, here is my code:
package com.goodgamestudios.icosphere.service.fileReader;
import com.goodgamestudios.icosphere.datamodel.DataSet;
import com.goodgamestudios.icosphere.datamodel.Tuple;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class ExcelFileReader implements FileReader {
static final Logger LOG = LoggerFactory.getLogger(ExcelFileReader.class);
private SheetHandler handler;
#Override
public DataSet getDataFromFile(File file) throws IOException {
LOG.info("Start ingesting file {}");
try {
OPCPackage pkg = OPCPackage.open(file);
XSSFReader reader = new XSSFReader(pkg);
StylesTable styles = reader.getStylesTable();
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);
SharedStringsTable sst = reader.getSharedStringsTable();
XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
handler = new SheetHandler(styles, strings, 24);
parser.setContentHandler(handler);
// rId2 found by processing the Workbook
// Seems to either be rId# or rSheet#
System.out.println("yooooo 1");
InputStream sheet2 = reader.getSheet("rId2");
System.out.println("yooooo 2");
InputSource sheetSource = new InputSource(sheet2);
System.out.println("yooooo 3");
parser.parse(sheetSource);
LOG.debug("{} rows parsed", handler.getData().getRows().size() + 1);
sheet2.close();
return handler.getData();
} catch (OpenXML4JException | SAXException ex) {
LOG.warn("Unable to parse file {}", file.getName());
LOG.warn("Exception: {}: ", ex);
}
return null;
}
/**
* See org.xml.sax.helpers.DefaultHandler javadocs
*
* Derived from http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
* <p/>
* Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
* http://www.ecma-international.org/publications/standards/Ecma-376.htm
* <p/>
* A web-friendly version is http://openiso.org/Ecma/376/Part4
*/
private static class SheetHandler extends DefaultHandler {
boolean isFirstRow = true;
private int quantityOfColumns;
private int currentColumnNumber = 1;
int currentRowNumber = 1;
private int rowNumberOfLastCell = 1;
private DataSet data = new DataSet();
private Tuple tuple;
/**
* Table with styles
*/
private StylesTable stylesTable;
/**
* Table with unique strings
*/
private ReadOnlySharedStringsTable sharedStringsTable;
/**
* Number of columns to read starting with leftmost
*/
private final int minColumnCount;
// Set when V start element is seen
private boolean vIsOpen;
// Set when cell start element is seen;
// used when cell close element is seen.
private xssfDataType nextDataType;
// Used to format numeric cell values.
private short formatIndex;
private String formatString;
private final DataFormatter formatter;
// The last column printed to the output stream
private int lastColumnNumber = -1;
// Gathers characters as they are seen.
private StringBuffer value;
static final Logger LOG = LoggerFactory.getLogger(SheetHandler.class);
/**
 * Creates a handler for one worksheet part.
 *
 * @param styles  table used to look up the number format of styled cells
 * @param strings shared-strings table used to resolve cells of type "s"
 * @param cols    stored as the minimum column count
 */
private SheetHandler(StylesTable styles,
        ReadOnlySharedStringsTable strings,
        int cols) {
    stylesTable = styles;
    sharedStringsTable = strings;
    minColumnCount = cols;
    formatter = new DataFormatter();
    // Cells without an explicit type attribute are treated as numbers.
    nextDataType = xssfDataType.NUMBER;
    value = new StringBuffer();
    LOG.debug("Sheethandler created");
}
/**
 * Handles the start of an element of the worksheet XML.
 *
 * <p>For a value element the character buffer is reset and capturing is switched on.
 * For a cell element ("c") the column index, the data type (attribute "t") and, for
 * styled numeric cells, the number format (attribute "s") are recorded for use when
 * the value element closes.
 *
 * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
 */
@Override
public void startElement(String uri, String localName, String name,
        Attributes attributes) throws SAXException {
    if ("inlineStr".equals(name) || "v".equals(name)) {
        vIsOpen = true;
        // Clear contents cache
        value.setLength(0);
        return;
    }
    if (!"c".equals(name)) {
        return;
    }

    currentColumnNumber = columnOf(attributes.getValue("r"));

    // Set up defaults.
    nextDataType = xssfDataType.NUMBER;
    formatIndex = -1;
    formatString = null;

    String cellType = attributes.getValue("t");
    String cellStyleStr = attributes.getValue("s");
    if ("b".equals(cellType)) {
        nextDataType = xssfDataType.BOOL;
    } else if ("e".equals(cellType)) {
        nextDataType = xssfDataType.ERROR;
    } else if ("inlineStr".equals(cellType)) {
        nextDataType = xssfDataType.INLINESTR;
    } else if ("s".equals(cellType)) {
        nextDataType = xssfDataType.SSTINDEX;
    } else if ("str".equals(cellType)) {
        nextDataType = xssfDataType.FORMULA;
    } else if (cellStyleStr != null) {
        // It's a number, but almost certainly one with a special style or format.
        XSSFCellStyle style = stylesTable.getStyleAt(Integer.parseInt(cellStyleStr));
        if (style != null) {
            formatIndex = style.getDataFormat();
            formatString = style.getDataFormatString();
            if (formatString == null) {
                formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
            }
        }
    }
}

/**
 * Extracts the zero-based column index from a cell reference such as "C12".
 *
 * <p>A missing reference, or one without a leading letter part followed by digits,
 * previously caused a NullPointerException / StringIndexOutOfBoundsException. Such a
 * cell is now placed in the column after the last one stored for the current row.
 * NOTE(review): that fallback assumes cells appear in column order - confirm.
 */
private int columnOf(String cellReference) {
    if (cellReference != null) {
        int firstDigit = 0;
        while (firstDigit < cellReference.length()
                && !Character.isDigit(cellReference.charAt(firstDigit))) {
            firstDigit++;
        }
        if (firstDigit > 0 && firstDigit < cellReference.length()) {
            return nameToColumn(cellReference.substring(0, firstDigit));
        }
    }
    return lastColumnNumber + 1;
}
/**
 * Handles the end of an element of the worksheet XML.
 *
 * <p>When a value element ("v") closes, the buffered characters are converted according
 * to the data type recorded for the cell and stored either as a header (first row) or in
 * the current tuple. When a row closes, the header count is fixed after the first row and
 * every later non-empty tuple is appended to the data set.
 *
 * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
 */
@Override
public void endElement(String uri, String localName, String name)
        throws SAXException {
    if ("v".equals(name)) {
        // Stop capturing so that text of later elements is not appended to the buffer.
        vIsOpen = false;
        storeCellValue(formatCellValue());
        // Update column
        if (currentColumnNumber > -1) {
            lastColumnNumber = currentColumnNumber;
        }
    } else if ("row".equals(name)) {
        // We're onto a new row
        lastColumnNumber = -1;
        if (isFirstRow) {
            isFirstRow = false;
            quantityOfColumns = data.getHeaders().size();
            tuple = new Tuple(quantityOfColumns);
        } else if (!tuple.isEmpty()) {
            data.addRow(tuple);
            tuple = new Tuple(quantityOfColumns);
        }
    }
}

/**
 * Converts the buffered characters of the current cell into their string form.
 * Done when the value element closes because characters() may be called more than once.
 *
 * @return the cell text; {@code null} if a shared-string index could not be parsed
 */
private String formatCellValue() {
    String raw = value.toString();
    switch (nextDataType) {
        case BOOL:
            // An empty value element used to fail in charAt(0).
            if (raw.length() == 0) {
                return "";
            }
            return raw.charAt(0) == '0' ? "FALSE" : "TRUE";
        case ERROR:
            return "\"ERROR:" + raw + '"';
        case FORMULA:
            // A formula could result in a string value, so always add double-quote characters.
            return '"' + raw + '"';
        case INLINESTR:
            return '"' + new XSSFRichTextString(raw).toString() + '"';
        case SSTINDEX:
            try {
                int idx = Integer.parseInt(raw);
                return new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)).toString();
            } catch (NumberFormatException ex) {
                LOG.warn("Failed to parse SST index '{}'", raw, ex);
                return null;
            }
        case NUMBER:
            if (formatString != null && raw.length() > 0) {
                try {
                    return formatter.formatRawCellContents(Double.parseDouble(raw), formatIndex, formatString);
                } catch (NumberFormatException ex) {
                    // Untyped cells that do not hold a number are kept as raw text
                    // instead of aborting the whole parse.
                    return raw;
                }
            }
            return raw;
        default:
            return "(TODO: Unexpected type: " + nextDataType + ")";
    }
}

/**
 * Stores a cell value as header (first row) or as entry of the current tuple.
 */
private void storeCellValue(String cellValue) {
    if (isFirstRow) {
        // Cells without a value produce no "v" element; pad the gap so that the header
        // position matches the column index used for the data rows.
        while (data.getHeaders().size() < currentColumnNumber) {
            data.getHeaders().add("");
        }
        data.getHeaders().add(cellValue);
        return;
    }
    if (currentColumnNumber < 0 || currentColumnNumber >= tuple.getRowEntries().length) {
        // A data cell outside the header range used to raise ArrayIndexOutOfBoundsException.
        LOG.debug("Ignoring value in column {} outside the header range", currentColumnNumber);
        return;
    }
    tuple.getRowEntries()[currentColumnNumber] = cellValue;
}
/**
 * Appends the reported characters to the value buffer while a value element is open;
 * character data of every other element is ignored.
 */
public void characters(char[] ch, int start, int length)
        throws SAXException {
    if (!vIsOpen) {
        return;
    }
    value.append(ch, start, length);
}
/**
 * Converts an Excel column name like "C" to a zero-based index
 * ("A" gives 0, "Z" gives 25, "AA" gives 26).
 *
 * @param name the column letters of a cell reference
 * @return index corresponding to the specified name; -1 for an empty name
 */
private int nameToColumn(String name) {
    int index = -1;
    for (char letter : name.toCharArray()) {
        // Bijective base-26: shift the digits read so far, then add the current letter.
        index = (index + 1) * 26 + letter - 'A';
    }
    return index;
}
/**
 * @return the data set filled by this handler (headers plus the rows added so far)
 */
public DataSet getData() {
    return this.data;
}
}
/**
* The type of the data value is indicated by an attribute on the cell. The
* value is usually in a "v" element within the cell.
* <p/>
* The handler selects the constant from the cell's "t" attribute when the cell
* element starts and uses it to format the value when the "v" element ends.
*/
enum xssfDataType {
// t="b": rendered as "TRUE" or "FALSE"
BOOL,
// t="e": rendered as a quoted "ERROR:..." string
ERROR,
// t="str": formula result, rendered in double quotes
FORMULA,
// t="inlineStr": rendered in double quotes
INLINESTR,
// t="s": the value is an index into the shared-strings table
SSTINDEX,
// any other or missing "t": formatted with the cell style's number format if one is set
NUMBER,
}
}
Here is the xml example of a working and a not working worksheet:
http://www.file-upload.net/download-10909789/not_working.xml.html
http://www.file-upload.net/download-10909790/working.xml.html
and here the xlsx-files:
http://www.file-upload.net/download-10909802/not_working.xlsx.html
http://www.file-upload.net/download-10909803/working.xlsx.html
Thanks!
The problem was that LibreOffice Calc saves the first worksheet under "rId2", whereas MS Office does so under "rId1". So I'm now going through the sheet IDs until a sheet with content is parsed or no more sheets are found. This works with both files:
/**
 * Parses the worksheets in workbook order until one of them yields data or no
 * sheet is left.
 *
 * <p>The relationship id of the first sheet depends on the producer ("rId2" for files
 * saved by LibreOffice Calc, "rId1" for files saved by MS Excel). Ids are not guaranteed
 * to be contiguous or to point at worksheets at all, so the sheets are taken from
 * {@code XSSFReader.getSheetsData()} instead of guessing "rId" + n and relying on an
 * IllegalArgumentException to end the loop.
 *
 * @param reader reader of the opened workbook package
 */
private void parseFirstWorksheetWithContent(XSSFReader reader) throws IOException, InvalidFormatException, SAXException {
    java.util.Iterator<InputStream> sheets = reader.getSheetsData();
    while (handler.getData().isEmpty() && sheets.hasNext()) {
        // try-with-resources closes the sheet stream even if parsing fails
        try (InputStream sheetStream = sheets.next()) {
            XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
            parser.setContentHandler(handler);
            parser.parse(new InputSource(sheetStream));
        }
    }
}
/**
 * Parses one worksheet, identified by its relationship id, with the shared handler.
 *
 * @param reader  reader of the opened workbook package
 * @param sheetId relationship id of the sheet, e.g. "rId1"
 */
private void parseSheet(XSSFReader reader, String sheetId) throws InvalidFormatException, SAXException, IOException {
    XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
    parser.setContentHandler(handler);
    // The stream used to stay open when parse() threw; try-with-resources always closes it.
    try (InputStream sheetStream = reader.getSheet(sheetId)) {
        parser.parse(new InputSource(sheetStream));
    }
}
Related
I am using PDFBox to do a simple extraction of words from a PDF file and then insert those words into a database table. From what I have tested, text that is rotated 90 degrees clockwise in the PDF gives gibberish results when I try to extract the words.
For example, database in the file will yield atabase and also database itself as two different words. Obviously, atabase does not exist in the PDF file.
I tried converting the original file to be rotated upright and do the extraction and it works perfectly as expected. I understand this could be a limitation of the PDFBox itself.
So, in the case of someone trying to index a rotated PDF file, is there a way to tackle this?
Code snippet ( just for reference) :
String[] lines = text.split("\\r?\\n");
// Prepare the statement once: preparing a new one per line leaked every statement
// except the last, and left nothing valid to close when there were no lines at all.
preparedStatement = con1.prepareStatement(sql);
try {
    for (String line : lines) {
        System.out.println("Line: " + line);
        for (String word : line.split(" ")) {
            // strip one or more special characters at the beginning or the end of the word
            word = word.replaceAll("([\\W]+$)|(^[\\W]+)", "");
            if (word.isEmpty()) {
                // nothing but punctuation/whitespace: not a word, so nothing to index
                continue;
            }
            // insert every word directly to table db
            preparedStatement.setString(1, path1);
            preparedStatement.setString(2, word);
            System.out.println("Token: " + word);
            preparedStatement.executeUpdate();
        }
    }
} finally {
    preparedStatement.close();
}
}
This is the PDFBox ExtractText command line utility, which can detect rotations since 2.0.13 (PDFBOX-4371). (That release had a bug with type 3 fonts, which was fixed (PDFBOX-4390) in the repository and in this code, and is in 2.0.14). Later code may have been improved since then. The current 2.0.* source can be found here.
To extract text from rotated files, use the "rotationMagic" setting. This setting first detects the angle of every glyph, collects these angles (AngleCollector), and in a second pass it does an extraction for every angle while discarding the rest (FilteredTextStripper). The order of extraction is by angle, which may or may not make sense if there are several different angles in a page.
The PDF is modified while extracting, so don't use this on documents you are saving.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tools;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
/**
* This is the main program that simply parses the pdf document and transforms it
* into text.
*
* #author Ben Litchfield
* #author Tilman Hausherr
*/
public final class ExtractText
{
private static final Log LOG = LogFactory.getLog(ExtractText.class);
private static final String PASSWORD = "-password";
private static final String ENCODING = "-encoding";
private static final String CONSOLE = "-console";
private static final String START_PAGE = "-startPage";
private static final String END_PAGE = "-endPage";
private static final String SORT = "-sort";
private static final String IGNORE_BEADS = "-ignoreBeads";
private static final String DEBUG = "-debug";
private static final String HTML = "-html";
private static final String ALWAYSNEXT = "-alwaysNext";
private static final String ROTATION_MAGIC = "-rotationMagic";
private static final String STD_ENCODING = "UTF-8";
/*
* debug flag
*/
private boolean debug = false;
/**
* Private constructor; the only instance is created by {@link #main(String[])}.
*/
private ExtractText()
{
// not meant to be instantiated by other classes
}
/**
 * Command-line entry point: creates an extractor and hands it the arguments.
 *
 * @param args Command line arguments, should be one and a reference to a file.
 *
 * @throws IOException if there is an error reading the document or extracting the text.
 */
public static void main( String[] args ) throws IOException
{
    // suppress the Dock icon on OS X
    System.setProperty("apple.awt.UIElement", "true");

    new ExtractText().startExtraction(args);
}
/**
* Starts the text extraction.
* <p>
* Parses the command line options, loads the PDF, checks the extract permission,
* opens the output (console or file) and then extracts the main document followed
* by any embedded PDF files. Prints the usage text and exits when no input file
* is given.
*
* @param args the commandline arguments.
* @throws IOException if there is an error reading the document or extracting the text.
*/
public void startExtraction( String[] args ) throws IOException
{
// option defaults, overridden by the command line below
boolean toConsole = false;
boolean toHTML = false;
boolean sort = false;
boolean separateBeads = true;
boolean alwaysNext = false;
boolean rotationMagic = false;
String password = "";
String encoding = STD_ENCODING;
String pdfFile = null;
String outputFile = null;
// Defaults to text files
String ext = ".txt";
int startPage = 1;
int endPage = Integer.MAX_VALUE;
// Options that take a value consume the following argument; usage() exits the JVM
// when that value is missing.
for( int i=0; i<args.length; i++ )
{
if( args[i].equals( PASSWORD ) )
{
i++;
if( i >= args.length )
{
usage();
}
password = args[i];
}
else if( args[i].equals( ENCODING ) )
{
i++;
if( i >= args.length )
{
usage();
}
encoding = args[i];
}
else if( args[i].equals( START_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
startPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( HTML ) )
{
toHTML = true;
ext = ".html";
}
else if( args[i].equals( SORT ) )
{
sort = true;
}
else if( args[i].equals( IGNORE_BEADS ) )
{
separateBeads = false;
}
else if (args[i].equals(ALWAYSNEXT))
{
alwaysNext = true;
}
else if (args[i].equals(ROTATION_MAGIC))
{
rotationMagic = true;
}
else if( args[i].equals( DEBUG ) )
{
debug = true;
}
else if( args[i].equals( END_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
endPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( CONSOLE ) )
{
toConsole = true;
}
else
{
// first non-option argument is the input file, a later one the output file
if( pdfFile == null )
{
pdfFile = args[i];
}
else
{
outputFile = args[i];
}
}
}
if( pdfFile == null )
{
usage();
}
else
{
Writer output = null;
PDDocument document = null;
try
{
long startTime = startProcessing("Loading PDF "+pdfFile);
// derive the output name by replacing the last four characters of the input name
// (the ".pdf" extension, presumably) with the output extension
if( outputFile == null && pdfFile.length() >4 )
{
outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
}
document = PDDocument.load(new File( pdfFile ), password);
AccessPermission ap = document.getCurrentAccessPermission();
if( ! ap.canExtractContent() )
{
throw new IOException( "You do not have permission to extract text" );
}
stopProcessing("Time for loading: ", startTime);
if( toConsole )
{
output = new OutputStreamWriter( System.out, encoding );
}
else
{
if (toHTML && !STD_ENCODING.equals(encoding))
{
encoding = STD_ENCODING;
System.out.println("The encoding parameter is ignored when writing html output.");
}
output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
}
startTime = startProcessing("Starting text extraction");
if (debug)
{
System.err.println("Writing to " + outputFile);
}
PDFTextStripper stripper;
if(toHTML)
{
// HTML stripper can't work page by page because of startDocument() callback
stripper = new PDFText2HTML();
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(separateBeads);
stripper.setStartPage(startPage);
stripper.setEndPage(endPage);
// Extract text for main document:
stripper.writeText(document, output);
}
else
{
if (rotationMagic)
{
stripper = new FilteredTextStripper();
}
else
{
stripper = new PDFTextStripper();
}
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(separateBeads);
// Extract text for main document:
extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
stripper, document, output, rotationMagic, alwaysNext);
}
// ... also for any embedded PDFs:
PDDocumentCatalog catalog = document.getDocumentCatalog();
PDDocumentNameDictionary names = catalog.getNames();
if (names != null)
{
PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
if (embeddedFiles != null)
{
Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
if (embeddedFileNames != null)
{
for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
{
if (debug)
{
System.err.println("Processing embedded file " + ent.getKey() + ":");
}
PDComplexFileSpecification spec = ent.getValue();
PDEmbeddedFile file = spec.getEmbeddedFile();
// only embedded files declared as PDF are extracted
if (file != null && "application/pdf".equals(file.getSubtype()))
{
if (debug)
{
System.err.println(" is PDF (size=" + file.getSize() + ")");
}
InputStream fis = file.createInputStream();
PDDocument subDoc = null;
try
{
subDoc = PDDocument.load(fis);
if (toHTML)
{
// will not really work because of HTML header + footer
stripper.writeText( subDoc, output );
}
else
{
extractPages(1, subDoc.getNumberOfPages(),
stripper, subDoc, output, rotationMagic, alwaysNext);
}
}
finally
{
fis.close();
IOUtils.closeQuietly(subDoc);
}
}
}
}
}
}
stopProcessing("Time for extraction: ", startTime);
}
finally
{
// output and document are closed on success and on failure
IOUtils.closeQuietly(output);
IOUtils.closeQuietly(document);
}
}
}
/**
 * Extracts the text of the given page range, one page at a time.
 *
 * @param startPage first page to extract (1 based)
 * @param endPage last page to extract (1 based, inclusive)
 * @param stripper the stripper that writes the text
 * @param document the document to read; temporarily modified when rotationMagic is set
 * @param output the writer that receives the text
 * @param rotationMagic whether to extract the text separately for every glyph angle
 * @param alwaysNext whether to log an IOException and continue with the next page
 * instead of aborting
 * @throws IOException if a page cannot be processed and alwaysNext is false
 */
private void extractPages(int startPage, int endPage,
        PDFTextStripper stripper, PDDocument document, Writer output,
        boolean rotationMagic, boolean alwaysNext) throws IOException
{
    for (int p = startPage; p <= endPage; ++p)
    {
        stripper.setStartPage(p);
        stripper.setEndPage(p);
        try
        {
            if (rotationMagic)
            {
                extractRotatedPage(p, stripper, document, output);
            }
            else
            {
                stripper.writeText(document, output);
            }
        }
        catch (IOException ex)
        {
            if (!alwaysNext)
            {
                throw ex;
            }
            LOG.error("Failed to process page " + p, ex);
        }
    }
}

/**
 * Extracts one page once per glyph angle found on it, rotating the page content so
 * that the glyphs of that angle are upright. The page rotation and the prepended
 * transformation are restored in finally blocks, so a failing page does not leave
 * the document modified when processing continues with the next page.
 */
private void extractRotatedPage(int p, PDFTextStripper stripper, PDDocument document,
        Writer output) throws IOException
{
    PDPage page = document.getPage(p - 1);
    int rotation = page.getRotation();
    page.setRotation(0);
    try
    {
        AngleCollector angleCollector = new AngleCollector();
        angleCollector.setStartPage(p);
        angleCollector.setEndPage(p);
        angleCollector.writeText(document, new NullWriter());
        // rotation magic
        for (int angle : angleCollector.getAngles())
        {
            // prepend a transformation
            // (we could skip these parts for angle 0, but it doesn't matter much)
            PDPageContentStream cs = new PDPageContentStream(document, page,
                    PDPageContentStream.AppendMode.PREPEND, false);
            try
            {
                try
                {
                    cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                }
                finally
                {
                    cs.close();
                }
                stripper.writeText(document, output);
            }
            finally
            {
                // remove prepended transformation
                ((COSArray) page.getCOSObject().getItem(COSName.CONTENTS)).remove(0);
            }
        }
    }
    finally
    {
        page.setRotation(rotation);
    }
}
/**
 * Marks the start of a processing stage; the message is printed to stderr in debug mode.
 *
 * @param message description of the stage that starts
 * @return the current time in milliseconds, to be passed to stopProcessing
 */
private long startProcessing(String message)
{
    if (debug)
    {
        System.err.println(message);
    }
    final long now = System.currentTimeMillis();
    return now;
}
/**
 * Prints the elapsed time of a processing stage to stderr; does nothing unless
 * debug mode is on.
 *
 * @param message prefix of the printed line
 * @param startTime start of the stage in milliseconds, as returned by startProcessing
 */
private void stopProcessing(String message, long startTime)
{
    if (!debug)
    {
        return;
    }
    float elapsedTime = (System.currentTimeMillis() - startTime) / 1000f;
    System.err.println(message + elapsedTime + " seconds");
}
/**
 * Prints the command line help to stderr and terminates the JVM with exit code 1.
 */
private static void usage()
{
    StringBuilder message = new StringBuilder();
    message.append("Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n");
    message.append("\nOptions:\n");
    message.append(" -password <password> : Password to decrypt document\n");
    message.append(" -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE,\n");
    message.append(" UTF-16LE, etc.\n");
    message.append(" -console : Send text to console instead of file\n");
    message.append(" -html : Output in HTML format instead of raw text\n");
    message.append(" -sort : Sort the text before writing\n");
    message.append(" -ignoreBeads : Disables the separation by beads\n");
    message.append(" -debug : Enables debug output about the time consumption\n");
    message.append(" of every stage\n");
    message.append(" -alwaysNext : Process next page (if applicable) despite\n");
    message.append(" IOException (ignored when -html)\n");
    message.append(" -rotationMagic : Analyze each page for rotated/skewed text,\n");
    message.append(" rotate to 0° and extract separately\n");
    message.append(" (slower, and ignored when -html)\n");
    message.append(" -startPage <number> : The first page to start extraction (1 based)\n");
    message.append(" -endPage <number> : The last page to extract (1 based, inclusive)\n");
    message.append(" <inputfile> : The PDF document to use\n");
    message.append(" [output-text-file] : The file to write the text to");

    System.err.println(message.toString());
    System.exit( 1 );
}
}
/**
* Collect all angles while doing text extraction. Angles are in degrees and rounded to the closest
* integer (to avoid slight differences from floating point arithmethic resulting in similarly
* angled glyphs being treated separately). This class must be constructed for each page so that the
* angle set is initialized.
*/
class AngleCollector extends PDFTextStripper
{
private final Set<Integer> angles = new TreeSet<Integer>();
AngleCollector() throws IOException
{
}
Set<Integer> getAngles()
{
return angles;
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
angle = (angle + 360) % 360;
angles.add(angle);
}
}
/**
* TextStripper that only processes glyphs that have angle 0.
*/
class FilteredTextStripper extends PDFTextStripper
{
FilteredTextStripper() throws IOException
{
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
if (angle == 0)
{
super.processTextPosition(text);
}
}
}
/**
 * Dummy output: a {@link Writer} that discards everything written to it.
 */
class NullWriter extends Writer
{
    /** Discards the characters. */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException
    {
        // do nothing
    }

    /** Nothing is buffered, so there is nothing to flush. */
    @Override
    public void flush() throws IOException
    {
        // do nothing
    }

    /** Holds no resources, so there is nothing to close. */
    @Override
    public void close() throws IOException
    {
        // do nothing
    }
}
I am using Apache POI to read Excel rows and use them as needed. To make the script more reusable, how can I search for a String value in column A of all sheets and read the corresponding row? For example, in Sheet2 column A contains the name Peter and column B contains Peter's date of birth, 12/18/1984. Can you give a code sample that searches for Peter in column A of the Excel workbook and returns his date of birth from column B? Below is the code I am currently using; it may not suit the above criteria.
package com.Sample.GenericFunctionsLibrary;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
public class TestUtil {
public static Xls_Reader excel = null;
public static String path = "./XLFile/Data.xlsx";
public static String mailscreenshotpath;
/**
 * Builds a timestamp of the current time in the form
 * year_dayOfMonth_month_hour_minute_second (month 1-based, hour 0-23, no zero padding).
 *
 * @return the timestamp string
 */
public static String generateTimeStamp() {
    Calendar now = new GregorianCalendar();
    StringBuilder stamp = new StringBuilder();
    stamp.append(now.get(Calendar.YEAR)).append("_");
    stamp.append(now.get(Calendar.DATE)).append("_");
    // Calendar.MONTH is zero-based
    stamp.append(now.get(Calendar.MONTH) + 1).append("_");
    stamp.append(now.get(Calendar.HOUR_OF_DAY)).append("_");
    stamp.append(now.get(Calendar.MINUTE)).append("_");
    stamp.append(now.get(Calendar.SECOND));
    return stamp.toString();
}
/**
 * Looks up the test case in the "Credentials" sheet (rows 2 and following) and reports
 * whether its "runmode" cell is "Y" (case-insensitive).
 *
 * @param tcid value to match against the "TestCase_Name" column
 * @return true if the first matching row has runmode "Y"; false otherwise or if no row matches
 */
public static boolean isExecutable(String tcid) {
    for (int row = 2; row <= excel.getRowCount("Credentials"); row++) {
        if (!excel.getCellData("Credentials", "TestCase_Name", row).equals(tcid)) {
            continue;
        }
        // only the first matching row decides
        return excel.getCellData("Credentials", "runmode", row).equalsIgnoreCase("Y");
    }
    return false;
}
/**
 * Reads all rows of a sheet except the first one into a two-dimensional array.
 *
 * @param sheetName name of the sheet to read
 * @return one array per row from row 2 on, one element per column; an empty array if
 *         the sheet has fewer than two rows
 */
public static Object[][] getData(String sheetName) {
    int rows = excel.getRowCount(sheetName);
    int cols = excel.getColumnCount(sheetName);
    // rows - 1 became negative (NegativeArraySizeException) when no row was reported;
    // clamp both dimensions in case a missing sheet is reported with 0 or a negative count.
    Object[][] data = new Object[Math.max(rows - 1, 0)][Math.max(cols, 0)];
    for (int rowNum = 2; rowNum <= rows; rowNum++) {
        for (int colNum = 0; colNum < cols; colNum++) {
            // row 2 of the sheet is stored at index 0
            data[rowNum - 2][colNum] = excel.getCellData(sheetName, colNum, rowNum);
        }
    }
    return data;
}
/**
 * Writes the regular files directly inside the given folder into "Reports.zip" in the
 * current working directory. Errors are printed and not propagated (best effort).
 *
 * @param filepath path of the folder whose files are archived
 */
public static void zip(String filepath) {
    File inFolder = new File(filepath);
    File outFile = new File("Reports.zip");
    File[] entries = inFolder.listFiles();
    if (entries == null) {
        // not a directory, or not readable; this used to surface as a NullPointerException
        System.err.println("Cannot zip " + filepath + ": not a readable directory");
        return;
    }
    ZipOutputStream out = null;
    try {
        out = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(outFile)));
        byte[] buffer = new byte[1000];
        for (File entry : entries) {
            // Sub-directories cannot be opened as a stream and used to abort the whole
            // archive; the archive itself must not be added to itself either.
            if (!entry.isFile() || entry.getAbsoluteFile().equals(outFile.getAbsoluteFile())) {
                continue;
            }
            BufferedInputStream in = new BufferedInputStream(new FileInputStream(entry), 1000);
            try {
                out.putNextEntry(new ZipEntry(entry.getName()));
                int count;
                while ((count = in.read(buffer, 0, buffer.length)) != -1) {
                    out.write(buffer, 0, count);
                }
                out.closeEntry();
            } finally {
                // every input stream used to stay open
                in.close();
            }
        }
        out.flush();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
// --------------------------------------Read Data From
// Excel------------------------------------
public static String[][] GetValue(String Pathfile, String sheetName, int startrow) throws IOException {
File excel = new File(Pathfile);
FileInputStream fis = new FileInputStream(excel);
#SuppressWarnings("resource")
XSSFWorkbook wb = new XSSFWorkbook(fis);
XSSFSheet ws = wb.getSheet(sheetName);
// System.out.println(startrow);
int colNum = ws.getRow(startrow).getLastCellNum();
// System.out.println(colNum);
String[][] arrays = new String[1][colNum];
for (int i = 0; i < colNum; i++) {
XSSFRow row = ws.getRow(startrow);
XSSFCell cell = row.getCell(i);
arrays[0][i] = cellToString(cell);
// System.out.println(arrays[0][i]);
}
return arrays;
}
/**
 * Converts a cell to its string form.
 *
 * @param cell the cell to convert; may be null for a cell that does not exist
 * @return the formatted number for numeric cells, the text for string cells and an
 *         empty string for blank or missing cells
 * @throws RuntimeException for every other cell type
 */
private static String cellToString(XSSFCell cell) {
    if (cell == null) {
        // A missing cell used to be detected by catching a NullPointerException and was
        // then mapped to type 2, which is none of the handled types and made the method
        // throw; it is now treated like a blank cell.
        return "";
    }
    switch (cell.getCellType()) {
        case Cell.CELL_TYPE_NUMERIC:
            // formats the number the way the cell style displays it
            return new DataFormatter().formatCellValue(cell);
        case Cell.CELL_TYPE_STRING:
            return cell.getStringCellValue();
        case Cell.CELL_TYPE_BLANK:
            return "";
        default:
            throw new RuntimeException("there are no support for this type of cell");
    }
}
}
This method will take a String value for the name to search and return the address for the first record found in the column next to it, assuming that the name is in the first column and the address is in the second column. It will iterate over all sheets as asked. It returns empty String if name is not found. Try/catch block excluded for readability.
/**
 * Searches the first column of every sheet for the given name and returns the raw value
 * of the cell next to it.
 *
 * @param nameToSearch the text to look for in the first column
 * @return the raw value of the second column of the first matching row; an empty string
 *         if the name is not found or the matching row has no second cell
 * @throws IllegalStateException if the workbook cannot be opened
 */
public static String findAddressByName(String nameToSearch) {
    String fileLocation = "I:\\foo.xlsx";
    XSSFWorkbook wb;
    try {
        wb = new XSSFWorkbook(new File(fileLocation));
    } catch (Exception e) {
        // The constructor declares checked exceptions this method's signature does not
        // allow; they are wrapped so the cause is preserved.
        throw new IllegalStateException("Unable to open workbook " + fileLocation, e);
    }
    try {
        for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
            XSSFSheet sheet = wb.getSheetAt(sheetIndex);
            // getLastRowNum() is the zero-based index of the last row, so it must be
            // included; "<" skipped the last row of every sheet.
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                XSSFRow row = sheet.getRow(rowIndex);
                if (row == null || row.getCell(0) == null) {
                    continue;
                }
                // toString() works for every cell type, whereas getStringCellValue()
                // throws for numeric cells and would abort the search.
                if (row.getCell(0).toString().equals(nameToSearch)) {
                    return row.getCell(1) == null ? "" : row.getCell(1).getRawValue();
                }
            }
        }
        return "";
    } finally {
        // Release the file without writing anything back to it.
        wb.getPackage().revert();
    }
}
I am using PDFBox to do a simple extraction of words from a PDF file and then insert those words into a database table. From what I have tested, text that is rotated 90 degrees clockwise in the PDF gives gibberish results when I try to extract the words.
For example, database in the file will yield atabase and also database itself as two different words. Obviously, atabase does not exist in the PDF file.
I tried converting the original file to be rotated upright and do the extraction and it works perfectly as expected. I understand this could be a limitation of the PDFBox itself.
So, in the case of someone trying to index a rotated PDF file, is there a way to tackle this?
Code snippet ( just for reference) :
String[] lines = text.split("\\r?\\n");
// Prepare the statement once: preparing a new one per line leaked every statement
// except the last, and left nothing valid to close when there were no lines at all.
preparedStatement = con1.prepareStatement(sql);
try {
    for (String line : lines) {
        System.out.println("Line: " + line);
        for (String word : line.split(" ")) {
            // strip one or more special characters at the beginning or the end of the word
            word = word.replaceAll("([\\W]+$)|(^[\\W]+)", "");
            if (word.isEmpty()) {
                // nothing but punctuation/whitespace: not a word, so nothing to index
                continue;
            }
            // insert every word directly to table db
            preparedStatement.setString(1, path1);
            preparedStatement.setString(2, word);
            System.out.println("Token: " + word);
            preparedStatement.executeUpdate();
        }
    }
} finally {
    preparedStatement.close();
}
}
This is the PDFBox ExtractText command line utility, which can detect rotations since 2.0.13 (PDFBOX-4371). (That release had a bug with type 3 fonts, which was fixed (PDFBOX-4390) in the repository and in this code, and is in 2.0.14). Later code may have been improved since then. The current 2.0.* source can be found here.
To extract text from rotated files, use the "rotationMagic" setting. This setting first detects the angle of every glyph, collects these angles (AngleCollector), and in a second pass it does an extraction for every angle while discarding the rest (FilteredTextStripper). The order of extraction is by angle, which may or may not make sense if there are several different angles in a page.
The PDF is modified while extracting, so don't use this on documents you are saving.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tools;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
/**
 * This is the main program that simply parses the pdf document and transforms it
 * into text.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public final class ExtractText
{
    private static final Log LOG = LogFactory.getLog(ExtractText.class);

    // command line switches
    private static final String PASSWORD = "-password";
    private static final String ENCODING = "-encoding";
    private static final String CONSOLE = "-console";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String SORT = "-sort";
    private static final String IGNORE_BEADS = "-ignoreBeads";
    private static final String DEBUG = "-debug";
    private static final String HTML = "-html";
    private static final String ALWAYSNEXT = "-alwaysNext";
    private static final String ROTATION_MAGIC = "-rotationMagic";

    private static final String STD_ENCODING = "UTF-8";

    /*
     * debug flag
     */
    private boolean debug = false;

    /**
     * private constructor.
     */
    private ExtractText()
    {
        //static class
    }

    /**
     * Infamous main method.
     *
     * @param args Command line arguments, should be one and a reference to a file.
     *
     * @throws IOException if there is an error reading the document or extracting the text.
     */
    public static void main( String[] args ) throws IOException
    {
        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        ExtractText extractor = new ExtractText();
        extractor.startExtraction(args);
    }

    /**
     * Starts the text extraction: parses the options, loads the document, writes the text of
     * the main document and then of every embedded PDF file to the chosen output.
     *
     * @param args the commandline arguments.
     * @throws IOException if there is an error reading the document or extracting the text.
     */
    public void startExtraction( String[] args ) throws IOException
    {
        boolean toConsole = false;
        boolean toHTML = false;
        boolean sort = false;
        boolean separateBeads = true;
        boolean alwaysNext = false;
        boolean rotationMagic = false;
        String password = "";
        String encoding = STD_ENCODING;
        String pdfFile = null;
        String outputFile = null;
        // Defaults to text files
        String ext = ".txt";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        for( int i=0; i<args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                password = args[i];
            }
            else if( args[i].equals( ENCODING ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                encoding = args[i];
            }
            else if( args[i].equals( START_PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                startPage = Integer.parseInt( args[i] );
            }
            else if( args[i].equals( HTML ) )
            {
                toHTML = true;
                ext = ".html";
            }
            else if( args[i].equals( SORT ) )
            {
                sort = true;
            }
            else if( args[i].equals( IGNORE_BEADS ) )
            {
                separateBeads = false;
            }
            else if (args[i].equals(ALWAYSNEXT))
            {
                alwaysNext = true;
            }
            else if (args[i].equals(ROTATION_MAGIC))
            {
                rotationMagic = true;
            }
            else if( args[i].equals( DEBUG ) )
            {
                debug = true;
            }
            else if( args[i].equals( END_PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                endPage = Integer.parseInt( args[i] );
            }
            else if( args[i].equals( CONSOLE ) )
            {
                toConsole = true;
            }
            else
            {
                // first free argument is the input file, any later one the output file
                if( pdfFile == null )
                {
                    pdfFile = args[i];
                }
                else
                {
                    outputFile = args[i];
                }
            }
        }

        if( pdfFile == null )
        {
            usage();
        }
        else
        {
            Writer output = null;
            PDDocument document = null;
            try
            {
                long startTime = startProcessing("Loading PDF "+pdfFile);
                if( outputFile == null )
                {
                    // Replace a 4-character extension such as ".pdf" when the name is long
                    // enough to carry one; shorter names just get the extension appended so
                    // that the output file name is never left null.
                    String base = pdfFile.length() > 4
                            ? pdfFile.substring( 0, pdfFile.length() -4 )
                            : pdfFile;
                    outputFile = new File( base + ext ).getAbsolutePath();
                }
                document = PDDocument.load(new File( pdfFile ), password);

                AccessPermission ap = document.getCurrentAccessPermission();
                if( ! ap.canExtractContent() )
                {
                    throw new IOException( "You do not have permission to extract text" );
                }

                stopProcessing("Time for loading: ", startTime);

                if( toConsole )
                {
                    output = new OutputStreamWriter( System.out, encoding );
                }
                else
                {
                    if (toHTML && !STD_ENCODING.equals(encoding))
                    {
                        encoding = STD_ENCODING;
                        System.out.println("The encoding parameter is ignored when writing html output.");
                    }
                    output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
                }
                startTime = startProcessing("Starting text extraction");
                if (debug)
                {
                    System.err.println("Writing to " + outputFile);
                }

                PDFTextStripper stripper;
                if(toHTML)
                {
                    // HTML stripper can't work page by page because of startDocument() callback
                    stripper = new PDFText2HTML();
                    stripper.setSortByPosition(sort);
                    stripper.setShouldSeparateByBeads(separateBeads);
                    stripper.setStartPage(startPage);
                    stripper.setEndPage(endPage);

                    // Extract text for main document:
                    stripper.writeText(document, output);
                }
                else
                {
                    if (rotationMagic)
                    {
                        stripper = new FilteredTextStripper();
                    }
                    else
                    {
                        stripper = new PDFTextStripper();
                    }
                    stripper.setSortByPosition(sort);
                    stripper.setShouldSeparateByBeads(separateBeads);

                    // Extract text for main document:
                    extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
                            stripper, document, output, rotationMagic, alwaysNext);
                }

                // ... also for any embedded PDFs:
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentNameDictionary names = catalog.getNames();
                if (names != null)
                {
                    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                    if (embeddedFiles != null)
                    {
                        Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
                        if (embeddedFileNames != null)
                        {
                            for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
                            {
                                if (debug)
                                {
                                    System.err.println("Processing embedded file " + ent.getKey() + ":");
                                }
                                PDComplexFileSpecification spec = ent.getValue();
                                PDEmbeddedFile file = spec.getEmbeddedFile();
                                if (file != null && "application/pdf".equals(file.getSubtype()))
                                {
                                    if (debug)
                                    {
                                        System.err.println(" is PDF (size=" + file.getSize() + ")");
                                    }
                                    InputStream fis = file.createInputStream();
                                    PDDocument subDoc = null;
                                    try
                                    {
                                        subDoc = PDDocument.load(fis);
                                        if (toHTML)
                                        {
                                            // will not really work because of HTML header + footer
                                            stripper.writeText( subDoc, output );
                                        }
                                        else
                                        {
                                            extractPages(1, subDoc.getNumberOfPages(),
                                                    stripper, subDoc, output, rotationMagic, alwaysNext);
                                        }
                                    }
                                    finally
                                    {
                                        fis.close();
                                        IOUtils.closeQuietly(subDoc);
                                    }
                                }
                            }
                        }
                    }
                }
                stopProcessing("Time for extraction: ", startTime);
            }
            finally
            {
                IOUtils.closeQuietly(output);
                IOUtils.closeQuietly(document);
            }
        }
    }

    /**
     * Extracts the given page range one page at a time. With {@code rotationMagic} the page is
     * temporarily modified (rotation reset, a rotation transform prepended per detected angle);
     * these modifications are undone even when extraction fails, so a page skipped through
     * {@code alwaysNext} is not left altered.
     *
     * @param startPage first page to extract (1 based)
     * @param endPage last page to extract (1 based, inclusive)
     * @param stripper the stripper that writes the text
     * @param document the document to read from
     * @param output where the text is written
     * @param rotationMagic whether to extract separately for every glyph angle found on a page
     * @param alwaysNext whether to log an IOException and continue with the next page
     * @throws IOException if extraction fails and {@code alwaysNext} is false
     */
    private void extractPages(int startPage, int endPage,
            PDFTextStripper stripper, PDDocument document, Writer output,
            boolean rotationMagic, boolean alwaysNext) throws IOException
    {
        for (int p = startPage; p <= endPage; ++p)
        {
            stripper.setStartPage(p);
            stripper.setEndPage(p);
            try
            {
                if (rotationMagic)
                {
                    PDPage page = document.getPage(p - 1);
                    int rotation = page.getRotation();
                    page.setRotation(0);
                    try
                    {
                        AngleCollector angleCollector = new AngleCollector();
                        angleCollector.setStartPage(p);
                        angleCollector.setEndPage(p);
                        angleCollector.writeText(document, new NullWriter());
                        // rotation magic
                        for (int angle : angleCollector.getAngles())
                        {
                            // prepend a transformation
                            // (we could skip these parts for angle 0, but it doesn't matter much)
                            PDPageContentStream cs = new PDPageContentStream(document, page,
                                    PDPageContentStream.AppendMode.PREPEND, false);
                            try
                            {
                                try
                                {
                                    cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                                }
                                finally
                                {
                                    cs.close();
                                }
                                stripper.writeText(document, output);
                            }
                            finally
                            {
                                // remove prepended transformation
                                ((COSArray) page.getCOSObject().getItem(COSName.CONTENTS)).remove(0);
                            }
                        }
                    }
                    finally
                    {
                        page.setRotation(rotation);
                    }
                }
                else
                {
                    stripper.writeText(document, output);
                }
            }
            catch (IOException ex)
            {
                if (!alwaysNext)
                {
                    throw ex;
                }
                LOG.error("Failed to process page " + p, ex);
            }
        }
    }

    /**
     * Prints the message to stderr when debugging and returns the current time.
     *
     * @param message the message to print in debug mode
     * @return the current time in milliseconds
     */
    private long startProcessing(String message)
    {
        if (debug)
        {
            System.err.println(message);
        }
        return System.currentTimeMillis();
    }

    /**
     * Prints the message followed by the seconds elapsed since {@code startTime} when debugging.
     *
     * @param message the prefix of the debug output
     * @param startTime the start time in milliseconds, as returned by startProcessing
     */
    private void stopProcessing(String message, long startTime)
    {
        if (debug)
        {
            long stopTime = System.currentTimeMillis();
            float elapsedTime = ((float)(stopTime - startTime))/1000;
            System.err.println(message + elapsedTime + " seconds");
        }
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        String message = "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n"
                + "\nOptions:\n"
                + " -password <password> : Password to decrypt document\n"
                + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE,\n"
                + " UTF-16LE, etc.\n"
                + " -console : Send text to console instead of file\n"
                + " -html : Output in HTML format instead of raw text\n"
                + " -sort : Sort the text before writing\n"
                + " -ignoreBeads : Disables the separation by beads\n"
                + " -debug : Enables debug output about the time consumption\n"
                + " of every stage\n"
                + " -alwaysNext : Process next page (if applicable) despite\n"
                + " IOException (ignored when -html)\n"
                + " -rotationMagic : Analyze each page for rotated/skewed text,\n"
                + " rotate to 0° and extract separately\n"
                + " (slower, and ignored when -html)\n"
                + " -startPage <number> : The first page to start extraction (1 based)\n"
                + " -endPage <number> : The last page to extract (1 based, inclusive)\n"
                + " <inputfile> : The PDF document to use\n"
                + " [output-text-file] : The file to write the text to";

        System.err.println(message);
        System.exit( 1 );
    }
}
/**
* Collect all angles while doing text extraction. Angles are in degrees and rounded to the closest
* integer (to avoid slight differences from floating point arithmethic resulting in similarly
* angled glyphs being treated separately). This class must be constructed for each page so that the
* angle set is initialized.
*/
class AngleCollector extends PDFTextStripper
{
private final Set<Integer> angles = new TreeSet<Integer>();
AngleCollector() throws IOException
{
}
Set<Integer> getAngles()
{
return angles;
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
angle = (angle + 360) % 360;
angles.add(angle);
}
}
/**
* TextStripper that only processes glyphs that have angle 0.
*/
class FilteredTextStripper extends PDFTextStripper
{
FilteredTextStripper() throws IOException
{
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
if (angle == 0)
{
super.processTextPosition(text);
}
}
}
/**
 * Dummy output: a {@link Writer} that discards everything written to it.
 */
class NullWriter extends Writer
{
    /**
     * Discards the given characters.
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException
    {
        // do nothing
    }

    /**
     * Nothing is buffered, so there is nothing to flush.
     */
    @Override
    public void flush() throws IOException
    {
        // do nothing
    }

    /**
     * Holds no resources, so closing has no effect.
     */
    @Override
    public void close() throws IOException
    {
        // do nothing
    }
}
I'm trying to do two things for this application:one when the class is called upon, it will run from top to bottom and two, I can call the public method "xlxTOcsvConverter" and supply the three needed parameters for future use...that said, if the class is run, 4 files will be input, and 4 files will be written. The first three are working beautifully, but the last one, the one with the comment, seems to fail with the given error. Now the issue I have is that the data in, is maintained by an outside company and we have read privileges only, so all I can do is ask them to maintain a set cell formatting, but that may not stick. What I am noticing for the cell that is failing, is that it is formatted as a date, but is empty. All the other empty cells are general formats; it is literally the only cell around it formatted as such. I'm guessing this is why it failing. Am I missing some other if for the cell_types?
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package datarefresh;
import java.io.FileInputStream;
import java.io.FileWriter;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
/**
 * Exports named sheets of Excel workbooks to semicolon-delimited text files. Running the class
 * converts four fixed workbooks; {@link #xlxTOcsvConverter(String, String, String)} can also be
 * called directly for any other workbook.
 *
 * @author null
 */
public class DataRefresh {
    /** OS-specific line separator written after every exported row. */
    public static String EOL = System.getProperty("line.separator");

    /**
     * Converts the four configured workbooks.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        String StrInfoXLS = "\\\\path\\Dashboard.xls";
        String POSPwXLS = "\\\\path\\POS Passwords.xls";
        String NetPwXLS = "\\\\path\\Dashboard Data.xls";
        String SnLPwXLS = "\\\\path\\AccessVia ID & Passcodes - Helpdesk.xls";
        String StrInfoCSV = "\\\\path\\StoreInfoTst.csv";
        String POSPwCSV = "\\\\path\\POS PasswordsTst.csv";
        String NetPwCSV = "\\\\path\\Dashboard DataTst.csv";
        String SnLPwCSV = "\\\\path\\AccessVia ID & Passcodes - HelpdeskTst.csv";
        String StrInfoSht = "Store List";
        String POSPwSht = "Sheet1";
        String NetPwSht = "Network";
        String SnLPwSht = "S & L Store List w Passcode";
        xlxTOcsvConverter(StrInfoXLS,StrInfoSht,StrInfoCSV);
        xlxTOcsvConverter(POSPwXLS,POSPwSht,POSPwCSV);
        xlxTOcsvConverter(NetPwXLS,NetPwSht,NetPwCSV);
        xlxTOcsvConverter(SnLPwXLS,SnLPwSht,SnLPwCSV);
    }

    /**
     * Writes every row of one sheet to a text file, each cell followed by ";" and each row
     * followed by {@link #EOL}. Rows that are not stored in the workbook (completely empty
     * rows) produce an empty line. Any failure is printed to standard output and the
     * conversion is abandoned.
     *
     * @param inFile path of the workbook to read
     * @param inSheet name of the sheet to export
     * @param outFile path of the text file to write
     */
    public static void xlxTOcsvConverter(String inFile, String inSheet, String outFile) {
        DataRefresh converter = new DataRefresh();
        // try-with-resources closes both files even when the conversion fails half way
        try (FileWriter output = new FileWriter(outFile);
                FileInputStream input = new FileInputStream(inFile)) {
            Workbook wb = WorkbookFactory.create(input);
            Sheet sheet = wb.getSheet(inSheet);
            if (sheet == null) {
                throw new IllegalArgumentException(
                        "Sheet '" + inSheet + "' not found in " + inFile);
            }
            // getLastRowNum() is the zero-based index of the last row, so it is included
            for (int i = 0; i <= sheet.getLastRowNum(); i++) {
                Row row = sheet.getRow(i);
                // getRow() returns null for rows without any cells
                if (row != null) {
                    for (int j = 0; j < row.getLastCellNum(); j++) {
                        output.write(converter.getCellValue(row.getCell(j)) + ";");
                    }
                }
                output.write(EOL);
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Returns the text to export for a cell. Formula cells are exported by their cached result.
     *
     * @param cell the cell to read, may be null
     * @return the string value, the date or the number rounded to zero decimals; an empty
     *         string for null, blank and all other cell types
     */
    private String getCellValue(Cell cell) {
        if (cell == null) {
            return "";
        }
        int type = cell.getCellType();
        if (type == Cell.CELL_TYPE_FORMULA) {
            type = cell.getCachedFormulaResultType();
        }
        switch (type) {
            case Cell.CELL_TYPE_STRING:
                return cell.getStringCellValue();
            case Cell.CELL_TYPE_NUMERIC:
                return formatNumeric(cell);
            default:
                return "";
        }
    }

    /** Formats a numeric cell as a date when it is date formatted, else without decimals. */
    private static String formatNumeric(Cell cell) {
        if (DateUtil.isCellDateFormatted(cell)) {
            return String.valueOf(cell.getDateCellValue());
        }
        return String.format("%.0f", cell.getNumericCellValue());
    }
}
This is the stacktrace for it:
Exception in thread "main" java.lang.NullPointerException
at jdk.nashorn.internal.runtime.ECMAErrors.typeError(ECMAErrors.java:214)
at jdk.nashorn.internal.runtime.ECMAErrors.typeError(ECMAErrors.java:186)
at jdk.nashorn.internal.runtime.ECMAErrors.typeError(ECMAErrors.java:173)
at jdk.nashorn.internal.objects.Global.checkObject(Global.java:1500)
at jdk.nashorn.internal.objects.NativeError.printStackTrace(NativeError.java:187)
at datarefresh.DataRefresh.xlxTOcsvConverter(DataRefresh.java:68)
at datarefresh.DataRefresh.main(DataRefresh.java:45)
Java Result: 1
BUILD SUCCESSFUL (total time: 5 seconds)
I changed the catch to:
catch(IOException | InvalidFormatException e){
printStackTrace(e);
}
Now the stacktrace returns:
Exception in thread "main" java.lang.NullPointerException
at datarefresh.DataRefresh.xlxTOcsvConverter(DataRefresh.java:60)
at datarefresh.DataRefresh.main(DataRefresh.java:47)
Java Result: 1
If there are totally empty rows between row 0 and row sheet.getLastRowNum() then the row will be NULL after row = sheet.getRow(i). This is because totally empty rows are not physically stored within the XLS file.
This you should consider. Example:
/**
 * Writes every row of one sheet to a text file with ";" between the cells and EOL after each
 * row. Rows that are not stored in the workbook are written as the text "empty row".
 * Failures are reported through printStackTrace().
 *
 * @param inFile path of the workbook to read
 * @param inSheet name of the sheet to export
 * @param outFile path of the text file to write
 */
public static void xlxTOcsvConverter(String inFile, String inSheet, String outFile) {
    DataRefresh cellReader = new DataRefresh();
    // try-with-resources closes the writer and the input stream on every exit path
    try (FileWriter output = new FileWriter(outFile);
            FileInputStream input = new FileInputStream(inFile)) {
        Workbook wb = WorkbookFactory.create(input);
        Sheet sheet = wb.getSheet(inSheet);
        for (int i = 0; i <= sheet.getLastRowNum(); i++) {
            Row row = sheet.getRow(i);
            if (row == null) {
                // totally empty rows are not physically stored, so getRow() returns null
                output.write("empty row");
            } else {
                int lastCell = row.getLastCellNum();
                for (int j = 0; j < lastCell; j++) {
                    String data = cellReader.getCellValue(row.getCell(j));
                    // no delimiter after the last cell of the row
                    output.write(data + ((j < lastCell - 1) ? ";" : ""));
                }
            }
            output.write(EOL);
        }
    }
    catch(Exception e) {
        e.printStackTrace();
    }
}
I am trying to find out the easiest manner to export data from a JTable into an excel format like CSV or XLSX etc.
The problem is the data is dynamic, so basically the user can run a dynamic query and he/she would require to export that data.
The simplest way would be to read all the rows from the JTable and create an Excel file using a library such as Apache POI or JExcelApi.
You can get the table data using below method which you can store in list or something :
table.getModel().getValueAt(rowIndex, columnIndex);
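Now, to create the Excel file, you can use the code below. Note that the classes it uses (WorkbookSettings, WritableWorkbook, WritableSheet, Label) come from the JExcelApi (jxl) library, not from Apache POI: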
// Template, not compilable as written: <columncount>, <columnname>, <rowcount> and <rowvalue>
// are placeholders to fill in. dir, fileName and cf are not defined in this snippet
// (cf is presumably a cell format created elsewhere — TODO confirm).
WorkbookSettings ws = new WorkbookSettings();
ws.setLocale(new Locale("en", "EN"));
WritableWorkbook workbook = null;
File newFile = new File(dir.getPath() + "\\" + fileName);
workbook = Workbook.createWorkbook(newFile, ws);
WritableSheet s = workbook.createSheet("mySheet", 0);
// Header: one label per column, all with second argument 0
// (presumably Label(column, row, text, format) — TODO confirm against the jxl API).
for (int i = 1; i <= <columncount>; ++i) {
Label l = new Label(i - 1, 0, <columnname>, cf);
s.addCell(l);
}
// Data: j starts at 1, so these labels are placed after the header labels at index 0.
for (int j = 1; j <= <rowcount>; j++) {
for (int i = 1; i <= <columncount>; i++) {
Label m = new Label(i - 1, j, <rowvalue>, cf);
s.addCell(m);
}
}
workbook.write();
workbook.close();
Here's a class I use to do the trick. It exports the JTable exactly as is presented to the user in the view (including the order of rows and columns). It can also extract text directly from JLabel custom renderers if you use them. I made some small changes for the post without compiling, so tell me if there are issues...
Note that this is a duplicate of questions
How to export a JTable to a .csv file?
and
How to export data from JTable to CSV
although the answers contain only pseudocode.
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import javax.swing.JLabel;
import javax.swing.JTable;
/**
 * Exports a {@link JTable} to a delimited text file exactly as it is presented in the view,
 * i.e. in the current row and column order and containing only the visible columns.
 */
public class JTableCSVExporter {
    protected char columnDelimiter = ',';
    protected char rowDelimiter = '\n';
    protected char quote = '"';
    protected boolean useRenderers = true;
    protected boolean translateBools = true;

    public char getColumnDelimiter() {
        return columnDelimiter;
    }

    public void setColumnDelimiter(char columnDelimiter) {
        this.columnDelimiter = columnDelimiter;
    }

    public char getRowDelimiter() {
        return rowDelimiter;
    }

    public void setRowDelimiter(char rowDelimiter) {
        this.rowDelimiter = rowDelimiter;
    }

    public char getQuote() {
        return quote;
    }

    public void setQuote(char quote) {
        this.quote = quote;
    }

    public boolean isUseRenderers() {
        return useRenderers;
    }

    /**
     * If true, the value provided by table renderers (if they are JLabel) is used as value for CSV.
     * If false, or the renderer is not JLabel, the class uses toString of the cell value.
     */
    public void setUseRenderers(boolean useRenderers) {
        this.useRenderers = useRenderers;
    }

    public boolean isTranslateBools() {
        return translateBools;
    }

    /**
     * If true, bools are translated to "yes" and "no". Otherwise toString() is used.
     */
    public void setTranslateBools(boolean translateBools) {
        this.translateBools = translateBools;
    }

    /**
     * Exports table to file. The first line holds the unquoted column names; every following
     * line holds one table row with each value quoted. Every value, including the last one of
     * a line, is followed by the column delimiter.
     *
     * @param table the table to export
     * @param file the file to (over)write
     * @throws IOException if the file cannot be written
     */
    public void exportCSV(JTable table, File file) throws IOException {
        // try-with-resources closes the file even when writing fails
        try (FileWriter out = new FileWriter(file)) {
            // getColumnCount(), getColumnName() and getValueAt() all work on view indices,
            // so iterating them directly yields the columns as the user sees them.
            int columnCount = table.getColumnCount();
            for (int column = 0; column < columnCount; column++) {
                out.write(table.getColumnName(column) + columnDelimiter);
            }
            out.write(rowDelimiter);

            String quoteText = Character.toString(quote);
            for (int row = 0; row < table.getRowCount(); row++) {
                for (int column = 0; column < columnCount; column++) {
                    String escaped = cellText(table, row, column).replace(quoteText, "\\" + quote);
                    out.write(quoteText + escaped + quote + columnDelimiter);
                }
                out.write(rowDelimiter);
            }
        }
    }

    /** Returns the text to export for one cell, addressed by view row and column. */
    private String cellText(JTable table, int row, int column) {
        Object value = table.getValueAt(row, column);
        if (isUseRenderers()) {
            java.awt.Component rendered = table.getCellRenderer(row, column)
                    .getTableCellRendererComponent(table, value, false, false, row, column);
            if (rendered instanceof JLabel) {
                String text = ((JLabel) rendered).getText();
                // a label without text is exported as an empty value
                return text == null ? "" : text;
            }
        }
        if (isTranslateBools() && value instanceof Boolean) {
            return value.equals(Boolean.TRUE) ? "yes" : "no";
        }
        return value == null ? "" : value.toString();
    }
}