How to read specific columns using Apache POI Event API? - java

I want to read large xls or xlsx file i.e. 70 MB file having 1,50,000+ rows.
I am able to load the entire excel file using below code, but I would like to fetch specific columns only like column C only or some other column.
import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class testclass {
public void processOneSheet(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
// To look up the Sheet Name / Sheet Order / rID,
// you need to process the core Workbook stream.
// Normally it's of the form rId# or rSheet#
InputStream sheet2 = r.getSheet("rId2");
InputSource sheetSource = new InputSource(sheet2);
parser.parse(sheetSource);
sheet2.close();
}
public void processAllSheets(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
while(sheets.hasNext()) {
System.out.println("Processing new sheet:\n");
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
System.out.println("");
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
ContentHandler handler = (ContentHandler) new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
/**
* See org.xml.sax.helpers.DefaultHandler javadocs
*/
private static class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => cell
if(name.equals("c")) {
// Print the cell reference
//System.out.print(attributes.getValue("r") + " - ");
// Figure out if the value is an index in the SST
String cellType = attributes.getValue("t");
if(cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
}
// Clear contents cache
lastContents = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
// Process the last contents as required.
// Do now, as characters() may be called more than once
if(nextIsString) {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
nextIsString = false;
}
// v => contents of a cell
// Output after we've seen the string contents
if(name.equals("v")) {
System.out.println(lastContents);
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
}
public static void main(String[] args) throws Exception {
String file_path = "D:/Data/Finance/Finance.xlsx";
testclass example = new testclass();
example.processAllSheets(file_path);
}
}

Related

Apache POI reads wrong date format of Excel sheet by the XSSF reader

I want to read specific value of cell in Excel sheet.Currently my code is giving General value instead of Custom value of cell.
For cite an Example :-
Now,
I am giving the input in Excel sheet Custom(Cell type)--> 01/10/2250.
output is converting in General value of Cell--->128110
(The cell is in format 'General', this is an important point becuse it is this specific condition that causes the error we have).
We are using XSSF reader and XMLReader and now Workbook because of memory issues with workbook. (we see alot of answers on the we for workbook not for XSSF)
Test cases
input --> 01/10/2250
Desired output -->01/10/2250
Actual output---->128110
input --> 12-25-2250
Desired output -->12-25-2250
Actual output---->128110
input --> 2250-25-12
Desired output -->2250-25-12
Actual output---->128195
Please mind all the input pattern with the hyphen(-) and slash(/).
strong text
This is code---->
import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class DateCellDemo {
public void processOneSheet(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
InputStream sheet2 = r.getSheet("rId1");
InputSource sheetSource = new InputSource(sheet2);
parser.parse(sheetSource);
sheet2.close();
}
public void processAllSheets(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
while(sheets.hasNext()) {
System.out.println("Processing new sheet:\n");
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
System.out.println("");
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser =
XMLReaderFactory.createXMLReader(
"org.apache.xerces.parsers.SAXParser"
);
ContentHandler handler = new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
private static class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => cell
if(name.equals("c")) {
// Print the cell reference
System.out.print(attributes.getValue("r") + " - ");
// Figure out if the value is an index in the SST
String cellType = attributes.getValue("t");
//System.out.println("------ cellType :"+cellType);
if(cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
}
// Clear contents cache
lastContents = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
// Process the last contents as required.
// Do now, as characters() may be called more than once
if(nextIsString) {
System.out.println("Date Pattren : "+nextIsString);
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
}
// v => contents of a cell
// Output after we've seen the string contents
if(name.equals("v")) {
System.out.println("Last_Content : "+lastContents);
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
}
public static void main(String[] args) {
try {
DateCellDemo howto = new DateCellDemo();
howto.processOneSheet("C:\\Users\\Dell\\Downloads\\Source11_Data.xlsx");
} catch (Exception e) {
e.printStackTrace();
}
}
}
There is another Apache POI example (XLSX2CSV) that lets you stream the input xlsx but get access to formatted cell data. Your code is based on another sample which reads the raw XML from the xlsx file (and the raw XML has the dates in numeric format).
This is based on that sample:
https://github.com/pjfanning/poi-shared-strings-sample/blob/master/src/main/java/com/github/pjfanning/poi/sample/XLSX2CSV.java

gettting exception while reading duplicate column name excel file using sparkexcel library. How to overcome this issue

I am using spark-excel(com.crealytics.spark.excel) library to read excel file. If no duplicate column in excel file, the library working fine. If any duplicate column name occurs in excel file, throwing below exception.
How to overcome this error?
Is there any workaround solution to overcome this?
Exception in thread "main" org.apache.spark.sql.AnalysisException: Found duplicate column(s) in the data schema: `net territory`;
at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:85)
Using spark excel API getting exception .
StructType schema = DataTypes.createStructType(new StructField[]{DataTypes.createStructField("CGISAI", DataTypes.StringType, true), DataTypes.createStructField("SALES TERRITORY", DataTypes.StringType, true)});
SQLContext sqlcxt = new SQLContext(jsc);
Dataset<Row> df = sqlcxt.read()
.format("com.crealytics.spark.excel")
.option("path", "file:///"+siteinfofile)
.option("useHeader", "true")
.option("spark.read.simpleMode", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("inferSchema", "false")
.option("addColorColumns", "False")
.option("sheetName", "sheet1")
.option("startColumn", 22)
.option("endColumn", 23)
//.schema(schema)
.load();
return df;
This is the code I am using. I am using sparkexcel library from com.crealytics.spark.excel.
I want the solution to identify whether excel file has duplicate columns or not. if have duplicate columns, how to rename/eliminate the duplicate columns.
WorkAround is as below:
convert .xlsx file into .csv . using spark default csv api that can handle duplicate column names by renaming them automatically.
Below is the code to convert from xlsx to csv file.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.huawei.java.tools;
/**
*
* #author Nanaji Jonnadula
*/
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import static org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
public class ExcelXlsx2Csv {
private static class SheetToCSV implements SheetContentsHandler {
private boolean firstCellOfRow = false;
private int currentRow = -1;
private int currentCol = -1;
private StringBuffer lineBuffer = new StringBuffer();
/** * Destination for data */
private FileOutputStream outputStream;
public SheetToCSV(FileOutputStream outputStream) {
this.outputStream = outputStream;
}
#Override
public void startRow(int rowNum) {
/** * If there were gaps, output the missing rows: * outputMissingRows(rowNum - currentRow - 1); */
// Prepare for this row
firstCellOfRow = true;
currentRow = rowNum;
currentCol = -1;
lineBuffer.delete(0, lineBuffer.length()); //clear lineBuffer
}
#Override
public void endRow(int rowNum) {
lineBuffer.append('\n');
try {
outputStream.write(lineBuffer.substring(0).getBytes());
} catch (IOException e) {
System.out.println("save date to file error at row number: {}"+ currentCol);
throw new RuntimeException("save date to file error at row number: " + currentCol);
}
}
#Override
public void cell(String cellReference, String formattedValue, XSSFComment comment) {
if (firstCellOfRow) {
firstCellOfRow = false;
} else {
lineBuffer.append(',');
}
// gracefully handle missing CellRef here in a similar way as XSSFCell does
if (cellReference == null) {
cellReference = new CellAddress(currentRow, currentCol).formatAsString();
}
int thisCol = (new CellReference(cellReference)).getCol();
int missedCols = thisCol - currentCol - 1;
if (missedCols > 1) {
lineBuffer.append(',');
}
currentCol = thisCol;
if (formattedValue.contains("\n")) {
formattedValue = formattedValue.replace("\n", "");
}
formattedValue = "\"" + formattedValue + "\"";
lineBuffer.append(formattedValue);
}
#Override
public void headerFooter(String text, boolean isHeader, String tagName) {
// Skip, no headers or footers in CSV
}
}
private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings,
SheetContentsHandler sheetHandler, InputStream sheetInputStream) throws Exception {
DataFormatter formatter = new DataFormatter();
InputSource sheetSource = new InputSource(sheetInputStream);
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(
styles, null, strings, sheetHandler, formatter, false);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch (ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
public static void process(String srcFile, String destFile,String sheetname_) throws Exception {
File xlsxFile = new File(srcFile);
OPCPackage xlsxPackage = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(xlsxPackage);
XSSFReader xssfReader = new XSSFReader(xlsxPackage);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
int index = 0;
while (iter.hasNext()) {
InputStream stream = iter.next();
String sheetName = iter.getSheetName();
System.out.println(sheetName + " [index=" + index + "]");
if(sheetName.equals(sheetname_)){
FileOutputStream fileOutputStream = new FileOutputStream(destFile);
processSheet(styles, strings, new SheetToCSV(fileOutputStream), stream);
fileOutputStream.flush();
fileOutputStream.close();
}
stream.close();
++index;
}
xlsxPackage.close();
}
public static void main(String[] args) throws Exception {
ExcelXlsx2Csv.process("D:\\data\\latest.xlsx", "D:\\data\\latest.csv","sheet1"); //source , destination, sheetname
}
}

Reading a 10 MB file in Apache POI

The project that I'm working on is trying to read a very large Excel file (a couple hundred columns and around 3000 rows) and recognize patterns in a series of letters. It works just fine on smaller files, but when I try to run it using this file I receive a java.lang.OutOfMemoryError: Java heap space error even when I'm only trying to analyze the first few rows. The error appears to be at Workbook wb = WorkbookFactory.create(new File(filepath));
I've tried a few of the solutions on this website, but have not come across any success. My code is as follows:
import java.awt.List;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
public class ExcelReader {
public int Reader(File file) throws IOException, EncryptedDocumentException, InvalidFormatException {
String filepath = file.getPath();
Workbook wb = WorkbookFactory.create(new File(filepath));
XSSFSheet sheet = (XSSFSheet) wb.getSheetAt(0);
XSSFRow row;
XSSFCell cell;
ArrayList<Integer> list = new ArrayList<Integer>();
int rows;
int cols = 0;
int temp = 0;
rows = sheet.getPhysicalNumberOfRows();
for (int i = 0; i <= 1; i++) {
row = sheet.getRow(i);
if (row != null) {
temp = sheet.getRow(i).getPhysicalNumberOfCells();
if (temp > cols)
cols = temp;
}
}
for (int r = 0; r <= 60; r++) {
row = sheet.getRow(r);
if (row != null) {
for (int c = 0; c <= cols; c++) {
int numblanks = 0;
cell = row.getCell((short) c);
if (cell != null) {
//System.out.print(cell + "\t\t");
} else {
//System.out.print("\t\t");
}
if (cell != null && cell.getCellType() == XSSFCell.CELL_TYPE_STRING) {
if ("N".equals(cell.getStringCellValue())) {
for (int k = c; k <= cols; k++) {
if ("-".equals(row.getCell(k).getStringCellValue())) {
numblanks++;
continue;
}
if ("S".equals(row.getCell(c + 2 + numblanks).getStringCellValue())
|| "T".equals(row.getCell(c + 2 + numblanks).getStringCellValue())) {
list.add((int) sheet.getRow(1).getCell(c).getNumericCellValue());
break;
}
}
}
}
}
System.out.println();
}
}
System.out.println();
System.out.println("Rows: " + rows);
System.out.println("Columns: " + cols);
System.out.println(list);
return temp;
}
}
Thank you for any help you can give me!
I solved this issue before. My case was to read a 23M Excel file which contains 230k rows.
Increase maximum heap size is not a good solution. Apache poi does not have a streaming mode to read data. This non-streaming mode costs too much memory.
My solution is to convert the data to xml and then use XMLReader to parse the data.
Please check the following sample code:
protected List<Entity> parseData(InputStream in) throws Exception {
OPCPackage pkg = OPCPackage.open(in);
XSSFReader r = new XSSFReader(pkg);
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) r.getSheetsData();
while (sheets.hasNext()) {
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
break; // if only need to process one sheet.
}
return SheetHandler.getRawData();
}
private XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser =
XMLReaderFactory.createXMLReader();
ContentHandler handler = new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
private static class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private boolean nextIsInlineString;
private boolean nextIsNull;
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
rawData = new ArrayList<Entity>();
}
public static List<Entity> getRawData() {
return rawData;
}
#Override
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
}
#Override
public void endElement(String uri, String localName, String name)
throws SAXException {
}
#Override
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
}
}

Reading multiple tabs of a huge excel file in Java using XSS and Event

I am using this code from (by author: lchen) which reads contents from excel file based on number of rows I provide into method 'readRow()'.
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.InputSource;
public class TestLargeFileRead {
private int rowNum = 0;
private OPCPackage opcPkg;
private ReadOnlySharedStringsTable stringsTable;
private XMLStreamReader xmlReader;
public void XExcelFileReader(String excelPath) throws Exception {
opcPkg = OPCPackage.open(excelPath, PackageAccess.READ);
this.stringsTable = new ReadOnlySharedStringsTable(opcPkg);
XSSFReader xssfReader = new XSSFReader(opcPkg);
XMLInputFactory factory = XMLInputFactory.newInstance();
InputStream inputStream = xssfReader.getSheetsData().next();
xmlReader = factory.createXMLStreamReader(inputStream);
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals("sheetData"))
break;
}
}
}
public int rowNum() {
return rowNum;
}
public List<String[]> readRows(int batchSize) throws XMLStreamException {
String elementName = "row";
List<String[]> dataRows = new ArrayList<String[]>();
if (batchSize > 0) {
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals(elementName)) {
rowNum++;
dataRows.add(getDataRow());
if (dataRows.size() == batchSize)
break;
}
}
}
}
return dataRows;
}
private String[] getDataRow() throws XMLStreamException {
List<String> rowValues = new ArrayList<String>();
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals("c")) {
CellReference cellReference = new CellReference(
xmlReader.getAttributeValue(null, "r"));
// Fill in the possible blank cells!
while (rowValues.size() < cellReference.getCol()) {
rowValues.add("");
}
String cellType = xmlReader.getAttributeValue(null, "t");
rowValues.add(getCellValue(cellType));
}
} else if (xmlReader.isEndElement()
&& xmlReader.getLocalName().equals("row")) {
break;
}
}
return rowValues.toArray(new String[rowValues.size()]);
}
private String getCellValue(String cellType) throws XMLStreamException {
String value = ""; // by default
while (xmlReader.hasNext()) {
xmlReader.next();
if (xmlReader.isStartElement()) {
if (xmlReader.getLocalName().equals("v")) {
if (cellType != null && cellType.equals("s")) {
int idx = Integer.parseInt(xmlReader.getElementText());
return new XSSFRichTextString(
stringsTable.getEntryAt(idx)).toString();
} else {
return xmlReader.getElementText();
}
}
} else if (xmlReader.isEndElement()
&& xmlReader.getLocalName().equals("c")) {
break;
}
}
return value;
}
#Override
protected void finalize() throws Throwable {
if (opcPkg != null)
opcPkg.close();
super.finalize();
}
public static void main(String[] args) {
try {
TestLargeFileRead howto = new TestLargeFileRead();
howto.XExcelFileReader("D:\\TEMP_CATALOG\\H1.xlsx");
} catch (Exception e) {
e.printStackTrace();
}
}
}
But it reads only First SHEET's contents and discards other subsequent SHEETS. My requirement is to read SHEET name; and based on name read that SHEET's contents. Can anyone help me to customize this above code fetch SHEET NAME and their contents ? please ?
The key class you need to work with, and tweak your use of, is XSSFReader. If you take a look at the Javadocs for it, you'll see it provides an Iterator of InputStreams of all the sheets, and a way to get at the root Workbook stream.
If you want to access all the sheets, you need to change these lines:
InputStream inputStream = xssfReader.getSheetsData().next();
xmlReader = factory.createXMLStreamReader(inputStream);
Into something more like:
Iterator<InputStream> sheetsData = xssfReader.getSheetsData();
while (sheetsData.hasNext()) {
InputStream inputStream = sheetsData.next();
xmlReader = factory.createXMLStreamReader(inputStream);
....
}
If you also want to get the sheet name as well, you'll want to do something like what is shown in the Apache POI XLSX event-based text extractor
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
while (sheetsData.hasNext()) {
InputStream inputStream = sheetsData.next();
String sheetName = iter.getSheetName();
if (sheetName.equalsIgnoreCase("TheSheetIWant")) {
xmlReader = factory.createXMLStreamReader(inputStream);
....
}
}
If you want to know more about doing this stuff, then one of the best examples, that's easy to read and follow, is XSSFEventBasedExcelExtractor that comes with Apache POI - read the code for that and learn!

Event Based POI reading Excel with hidden columns

I downloaded an excel file which has a hidden column (first column (A1)) along with other column values which are visible to the user. But when I am trying to read this excel using SAX event POI, it never processes the excel file. In startelement method, we do a check if(name.equals("c")) { ......} but this condition is never met when the excel file had a hidden column(The first column).
However when I make the hidden column visible, it reads(processes) the excel file. Another observation made is, once you make the hidden column visible and hide the column again, the excel file is read.
Please suggest.
Try the code here
I did it with an xlsx with a hidden column and the same xlsx with the column unhidden and both revealed all of the columns of the spreadsheet including cells in hidden columns. A bit long is the code, but I put it anyway in the event the link dies someday.
import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class ExampleEventUserModel {
public void processOneSheet(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
// rId2 found by processing the Workbook
// Seems to either be rId# or rSheet#
InputStream sheet2 = r.getSheet("rId2");
InputSource sheetSource = new InputSource(sheet2);
parser.parse(sheetSource);
sheet2.close();
}
public void processAllSheets(String filename) throws Exception {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader( pkg );
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
Iterator<InputStream> sheets = r.getSheetsData();
while(sheets.hasNext()) {
System.out.println("Processing new sheet:\n");
InputStream sheet = sheets.next();
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
System.out.println("");
}
}
public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
XMLReader parser =
XMLReaderFactory.createXMLReader(
"org.apache.xerces.parsers.SAXParser"
);
ContentHandler handler = new SheetHandler(sst);
parser.setContentHandler(handler);
return parser;
}
/**
* See org.xml.sax.helpers.DefaultHandler javadocs
*/
private static class SheetHandler extends DefaultHandler {
private SharedStringsTable sst;
private String lastContents;
private boolean nextIsString;
private SheetHandler(SharedStringsTable sst) {
this.sst = sst;
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
// c => cell
if(name.equals("c")) {
// Print the cell reference
System.out.print(attributes.getValue("r") + " - ");
// Figure out if the value is an index in the SST
String cellType = attributes.getValue("t");
if(cellType != null && cellType.equals("s")) {
nextIsString = true;
} else {
nextIsString = false;
}
}
// Clear contents cache
lastContents = "";
}
public void endElement(String uri, String localName, String name)
throws SAXException {
// Process the last contents as required.
// Do now, as characters() may be called more than once
if(nextIsString) {
int idx = Integer.parseInt(lastContents);
lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
nextIsString = false;
}
// v => contents of a cell
// Output after we've seen the string contents
if(name.equals("v")) {
System.out.println(lastContents);
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
lastContents += new String(ch, start, length);
}
}
public static void main(String[] args) throws Exception {
FromHowTo howto = new FromHowTo();
howto.processOneSheet(args[0]);
howto.processAllSheets(args[0]);
}
}

Categories

Resources