I am converting CSV file to XML , it is converting but not getting desired structured output .
My java Code :-
public static void main(String[] args){
List<String> headers=new ArrayList<String>(5);
File file=new File("C:/Users/Admin/Desktop/data.csv");
BufferedReader reader=null;
try {
DocumentBuilderFactory domFactory =DocumentBuilderFactory.newInstance();
DocumentBuilder domBuilder=domFactory.newDocumentBuilder();
Document newDoc=domBuilder.newDocument();
// Root element
Element rootElement=newDoc.createElement("root");
newDoc.appendChild(rootElement);
reader = new BufferedReader(new FileReader(file));
int line=0;
String text=null;
while ((text=reader.readLine())!=null) {
StringTokenizer st=new StringTokenizer(text, "?", false);
String[] rowValues=new String[st.countTokens()];
int index=0;
while (st.hasMoreTokens()) {
String next=st.nextToken();
rowValues[index++]=next;
}
//String[] rowValues = text.split(",");
if (line == 0) { // Header row
for (String col:rowValues) {
headers.add(col);
Element rowElement=newDoc.createElement("header");
rootElement.appendChild(rowElement);
for (int col1=0;col1<headers.size();col1++) {
String header = headers.get(col1);
String value = null;
if (col1<rowValues.length) {
value=rowValues[col1];
} else {
// ?? Default value
value=" ";
}
rowElement.setTextContent(value);
System.out.println(headers+" "+value);
}
}} else { // Data row
Element rowElement=newDoc.createElement("row");
rootElement.appendChild(rowElement);
for (int col=0;col<headers.size();col++) {
String header = headers.get(col);
String value = null;
if (col<rowValues.length) {
value=rowValues[col];
} else {
// ?? Default value
value=" ";
}
rowElement.setTextContent(value);
System.out.println(header+" "+value);
}
}
line++;
}
try {
TransformerFactory tranFactory = TransformerFactory.newInstance();
Transformer aTransformer = tranFactory.newTransformer();
aTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
aTransformer.setOutputProperty(OutputKeys.METHOD, "xml");
aTransformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
Source src = new DOMSource(newDoc);
Result result = new StreamResult(new File("C:/Users/Admin/Desktop/data.xml"));
aTransformer.transform(src, result);
System.out.println("File creation successfully!");
} catch (Exception exp) {
exp.printStackTrace();
} finally {
try {
} catch (Exception e1) {
}
try {
} catch (Exception e1) {
}
}
} catch (Exception e1) {
e1.printStackTrace();
}
}
This is my CSV file:-
Symbol,Open,High,Low,Last Traded Price,Change
"NIFTY 50","9,645.90","9,650.65","9,600.95","9,609.30","-5.70"
"RELIANCE","1,390.00","1,414.20","1,389.00","1,407.55","26.50"
"BPCL","647.70","665.00","645.95","660.10","10.75"
"ADANIPORTS","368.50","373.80","368.00","372.25","4.25"
"ONGC","159.50","161.75","159.35","160.80","1.70"
And this is the output I am getting:-
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<root>
<header>Symbol,Open,High,Low,Last Traded Price,Change</header>
<row>"NIFTY 50","9,645.90","9,650.65","9,600.95","9,609.30","-5.70"</row>
<row>"RELIANCE","1,390.00","1,414.20","1,389.00","1,407.55","26.50"</row>
<row>"BPCL","647.70","665.00","645.95","660.10","10.75"</row>
<row>"ADANIPORTS","368.50","373.80","368.00","372.25","4.25"</row>
<row>"ONGC","159.50","161.75","159.35","160.80","1.70"</row>
</root>
Suggest me where am I going wrong ? I tried according to me , but getting confuse where in header and row section should I make changes.
ADDED :-
Expected output
<root>
<header>symbol</header>
<row>NIFTY 50</row>
<row>RELIANCE</row>
<row>BPCL></row>
.
.
<header>Open</header>
<row>9,645.90</row>
<row>1,390.00</row>
.
.
</root>
For your reference:
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class CsvToXml {
public static void main(String[] args) {
File inputFile = new File("C:/Users/Admin/Desktop/data.csv");
CSVParser inParser = null;
Document newDoc = null;
try {
inParser = CSVParser.parse(inputFile, StandardCharsets.UTF_8,
CSVFormat.EXCEL.withHeader().withQuoteMode(QuoteMode.NON_NUMERIC));
DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
newDoc = domBuilder.newDocument();
// Root element
Element rootElement = newDoc.createElement("root");
newDoc.appendChild(rootElement);
List<CSVRecord> records = inParser.getRecords();
for (String key : inParser.getHeaderMap().keySet()) {
Element rowElement = newDoc.createElement("header");
rootElement.appendChild(rowElement);
rowElement.setTextContent(key);
for (CSVRecord record : records) {
rowElement = newDoc.createElement("row");
rootElement.appendChild(rowElement);
rowElement.setTextContent(record.get(key));
}
}
TransformerFactory tranFactory = TransformerFactory.newInstance();
Transformer aTransformer = tranFactory.newTransformer();
aTransformer.setOutputProperty(OutputKeys.INDENT, "yes");
aTransformer.setOutputProperty(OutputKeys.METHOD, "xml");
aTransformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
Source src = new DOMSource(newDoc);
Result result = new StreamResult(new File("C:/Users/Admin/Desktop/data.xml"));
aTransformer.transform(src, result);
System.out.println("File creation successfully!");
} catch (Exception e) {
e.printStackTrace();
} finally {
if (inParser != null) {
try {
inParser.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
This is using Apache Commons CSV.
Related
I am very new to XML parsing. I am trying to read the XML file from a shared drive on my computer and moving them to another shared drive. I have the below XML file. i am trying to read the Test.pdf value from this XML document
<?xml version="1.0" encoding="utf-8" ?>
<xml>
<IndexData FileName="Test.pdf">
<AttachmentID>3221929</AttachmentID>
<URI>test234555..pdf</URI>
<postmarkDate>2018-07-02T12:52:00.9</postmarkDate>
<pin>305270036</pin>
<scanDate>2018-07-02T12:52:00.9</scanDate>
<UserLogin>admin</UserLogin>
</IndexData>
<IndexData FileName="Test2.pdf">
<AttachmentID>3221931</AttachmentID>
<URI>Appp2.pdf</URI>
<postmarkDate>2018-07-02T14:19:22.5</postmarkDate>
<pin>305270036</pin>
<scanDate>2018-07-02T14:19:22.5</scanDate>
<UserLogin>admin</UserLogin>
</IndexData>
</xml>
I tried importing import org.w3c.dom.Node; for this. Below is my code:
String processXml(Node doc) {
String fileName = null;
try {
DfLogger.debug(this, "Loading: " + doc.getNodeName(), null, null);
Map<String, String> indexData = getXmlData(doc);
fileName = indexData.get("IndexData FileName");
if (new File(fileName).exists()) {
import(fileName, indexData);
}
} catch (Exception ex) {
DfLogger.error(this, "Error processing document.", null, ex);
return null;
}
return fileName;
}
My value for FileName is always NULL when I am trying to read the value by doing this:
fileName = indexData.get("IndexData FileName");
below is my getXmlData method.
protected Map<String, String> getXmlData(Node xmlDataNode) {
Map<String, String> xmlData = new HashMap<>();
NodeList nodeList = xmlDataNode.getChildNodes();
for (int i = 0; i < nodeList.getLength(); i++) {
Node node = nodeList.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
xmlData.put(node.getNodeName(), node.getTextContent().trim());
}
}
return xmlData;
}
The caller method for processXML is below:
Public void processIncomingfiles(String documentTagName) throws Exception {
DfLogger.debug(this, "Import Process Begin ---- exportPath=" + exportPath, null, null);
try {
File dir = new File(exportPath);
if (dir.isDirectory()) {
FilenameFilter xmlFiles = new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.toLowerCase().endsWith(".xml");
}
};
for (File file : dir.listFiles(xmlFiles)) {
if (!file.isDirectory()) {
DfLogger.debug(this, "Loading XML file: " + file.getAbsolutePath(), null, null);
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder documentBuilder = dbFactory.newDocumentBuilder();
FileInputStream fileStream = new FileInputStream(file);
try {
// Use FileInputStream instead of File since parse will leave file locked on error
Document doc = documentBuilder.parse(fileStream);
fileStream.close();
fileStream = null;
doc.getDocumentElement().normalize();
NodeList nodeList = doc.getElementsByTagName(documentTagName);
List<Node> errors = new ArrayList<>();
for (int i = 0; i < nodeList.getLength(); i++) {
String documentFilename = processXml(nodeList.item(i));
if (documentFilename != null) {
moveFileToProcessedSuccessful(documentFilename);
} else {
DfLogger.debug(
this,
"Error processing document in file: " + file.getName(),
null,
null);
errors.add(nodeList.item(i));
}
}
if (!errors.isEmpty()) {
if (errors.size() == nodeList.getLength()) {
safeMove(file, file.getAbsolutePath() + ".errors");
} else {
Node parent = nodeList.item(0).getParentNode();
for (Node errorDoc : errors) {
parent.removeChild(errorDoc);
}
writeXml(doc, file.getAbsolutePath());
moveFileToProcessedSuccessful(file);
while (nodeList.getLength() > 0) {
parent.removeChild(nodeList.item(0));
}
for (Node errorDoc : errors) {
parent.appendChild(errorDoc);
}
writeXml(doc, file.getAbsolutePath() + ".errors");
}
} else {
moveFileToProcessedSuccessful(file);
}
} catch (Exception ex) {
DfLogger.error(this, "Error parsing XML File.", null, ex);
if (fileStream != null) {
fileStream.close(); // If DocBuilder.parse fails, leaves file locked
}
safeMove(file, file.getAbsolutePath() + ".error");
}
}
}
}
} catch (Exception ex) {
DfLogger.error(this, "Error in XML Parser.", null, ex);
throw ex;
}
DfLogger.debug(this, "Import Process Ends -----------", null, null);
}
/**
* Process the Xml for the give document node.
* #param doc xml node
* #return filename of successfully processed document, otherwise null
*/
any help will be appreciated.
Lets assume you have your xml data in test.xml file. You can read file and get specific data from your xml using the below code:
package yourPackage;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
public class Main {
public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
Document doc = factory.newDocumentBuilder().parse(Files.newInputStream(Paths.get("test.xml")));
doc.getDocumentElement().normalize();
Element data = (Element)doc.getElementsByTagName("IndexData").item(0);
System.out.println(data.getAttribute("FileName"));
}
}
The output is :
Test.pdf
I have the following source XML file named customers.xml:
<?xml version="1.0" encoding="utf-8"?>
<p:CustomerElement xmlns:p="http://www.dog.com/customer" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:schemaLocation="http://www.dog.com/customer Customer.xsd">
<Customer>
<Sender>
<transmitDate>2016-02-21T00:00:00</transmitDate>
<transmitter>Dog ETL v2.0</transmitter>
<dealerCode><![CDATA[P020]]></dealerCode>
<DMSSystem><![CDATA[DBS]]></DMSSystem>
<DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
</Sender>
<Identifier>
<updateInd><![CDATA[A]]></updateInd>
<dealerCustNumber><![CDATA[AMADOR]]></dealerCustNumber>
<dealerCustName><![CDATA[AMADOR COMPUTERS]]></dealerCustName>
<phoneNumber><![CDATA[800 111 4444]]></phoneNumber>
<faxNumber><![CDATA[780 111 4444]]></faxNumber>
<email xsi:nil="true" />
<customerType><![CDATA[R]]></customerType>
<activeCustomerInd>false</activeCustomerInd>
<parentCustomerNumber xsi:nil="true" />
<primaryStoreNumber><![CDATA[00]]></primaryStoreNumber>
<preferredLanguage><![CDATA[ENG]]></preferredLanguage>
<dealerDateInSystem>2000-01-11T00:00:00</dealerDateInSystem>
<dealerLastUpdatedDate>2015-02-05T00:00:00</dealerLastUpdatedDate>
</Identifier>
<Location>
<address2><![CDATA[ACCOUNT FLAGGED FOR DELETION]]></address2>
<address3><![CDATA[AS PER BILL FEB AA/15]]></address3>
<city><![CDATA[CHICAGO]]></city>
<postalCode><![CDATA[Q5S 1E5]]></postalCode>
<state><![CDATA[AB]]></state>
<country><![CDATA[CA]]></country>
<location><![CDATA[FLAGGED FOR DELETION]]></location>
<addressType><![CDATA[M]]></addressType>
</Location>
<Division>
<divisionCode><![CDATA[G]]></divisionCode>
<divisionName><![CDATA[CAR]]></divisionName>
<IndustryCode>
<industryCode><![CDATA[AQ99]]></industryCode>
<primaryIndustryCodeInd>true</primaryIndustryCodeInd>
</IndustryCode>
<SalesRep>
<number><![CDATA[XXX]]></number>
<name><![CDATA[KILL ACCOUNT IN PROCESS]]></name>
<type><![CDATA[M]]></type>
<par>0</par>
<email xsi:nil="true" />
<phoneNumber><![CDATA[000 000 0000]]></phoneNumber>
</SalesRep>
</Division>
</Customer>
<Customer>
<Sender>
<transmitDate>2016-02-21T00:00:00</transmitDate>
<transmitter>Dog ETL v2.0</transmitter>
<dealerCode><![CDATA[P000]]></dealerCode>
<DMSSystem><![CDATA[DBS]]></DMSSystem>
<DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
</Sender>
<Identifier>
<updateInd><![CDATA[A]]></updateInd>
<dealerCustNumber><![CDATA[UU20888]]></dealerCustNumber>
<dealerCustName><![CDATA[ ADVERTISING AND PR]]></dealerCustName>
<phoneNumber xsi:nil="true" />
<faxNumber xsi:nil="true" />
<email xsi:nil="true" />
<customerType><![CDATA[I]]></customerType>
<activeCustomerInd>true</activeCustomerInd>
<parentCustomerNumber xsi:nil="true" />
<primaryStoreNumber><![CDATA[M2]]></primaryStoreNumber>
<preferredLanguage><![CDATA[ENG]]></preferredLanguage>
<dealerDateInSystem>2015-11-18T00:00:00</dealerDateInSystem>
<dealerLastUpdatedDate>2015-11-19T00:00:00</dealerLastUpdatedDate>
</Identifier>
<Location>
<address2><![CDATA[EQUIP]]></address2>
<city><![CDATA[ADER]]></city>
<country><![CDATA[CA]]></country>
<addressType><![CDATA[M]]></addressType>
</Location>
<Division>
<divisionCode><![CDATA[A]]></divisionCode>
<divisionName><![CDATA[AGRO]]></divisionName>
<IndustryCode>
<industryCode><![CDATA[EQ00]]></industryCode>
<primaryIndustryCodeInd>true</primaryIndustryCodeInd>
</IndustryCode>
</Division>
</Customer>
</p:CustomerElement>
I have the following java code, which parses customers.xml into individual "Customer" entities, and then attempts to convert each of them into an AVRO format:
package com.dogsoft.data.xmltoavro;
import java.io.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import jdk.nashorn.internal.runtime.regexp.joni.constants.NodeType;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.avro.Protocol;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
public class ParseXmlFile {
private static Protocol protocol;
public static void xmlToAvro(File xmlFile, File avroFile) throws IOException, SAXException {
try {
InputStream stream = new FileInputStream("/tmp/xml.avsc");
if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");
protocol = Protocol.parse(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
Schema schema = protocol.getType("Element");
Document doc = parse(xmlFile);
DatumWriter<GenericRecord> datumWriter = new SpecificDatumWriter<>(schema);
try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(datumWriter)) {
fileWriter.create(schema, avroFile);
Object docElement = doc.getDocumentElement();
fileWriter.append(wrapElement(doc.getDocumentElement()));
}
}
private static GenericData.Record wrapElement(Element el) {
GenericData.Record record = new GenericData.Record(protocol.getType("Element"));
record.put("name", el.getNodeName());
NamedNodeMap attributeNodes = el.getAttributes();
List<GenericData.Record> attrRecords = new ArrayList<>();
for (int i = 0; i < attributeNodes.getLength(); i++) {
Attr attr = (Attr) attributeNodes.item(i);
attrRecords.add(wrapAttr(attr));
}
record.put("attributes", attrRecords);
List<Object> childArray = new ArrayList<>();
NodeList childNodes = el.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
Node node = childNodes.item(i);
Object nt = node.getNodeType();
if (node.getNodeType() == Node.ELEMENT_NODE)
childArray.add(wrapElement((Element) node));
if (node.getNodeType() == Node.TEXT_NODE)
childArray.add(node.getTextContent());
}
record.put("children", childArray);
return record;
}
private static GenericData.Record wrapAttr(Attr attr) {
GenericData.Record record = new GenericData.Record(protocol.getType("Attribute"));
record.put("name", attr.getName());
record.put("value", attr.getValue());
return record;
}
private static Document parse(File file) throws IOException, SAXException {
try {
DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
return builder.parse(file);
} catch (ParserConfigurationException e) {
throw new RuntimeException(e);
}
}
public static void avroToXml(File avroFile, File xmlFile) throws IOException {
try {
InputStream stream = new FileInputStream("/tmp/xml.avsc");
if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");
protocol = Protocol.parse(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(protocol.getType("Element"));
DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(avroFile, datumReader);
GenericRecord record = dataFileReader.next();
Document doc;
try {
doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
} catch (ParserConfigurationException e) {
throw new RuntimeException(e);
}
Element el = unwrapElement(record, doc);
doc.appendChild(el);
saveDocument(doc, xmlFile);
}
private static Element unwrapElement(GenericRecord record, Document doc) {
String name = "" + record.get("name");
Element el = doc.createElement(name);
#SuppressWarnings("unchecked")
GenericArray<GenericRecord> attrArray = (GenericArray<GenericRecord>) record.get("attributes");
for (GenericRecord attrRecord : attrArray)
el.setAttributeNode(unwrapAttr(attrRecord, doc));
#SuppressWarnings("unchecked")
GenericArray<Object> childArray = (GenericArray<Object>) record.get("children");
for (Object childObj : childArray) {
if (childObj instanceof GenericRecord)
el.appendChild(unwrapElement((GenericRecord) childObj, doc));
if (childObj instanceof Utf8)
el.appendChild(doc.createTextNode("" + childObj));
}
return el;
}
private static Attr unwrapAttr(GenericRecord record, Document doc) {
Attr attr = doc.createAttribute("" + record.get("name"));
attr.setValue("" + record.get("value"));
return attr;
}
private static void saveDocument(Document doc, File file) {
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.transform(new DOMSource(doc), new StreamResult(file));
} catch (TransformerException e) {
throw new RuntimeException(e);
}
}
public static void main(String[] args)
{
Object nodeObject = null;
Node myNode = null;
Transformer transformer = null;
try
{
try {
transformer =
TransformerFactory.newInstance().newTransformer();
} catch (TransformerConfigurationException e) {
e.printStackTrace();
}
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setNamespaceAware(true);
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse("/tmp/customers.xml");
System.out.printf("Version = %s%n", doc.getXmlVersion());
System.out.printf("Encoding = %s%n", doc.getXmlEncoding());
System.out.printf("Standalone = %b%n%n", doc.getXmlStandalone());
if (doc.hasChildNodes())
{
int customerNumber = 0;
NodeList nl = doc.getDocumentElement().getChildNodes();
for (int i = 0; i < nl.getLength(); i++) {
Node node = nl.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
System.out.println(node.toString());
customerNumber++;
File avroFile = new File("/tmp/customer" + customerNumber + ".avro");
File xmlFile = new File("/tmp/customer" + customerNumber + ".xml");
File xmlFile1 = new File("/tmp/customer" + customerNumber + "-foo.xml");
try {
transformer.transform(
new DOMSource(node), new StreamResult(xmlFile));
File outputFile = new File("/tmp/customer" + customerNumber + ".avro");
xmlToAvro(xmlFile, outputFile);
} catch (TransformerException e) {
e.printStackTrace();
}
}
}
}
}
catch (IOException ioe)
{
System.err.println("IOE: " + ioe);
}
catch (SAXException saxe)
{
System.err.println("SAXE: " + saxe);
}
catch (FactoryConfigurationError fce)
{
System.err.println("FCE: " + fce);
}
catch (ParserConfigurationException pce)
{
System.err.println("PCE: " + pce);
}
}
}
This code works overall, but it ignores any content, which is enclosed into
![CDATA[
tag. As it happens, most of the actual useful data in the customers.xml files is enclosed into these tags.
Is there a way to modify this code, to make it not ignore the CDATA contents?
Instead of hand-writing parser code, you might want to split the problem in two parts: first, bind XML into POJO (using JAXB or Jackson XML module); and then write POJO as Avro (using Apache Avro lib, or Jackson Avro module). All you need for that would be POJO definition that matches expected structure for data as XML and Avro. Result should be less code, and basically specifying what needs to happen and now how to do it.
I got "java.lang.NullPointerException" when I try to parse an XML and the XML is not complete.
Here the code
import java.io.File;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.Statement;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class Read {
public static void main(String[] args) {
String tableName = "myTable";
String query = ("select * FROM " + tableName);
try {
Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");
java.sql.Connection conn = DriverManager
.getConnection("jdbc:sqlserver://localhost:1433;databaseName=myDatabase;user=myUser;password=myPassword");
Statement state = conn.createStatement();
ResultSet result = state.executeQuery(query);
ResultSetMetaData resultMeta = result.getMetaData();
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
int cpt=1;
String [][]d ;
d = new String [1000][1000];
d[0][0]= new String("test");
d[0][1]=new String ("test");
while(result.next()){
for(int i = 1; i <= resultMeta.getColumnCount(); i++)
{
if( result.getString(i)== null)
{ d[cpt][i]= new String(" ");}
else
{ String res =result.getString(i);
d[cpt][i]= new String (res);
}
}
cpt=cpt+1;
}
//Element Table Name
Document doc = docBuilder.newDocument();
Element p_tableName = doc.createElement(tableName);
doc.appendChild(p_tableName);
for(int j=1;j<= 15;j++)
{
Element p_object = doc.createElement("Object");
p_tableName.appendChild(p_object);
for(int i=1;i <= resultMeta.getColumnCount() ;i++)
{
Element nomChamps = doc.createElement(resultMeta.getColumnName(i));
nomChamps.appendChild(doc.createTextNode(d[j][i]));
p_object.appendChild(nomChamps);
}
}
Transformer tf = TransformerFactory.newInstance().newTransformer();
//Format XML
tf.setOutputProperty(OutputKeys.INDENT, "yes");
tf.setOutputProperty(OutputKeys.METHOD, "xml");
tf.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
tf.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
DOMSource source = new DOMSource(doc);
StreamResult res = new StreamResult(new File("D:\\test/file2.xml"));
**//I've got a problem whit this next line;**
**tf.transform(source, res);**
System.out.println("ACTION COMPLETE !!");
result.close();
state.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
With "tf.transform(source, res)" the xml is created but not finished and I've got an error, but if I delete this line the error leave and my XML is not created.
Thx for helping
You can use an error listener to manage this, something like this:
tf.setErrorListener(new ErrorListener() {
#Override
public void warning(TransformerException exception) throws TransformerException {
exception.printStackTrace();
}
#Override
public void error(TransformerException exception) throws TransformerException {
exception.printStackTrace();
}
#Override
public void fatalError(TransformerException exception) throws TransformerException {
exception.printStackTrace();
}
} );
I have the following simplified XML:
<?xml version="1.0" encoding="UTF-8"?>
<ExportData>
<Rows>
<R>
<companyCodestringtrue>101</companyCodestringtrue>
<transactionQualifierstring>Sales</transactionQualifierstring>
<menuItemNumberlong>4302150</menuItemNumberlong>
<productQuantityinttrue>14</productQuantityinttrue>
<productValueInclVATdecimaltrue>1.90</productValueInclVATdecimaltrue>
<productValueExclVATdecimaltrue>1.775701</productValueExclVATdecimaltrue>
</R>
<R>
<companyCodestringtrue>101</companyCodestringtrue>
<transactionQualifierstring>Sales</transactionQualifierstring>
<menuItemNumberlong>333555</menuItemNumberlong>
<productQuantityinttrue>0</productQuantityinttrue>
<productValueInclVATdecimaltrue>3.90</productValueInclVATdecimaltrue>
<productValueExclVATdecimaltrue>3.775701</productValueExclVATdecimaltrue>
</R>
<R>
<companyCodestringtrue>101</companyCodestringtrue>
<transactionQualifierstring>Sales</transactionQualifierstring>
<menuItemNumberlong>1235665</menuItemNumberlong>
<productQuantityinttrue>5</productQuantityinttrue>
<productValueInclVATdecimaltrue>4.90</productValueInclVATdecimaltrue>
<productValueExclVATdecimaltrue>4.775701</productValueExclVATdecimaltrue>
</R>
</Rows>
</ExportData>
I need to delete each complete <R> element if the <productQuantityinttrue> element equals "0".
I came up with the following Java code:
package filterPositions;
import java.io.File;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class FilterPositions {
public static String result = "";
public static void main(String[] args) throws Exception {
try {
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
File filePath = new File("C:/LSA_SALES_EXPORT_1507_test_zero_qu.xml");
Document doc = docBuilder.parse(filePath);
Node rootNode = doc.getDocumentElement();
final Element element = doc.getDocumentElement();
// output new XML Document
DocumentBuilder parser = docFactory.newDocumentBuilder();
Document newdoc = parser.newDocument();
newdoc.adoptNode(traversingXML(element));
writeXmlFile(newdoc, "LSA_SALES_EXPORT_1507_test_zero_qu_OUT.xml");
System.out.println("Done...");
System.out.println("Exiting...");
} catch (Exception e) {
e.printStackTrace();
}
}
public static Element traversingXML(Element element) {
NodeList positionen = element.getElementsByTagName("R");
Element e = null;
for (int i = 0; i < positionen.getLength(); i++) {
e = (Element) positionen.item(i);
for (Node child = e.getFirstChild(); child != null; child = child.getNextSibling()) {
if (child instanceof Element && "productQuantityinttrue".equals(child.getNodeName())&& "0".equals(child.getTextContent())) {
e.getParentNode().removeChild(e);
}
}
}
System.out.println(e);
return e;
}
public static void writeXmlFile(Document doc, String filename) {
try {
// Prepare the DOM document for writing
Source source = new DOMSource();
// Prepare the output file
File file = new File(filename);
Result result = new StreamResult(file);
// Write the DOM document to the file
Transformer xformer = TransformerFactory.newInstance()
.newTransformer();
xformer.transform(source, result);
} catch (TransformerConfigurationException e) {
} catch (TransformerException e) {
}
}
}
I am not sure if my method "traversingXML" is working properly. My problem right now is that the adapted XML structure (one deleted) is not written to newdoc.
You don't copy the original document to newdoc; instead you create a new, empty XML document.
Instead, try this code:
...
final Element element = doc.getDocumentElement(); // original code up to here
traversingXML(element); // delete the node
writeXmlFile(doc, "LSA_SALES_EXPORT_1507_test_zero_qu_OUT.xml"); // save modified document
Is it possible to convert a MS Word to XML file using Apache POI ?
If it is, can you point me to any tutorials for doing that?
I'd say you have two options, both powered by Apache POI
One is to use Apache Tika. Tika is a text and metadata extraction toolkit, and is able to extract fairly rich text from Word documents by making appropriate calls to POI. The result is that Tika will give you XHTML style XML for the contents of your word document.
The other option is to use a class that was added fairly recently to POI, which is WordToHtmlConverter. This will turn your word document into HTML for you, and generally will preserve slightly more of the structure and formatting than Tika will.
Depending on the kind of XML you're hoping to get out, one of these should be a good bet for you. I'd suggest you try both against some of your sample files, and see which one is the best fit for your problem domain and needs.
The purpose of HWPF subproject is exactly that: process Word files.
http://poi.apache.org/hwpf/index.html
Then, to convert the data to XML you have to build XML by the ususal ways: StAX, JDOM, XStream...
Apache offers a Quick Guide:
http://poi.apache.org/hwpf/quick-guide.html
and I also have found that:
http://sanjaal.com/java/tag/simple-java-tutorial-to-read-microsoft-document-in-java/
If you want to process docx files, you might want to look at the OpenXML4J subproject:
http://poi.apache.org/oxml4j/index.html
package com.govind.service;
import java.io.File;
import java.io.FileInputStream;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
* DOC to XML converter service
*
* #author govind.sharma
*
*/
public class DocToXmlConverter {
static final Logger logger = Logger.getLogger(DocToXmlConverter.class);
DocumentBuilderFactory docFactory = null;
DocumentBuilder docBuilder = null;
Element rootElement = null;
Document docxml = null;
boolean subHeaders = false;
Element UrlElement = null;
/**
* #param path
* #param fileName
*/
public void processDocxToXml(String path, String fileName) {
XWPFDocument xdoc = null;
FileInputStream fis = null;
String fullPath = path + "/" + fileName + ".docx";
try {
// Read file
fis = new FileInputStream(fullPath);
xdoc = new XWPFDocument(OPCPackage.open(fis));
initializeXml();
// get Document Body Paragraph content
List < XWPFParagraph > paragraphList = xdoc.getParagraphs();
for (XWPFParagraph paragraph: paragraphList) {
String styleName = paragraph.getStyle();
String paraText = paragraph.getParagraphText();
String bulletsPoints = paragraph.getNumFmt();
createXmlTags(styleName, paraText, bulletsPoints);
}
// write the content into XML file
generateXml(path, fileName);
logger.info("Doc to Xml Convertion completed.");
} catch (Exception ex) {
logger.error("Exception while generating XML from DOC" + ex.getMessage());
System.exit(0);
}
}
/**
* #param path
* #param fileName
*/
public void processDocToXml(String path, String fileName) {
HWPFDocument doc = null;
String fullPath = path + "/" + fileName + ".doc";
WordExtractor we = null;
try {
POIFSFileSystem fis = new POIFSFileSystem(new FileInputStream(fullPath));
doc = new HWPFDocument(fis);
} catch (Exception e) {
logger.error("Unable to Read File..." + e.getMessage());
System.exit(0);
}
try {
we = new WordExtractor(doc);
Range range = doc.getRange();
initializeXml();
String[] paragraphs = we.getParagraphText();
for (int i = 0; i < paragraphs.length; i++) {
org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i);
int j = 0;
while (true) {
CharacterRun run = pr.getCharacterRun(j++);
StyleDescription style = doc.getStyleSheet().getStyleDescription(run.getStyleIndex());
String styleName = style.getName();
String paraText = run.text();
String bulletsPoints = null;
createXmlTags(styleName, paraText, bulletsPoints);
if (run.getEndOffset() == pr.getEndOffset()) {
break;
}
}
}
generateXml(path, fileName);
logger.info("Document to Xml Convertion completed.");
} catch (Exception ex) {
logger.error("Exception while generating XML from DOC" + ex.getMessage());
System.exit(0);
}
}
/**
*
*/
private void initializeXml() {
// initialize XML Document
try {
docFactory = DocumentBuilderFactory.newInstance();
docBuilder = docFactory.newDocumentBuilder();
docxml = docBuilder.newDocument();
rootElement = docxml.createElement("ROOT");
docxml.appendChild(rootElement);
} catch (ParserConfigurationException e) {
logger.error("Exception while initializing XML" + e.getMessage());
}
}
/**
* #param styleName
* #param paragraphText
* #param bulletsPoints
*/
private void createXmlTags(String styleName, String paragraphText, String bulletsPoints) {
// create XML Tags
if (styleName != null && paragraphText.length() > 1) {
if (styleName.equalsIgnoreCase("Style4")) {
Element pragElement = docxml.createElement("TITLE");
pragElement.appendChild(docxml.createTextNode(paragraphText.trim()));
rootElement.appendChild(pragElement);
subHeaders = true;
} else if (styleName.equalsIgnoreCase("Default")) {
Element pragElement = docxml.createElement("P");
pragElement.appendChild(docxml.createTextNode(paragraphText));
rootElement.appendChild(pragElement);
subHeaders = true;
} else if (styleName.equalsIgnoreCase("Normal")) {
Element pragElement = docxml.createElement("P");
pragElement.appendChild(docxml.createTextNode(paragraphText));
rootElement.appendChild(pragElement);
subHeaders = true;
} else if (styleName.equalsIgnoreCase("BodyCopy") && bulletsPoints != null) {
Element pragElement = docxml.createElement("LI");
pragElement.appendChild(docxml.createTextNode(paragraphText));
UrlElement.appendChild(pragElement);
subHeaders = false;
} else if (styleName.equalsIgnoreCase("BodyCopy")) {
Element pragElement = docxml.createElement("PS");
pragElement.appendChild(docxml.createTextNode(paragraphText));
rootElement.appendChild(pragElement);
subHeaders = true;
} else if (styleName.equalsIgnoreCase("ListParagraph")) {
Element pragElement = docxml.createElement("LI");
pragElement.appendChild(docxml.createTextNode(paragraphText));
UrlElement.appendChild(pragElement);
subHeaders = false;
} else if (styleName.equalsIgnoreCase("Subheader1")) {
UrlElement = docxml.createElement("UL");
Element pragElement = docxml.createElement("LI");
pragElement.appendChild(docxml.createTextNode(paragraphText));
UrlElement.appendChild(pragElement);
rootElement.appendChild(UrlElement);
subHeaders = false;
} else {
Element pragElement = docxml.createElement("PS");
pragElement.appendChild(docxml.createTextNode(paragraphText));
rootElement.appendChild(pragElement);
subHeaders = true;
}
} else if (paragraphText.trim().length() > 1) {
Element pragElement = docxml.createElement("P");
pragElement.appendChild(docxml.createTextNode(paragraphText));
rootElement.appendChild(pragElement);
subHeaders = true;
}
if (subHeaders) {
Element pragElement = docxml.createElement("NEWLINE");
pragElement.appendChild(docxml.createTextNode(""));
rootElement.appendChild(pragElement);
}
}
/**
* #param path
* #param fileName
*/
private void generateXml(String path, String fileName) {
try {
// write the content into xml file
TransformerFactory transformerFactory = TransformerFactory.newInstance();
Transformer transformer = transformerFactory.newTransformer();
transformer.setOutputProperty(OutputKeys.METHOD, "xml");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
DOMSource source = new DOMSource(docxml);
StreamResult result = new StreamResult(new File(path + "/" + fileName + ".xml"));
transformer.transform(source, result);
} catch (Exception e) {
logger.error("Exception while generating XML" + e.getMessage());
}
}
}