Parsing XML with XmlUtils - java

I am using XmlUtils to parse the XML below and collect the values of the id attribute of each contact into a List, but the list comes back empty.
Where am I going wrong? Please suggest a fix.
XML:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<address-book xmlns="qwerty" xmlns:ab2="asdfgh">
<contact time-stamp="2014-02-26T16:35:20.678+02:00" id="12345">
<ns2:person-details index="9AmmUzHXBPsK:96">
<ns2:name index="1">
<ns2:name-entry index="1">
<ns2:display-name>DISP0dNXoq</ns2:display-name>
<ns2:given display-order="1">GIVENw17JCb</ns2:given>
<ns2:family display-order="1">FAMILYcl7h2y</ns2:family>
</ns2:name-entry>
</ns2:name>
<ns2:comm-addr xml:lang="en">
<ns2:uri-entry addr-uri-type="trn" index="1:1111">
<ns2:addr-uri>cnaFC#hOog6.com</ns2:addr-uri>
<ns2:label xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:nil="true"/>
</ns2:uri-entry>
<ns2:tel tel-type="Home" index="2:22222">
<ns2:tel-nb>
<ns2:tel-str>97235852622</ns2:tel-str>
</ns2:tel-nb>
</ns2:tel>
<ns2:tel tel-type="Work" index="3:33333">
<ns2:tel-nb>
<ns2:tel-str>97230557837</ns2:tel-str>
</ns2:tel-nb>
</ns2:tel>
<ns2:tel tel-type="Mobile" index="3:33333">
<ns2:tel-nb>
<ns2:tel-str>972542993697</ns2:tel-str>
</ns2:tel-nb>
</ns2:tel>
</ns2:comm-addr>
</ns2:person-details>
<contact-status>
<contact-source>contact-source-sim-1393425320678</contact-source>
</contact-status>
</contact>
<contact time-stamp="2014-02-26T16:37:19.370+02:00" id="12346">
<contact time-stamp="2014-02-26T16:38:53.345+02:00" id="12347">
<contact time-stamp="2014-02-26T16:37:30.828+02:00" id="12348">
Code:
// Parse the response string into a JDOM document.
Document document = XmlUtils.createDocument(responseString);
// NOTE(review): in the XML above the root element is itself <address-book>, so this asks the
// root for *children* named "address-book". The name is also looked up without a namespace,
// while the document declares xmlns="qwerty" — presumably why the list is empty; confirm
// against the JDOM getChildren(String) / getChildren(String, Namespace) documentation.
List<Element> list = document.getRootElement().getChildren("address-book");
for( Element ele : list){
// Prints the id attribute of the first <contact> child of each matched element.
System.out.println(ele.getChild("contact").getAttribute("id").getValue());
}
Class XmlUtils-
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.xml.sax.InputSource;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.io.FileWriter;
/**
 * Static helpers for parsing, pretty-printing and saving XML with JDOM.
 */
public class XmlUtils
{
    private static final Logger logger = Logger.getLogger(XmlUtils.class);

    /**
     * Parses {@code doc} and returns it re-serialized in pretty-printed form.
     *
     * @param doc XML text to parse
     * @return the pretty-printed XML of the root element
     * @throws JDOMException if the text is not well-formed XML
     * @throws IOException if reading the text fails
     */
    public static String getFormatedXMLString(String doc) throws JDOMException, IOException
    {
        return makeDomToFormatedString(createDocument(doc));
    }

    /**
     * Pretty-prints the root element of {@code doc}.
     *
     * @param doc document whose root element is serialized
     * @return the pretty-printed XML of the root element
     */
    public static String makeDomToFormatedString(Document doc)
    {
        return makeDomToFormatedString(doc.getRootElement());
    }

    /**
     * Pretty-prints {@code elem}, expanding empty elements to start/end tag pairs and
     * trimming whitespace-only text.
     *
     * @param elem element to serialize
     * @return the pretty-printed XML of {@code elem}
     */
    public static String makeDomToFormatedString(Element elem)
    {
        Format format = Format.getPrettyFormat();
        format.setExpandEmptyElements(true);
        format.setTextMode(Format.TextMode.TRIM_FULL_WHITE);

        XMLOutputter output = new XMLOutputter();
        output.setFormat(format);
        return output.outputString(elem);
    }

    /**
     * Builds a JDOM document from XML text.
     *
     * @param xml XML text to parse
     * @return the parsed document
     * @throws JDOMException if the text is not well-formed XML
     * @throws IOException if reading the text fails
     */
    public static Document createDocument(String xml) throws JDOMException, IOException
    {
        InputSource in = new InputSource(new StringReader(xml));
        SAXBuilder saxB = new SAXBuilder();
        return saxB.build(in);
    }

    /**
     * Parses {@code xmlFile} and returns its root element.
     *
     * @param xmlFile file to parse
     * @return the root element of the parsed file
     * @throws JDOMException if the file is not well-formed XML
     * @throws IOException if the file cannot be read
     */
    public static Element createElement(File xmlFile) throws JDOMException, IOException
    {
        SAXBuilder saxB = new SAXBuilder();
        Document document = saxB.build(xmlFile);
        return document.getRootElement();
    }

    /**
     * Writes {@code doc} pretty-printed to the file at {@code path}. I/O failures are
     * logged, not propagated.
     *
     * @param doc document to write
     * @param path destination file path
     */
    public static void writeXmlFile(Document doc,String path){
        // try-with-resources: the writer was previously never closed, which leaked the file
        // handle and could leave buffered output unwritten.
        // NOTE(review): FileWriter encodes with the platform default charset, while the
        // pretty format presumably declares UTF-8 in the XML header — confirm they agree.
        try (FileWriter writer = new FileWriter(path)) {
            XMLOutputter xmlOutputer = new XMLOutputter();
            xmlOutputer.setFormat(Format.getPrettyFormat());
            xmlOutputer.output(doc, writer);
        } catch (IOException e) {
            logger.error("cant write xml file",e);
        }
    }
}

Here we go
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.xml.sax.InputSource;
/**
 * Static helpers for parsing, pretty-printing and saving XML with JDOM, plus a small demo
 * ({@link #main}) that lists the {@code id} attribute of every {@code <contact>} element.
 */
public class XmlUtils
{
    /**
     * Demo: parses a sample address book and prints each contact id twice, once via an
     * index loop ("Style 1") and once via a for-each loop.
     */
    public static void main(String[] args) throws JDOMException, IOException {
        String test="<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?><address-book xmlns=\"qwerty\" xmlns:ab2=\"asdfgh\"><contact time-stamp=\"2014-02-26T16:35:20.678+02:00\" id=\"12345\"></contact><contact time-stamp=\"2014-02-26T16:37:19.370+02:00\" id=\"12346\"></contact><contact time-stamp=\"2014-02-26T16:38:53.345+02:00\" id=\"12347\"></contact><contact time-stamp=\"2014-02-26T16:37:30.828+02:00\" id=\"12348\"></contact></address-book>";
        Document document = XmlUtils.createDocument(test);
        Element rootNode = document.getRootElement();

        // The document declares xmlns="qwerty", so <contact> lives in that namespace and must
        // be looked up with it. The root already carries the namespace from parsing, so there
        // is no need to call setNamespace on it.
        Namespace namespace = Namespace.getNamespace("qwerty");

        List<?> list = rootNode.getChildren("contact", namespace);
        for (int i = 0; i < list.size(); i++) {
            Element node = (Element) list.get(i);
            System.out.println("id values using Style 1 : " + node.getAttribute("id").getValue());
        }

        // JDOM 1.x returns a raw List here; every entry is an Element.
        @SuppressWarnings("unchecked")
        List<Element> list2 = rootNode.getChildren("contact", namespace);
        for (Element ele : list2) {
            System.out.println(ele.getAttribute("id").getValue());
        }
    }

    /**
     * Parses {@code doc} and returns it re-serialized in pretty-printed form.
     *
     * @throws JDOMException if the text is not well-formed XML
     * @throws IOException if reading the text fails
     */
    public static String getFormatedXMLString(String doc) throws JDOMException, IOException
    {
        return makeDomToFormatedString(createDocument(doc));
    }

    /** Pretty-prints the root element of {@code doc}. */
    public static String makeDomToFormatedString(Document doc)
    {
        return makeDomToFormatedString(doc.getRootElement());
    }

    /**
     * Pretty-prints {@code elem}, expanding empty elements to start/end tag pairs and
     * trimming whitespace-only text.
     */
    public static String makeDomToFormatedString(Element elem)
    {
        Format format = Format.getPrettyFormat();
        format.setExpandEmptyElements(true);
        format.setTextMode(Format.TextMode.TRIM_FULL_WHITE);

        XMLOutputter output = new XMLOutputter();
        output.setFormat(format);
        return output.outputString(elem);
    }

    /**
     * Builds a JDOM document from XML text.
     *
     * @throws JDOMException if the text is not well-formed XML
     * @throws IOException if reading the text fails
     */
    public static Document createDocument(String xml) throws JDOMException, IOException
    {
        InputSource in = new InputSource(new StringReader(xml));
        SAXBuilder saxB = new SAXBuilder();
        return saxB.build(in);
    }

    /**
     * Parses {@code xmlFile} and returns its root element.
     *
     * @throws JDOMException if the file is not well-formed XML
     * @throws IOException if the file cannot be read
     */
    public static Element createElement(File xmlFile) throws JDOMException, IOException
    {
        SAXBuilder saxB = new SAXBuilder();
        Document document = saxB.build(xmlFile);
        return document.getRootElement();
    }

    /**
     * Writes {@code doc} pretty-printed to the file at {@code path}. I/O failures are
     * reported on standard error, not propagated.
     */
    public static void writeXmlFile(Document doc,String path){
        // try-with-resources: the writer was previously never closed, which leaked the file
        // handle and could leave buffered output unwritten.
        try (FileWriter writer = new FileWriter(path)) {
            XMLOutputter xmlOutputer = new XMLOutputter();
            xmlOutputer.setFormat(Format.getPrettyFormat());
            xmlOutputer.output(doc, writer);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
output will be
id values using Style 1 : 12345
id values using Style 1 : 12346
id values using Style 1 : 12347
id values using Style 1 : 12348
12345
12346
12347
12348
Let me know if you face any issues :)

Related

How can I replace an attribute in xml with different value using Java?

I have an xml file:
<pickingOrderBeginEventMessage xmlns="http://www.xmlns.walmartstores.com/SuppyChain/FulfillmentManagement/GlobalIntegeratedFulfillment/">
<MessageBody>
<RoutingInfo>
<SourceNode>
<location>
<countryCode>US</countryCode>
</location>
</SourceNode>
</RoutingInfo>
<fulfillmentOrders>
<fulfillmentOrder>
<orderNbr>784</orderNbr>
</fulfillmentOrder>
</fulfillmentOrders>
</MessageBody>
</pickingOrderBeginEventMessage>
I want to change <orderNbr>784</orderNbr> to <orderNbr>784778474484747</orderNbr>
This is my method:
(Note that I am using dom4j.)
/**
 * Loads pick_begin.xml, sets {@code attribute} to {@code newValue} on every element whose
 * {@code attribute} value contains {@code oldValue}, and returns the indented result.
 *
 * @param attribute name of the attribute to look for and overwrite
 * @param oldValue substring the current attribute value must contain
 * @param newValue value written to the attribute
 * @return the serialized document
 */
public String replaceXML(String attribute,String oldValue, String newValue) throws SAXException, DocumentException, IOException, TransformerException {
    SAXReader xmlReader = new SAXReader();
    // Parser hardening must be configured before read(); set afterwards it has no effect
    // on the document that was already parsed.
    xmlReader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    xmlReader.setFeature("http://xml.org/sax/features/external-general-entities", false);
    xmlReader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    Document input = xmlReader.read("src/test/resources/xml/pick_begin.xml");
    // '@' is the XPath attribute axis: this selects elements carrying an attribute named
    // `attribute` whose value contains `oldValue`.
    String expr = String.format("//*[contains(@%s, '%s')]", attribute, oldValue);
    XPath xpath = DocumentHelper.createXPath(expr);
    List<Node> nodes = xpath.selectNodes(input);
    for (Node node : nodes) {
        Element element = (Element) node;
        element.addAttribute(attribute, newValue);
    }
    TransformerFactory factory = TransformerFactory.newInstance();
    factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
    Transformer xformer = factory.newTransformer();
    xformer.setOutputProperty(OutputKeys.INDENT, "yes");
    Writer output = new StringWriter();
    xformer.transform(new DocumentSource(input), new StreamResult(output));
    return output.toString();
}
}
Where String attribute is orderNbr, oldValue is 784 and newValue is 78455556767.
But with this method, the new value is not getting replaced. Where am I going wrong?
According to the XML file in your question, orderNbr is an element and not an attribute and its text value is 784. So you want to replace the text value with 78455556767.
Your code does not change the original XML because your XPath query string does not find anything.
Therefore you need to change two things in your code.
The XPath query string.
The method you call to change the XML.
The below code contains the two changes. The changed lines are indicated with the following comment at the end of the line.
// CHANGE HERE
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.List;
import javax.xml.XMLConstants;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.XPath;
import org.dom4j.io.DocumentSource;
import org.dom4j.io.SAXReader;
import org.xml.sax.SAXException;
/**
 * Replaces the text of matching elements in pick_begin.xml and prints the result.
 */
public class ChngAttr {

    /**
     * Loads pick_begin.xml, finds every element named {@code attribute} whose text equals
     * {@code oldValue}, sets its text to {@code newValue}, and returns the indented result.
     *
     * @param attribute local name of the element to change (despite the parameter name, this
     *                  is an element, not an attribute)
     * @param oldValue text the element must currently contain
     * @param newValue replacement text
     * @return the serialized, modified document
     */
    public static String replaceXML(String attribute,
                                    String oldValue,
                                    String newValue) throws DocumentException,
                                                            IOException,
                                                            SAXException,
                                                            TransformerException {
        SAXReader xmlReader = new SAXReader();
        // Parser hardening must be configured before read(); set afterwards it has no effect
        // on the document that was already parsed.
        xmlReader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        xmlReader.setFeature("http://xml.org/sax/features/external-general-entities", false);
        xmlReader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        Document input = xmlReader.read("pick_begin.xml");
        // Match on local-name() rather than a bare name test: the sample document declares a
        // default namespace, and an unprefixed XPath name only matches elements in no namespace.
        String expr = String.format("//*[local-name() = '%s' and text() = '%s']", attribute, oldValue); // CHANGE HERE
        XPath xpath = DocumentHelper.createXPath(expr);
        List<Node> nodes = xpath.selectNodes(input);
        for (Node node : nodes) {
            Element element = (Element) node;
            element.setText(newValue); // CHANGE HERE
        }
        TransformerFactory factory = TransformerFactory.newInstance();
        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        Transformer xformer = factory.newTransformer();
        xformer.setOutputProperty(OutputKeys.INDENT, "yes");
        Writer output = new StringWriter();
        xformer.transform(new DocumentSource(input), new StreamResult(output));
        return output.toString();
    }

    public static void main(String[] args) throws Exception {
        String result = replaceXML("orderNbr", "784", "78455556767");
        System.out.println(result);
    }
}

JAVA code snippet to replace single quote(') to double quote in whole XML file

I have a XML file having nested tags. We can use DOM, JDOM parser
I want to replace every single quote (') inside the text of every tag in the whole XML file. Tags can be nested inside other tags. I want a loop that visits every tag and replaces its value, e.g. HYPER SHIPPING'SDN BHD_First_Page --> HYPER SHIPPING''SDN BHD_First_Page
Sample code
/**
 * Recursively walks the element tree below {@code parentNode}; for every element that has
 * no child elements and whose text contains a single quote, rewrites the text and logs it.
 */
public void iterateChildNodes(org.jdom.Element parentNode) {
// Leaf element: no child elements, so only its own text is considered.
if(parentNode.getChildren().size() == 0) {
if(parentNode.getText().contains("'")) {
// NOTE(review): in a Java string literal "\'" is just the single-quote character, so
// this replaces ' with ' and leaves the text unchanged.
parentNode.setText(parentNode.getText().replaceAll("'", "\'"));
LOGGER.info("************* Below Value updated");
LOGGER.info(parentNode.getText());
}
}else {
// Non-leaf: descend into every child element.
List<Element> rec = parentNode.getChildren();
for(Element i : rec) {
iterateChildNodes(i);
}
}
}
Sample XML File
<Document>
<Identifier>DOC1</Identifier>
<Type>HYPER SHIPPING SDN BHD</Type>
<Description>HYPER SHIPPING SDN BHD</Description>
<Confidence>33.12</Confidence>
<ConfidenceThreshold>10.0</ConfidenceThreshold>
<Valid>true</Valid>
<Reviewed>true</Reviewed>
<ReviewedBy>SYSTEM</ReviewedBy>
<ValidatedBy>SYSTEM</ValidatedBy>
<ErrorMessage/>
<Value>HYPER SHIPPING'SDN BHD_First_Page</Value> //Value to be replaced here
<DocumentDisplayInfo/>
<DocumentLevelFields/>
<Pages>
<Page>
<Identifier>PG0</Identifier>
<OldFileName>HYPER-KL FEB-0001-0001.tif</OldFileName>
<NewFileName>BI2E7_0.tif</NewFileName>
<SourceFileID>1</SourceFileID>
<PageLevelFields>
<PageLevelField>
<Name>Search_Engine_Classification</Name>
<Value>Park Street '10 road</Value> //Value to be replaced here
<Type/>
<Confidence>66.23</Confidence>
<LearnedFileName>HYPER KL-JUN-0001.tif</LearnedFileName>
<OcrConfidenceThreshold>0.0</OcrConfidenceThreshold>
<OcrConfidence>0.0</OcrConfidence>
<FieldOrderNumber>0</FieldOrderNumber>
<ForceReview>false</ForceReview>
</PageLevelField>
</PageLevelFields>
</Page>
</Pages>
</Document>
This code replaces every ' with " in the text of an XML file.
I have not added a detailed description; read through the code step by step. It is easy to follow.
(Updated)
Part 1: Using JDOM
import java.util.ArrayList;
import java.util.List;
import org.w3c.dom.NodeList;
import org.jdom2.input.SAXBuilder;
import org.jdom2.transform.JDOMSource;
import org.w3c.dom.*;
import java.io.*;
/**
 * Replaces every single quote with a double quote in the text of all leaf elements of
 * document.xml (using JDOM 2) and saves the result to updated-document-jdom.xml.
 */
public class XmlModificationJDom {

    public static void main(String[] args) {
        XmlModificationJDom xmlModificationJDom = new XmlModificationJDom();
        xmlModificationJDom.updateXmlAndSaveJDom();
    }

    /** Loads document.xml, rewrites the quotes and saves the updated document. */
    public void updateXmlAndSaveJDom() {
        try {
            File inputFile = new File("document.xml");
            SAXBuilder saxBuilder = new SAXBuilder();
            org.jdom2.Document xmlDocument = saxBuilder.build(inputFile);
            org.jdom2.Element rootElement = xmlDocument.getRootElement();
            iterateAndUpdateElementsUsingJDom(rootElement);
            saveUpdatedXmlUsingJDomSource(xmlDocument);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Recursively visits {@code element}; for elements without child elements, replaces
     * each single quote in the text with a double quote.
     *
     * @param element subtree root to process
     */
    public void iterateAndUpdateElementsUsingJDom(org.jdom2.Element element) {
        if (element.getChildren().isEmpty()) {
            String text = element.getText();
            if (text.contains("'")) {
                // Plain character replacement; no regular expression is needed.
                element.setText(text.replace('\'', '"'));
            }
        } else {
            for (org.jdom2.Element childElement : element.getChildren()) {
                iterateAndUpdateElementsUsingJDom(childElement);
            }
        }
    }

    /**
     * Serializes {@code xmlDocument} to updated-document-jdom.xml through a JAXP identity
     * transform fed by a {@link JDOMSource}. This method was called but missing from the
     * class, so the class did not compile.
     *
     * @param xmlDocument document to save
     * @throws Exception if the transformer cannot be created or the write fails
     */
    public void saveUpdatedXmlUsingJDomSource(org.jdom2.Document xmlDocument) throws Exception {
        javax.xml.transform.Transformer transformer =
                javax.xml.transform.TransformerFactory.newInstance().newTransformer();
        transformer.transform(
                new JDOMSource(xmlDocument),
                new javax.xml.transform.stream.StreamResult(new File("updated-document-jdom.xml")));
    }
}
Part 2: Using DOM
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.*;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Replaces every single quote with a double quote in the text content of document.xml
 * (using the W3C DOM API) and saves the result to updated-document.xml.
 */
public class XmlModificationDom {

    public static void main(String[] args) {
        XmlModificationDom modifier = new XmlModificationDom();
        modifier.updateXmlAndSave();
    }

    /** Loads document.xml, rewrites the quotes and saves the updated document. */
    public void updateXmlAndSave() {
        try {
            File inputFile = new File("document.xml");
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document document = dBuilder.parse(inputFile);
            document.getDocumentElement().normalize();
            // Start from the document element: the document's first child can be a comment
            // or processing instruction rather than the root element.
            iterateChildNodesAndUpate(document.getDocumentElement());
            writeAndSaveXML(document);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Serializes {@code document} to updated-document.xml using an identity transform.
     *
     * @param document document to save
     * @throws Exception if the transformer cannot be created or the write fails
     */
    public void writeAndSaveXML(Document document) throws Exception {
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Transformer transformer = transformerFactory.newTransformer();
        DOMSource source = new DOMSource(document);
        StreamResult result = new StreamResult(new File("updated-document.xml"));
        transformer.transform(source, result);
    }

    /**
     * Recursively replaces each single quote with a double quote in every text and CDATA
     * node below {@code parentNode}.
     *
     * <p>Only character-data nodes are edited. Rewriting an element through
     * {@code setTextContent} would replace all of its children with one text node, which
     * destroys the structure of an element whose only child is another element.
     *
     * @param parentNode node whose descendants are processed
     */
    public void iterateChildNodesAndUpate(Node parentNode) {
        NodeList nodeList = parentNode.getChildNodes();
        for (int index = 0; index < nodeList.getLength(); index++) {
            Node node = nodeList.item(index);
            switch (node.getNodeType()) {
                case Node.ELEMENT_NODE:
                    iterateChildNodesAndUpate(node);
                    break;
                case Node.TEXT_NODE:
                case Node.CDATA_SECTION_NODE:
                    String text = node.getNodeValue();
                    if (text != null && text.indexOf('\'') >= 0) {
                        node.setNodeValue(text.replace('\'', '"'));
                    }
                    break;
                default:
                    // Comments, processing instructions etc. are left untouched.
                    break;
            }
        }
    }
}
Input file document.xml:
<Document>
<Identifier>DOC1</Identifier>
<Type>HYPER SHIPPING SDN BHD</Type>
<Description>HYPER SHIPPING SDN BHD</Description>
<Confidence>33.12</Confidence>
<ConfidenceThreshold>10.0</ConfidenceThreshold>
<Valid>true</Valid>
<Reviewed>true</Reviewed>
<ReviewedBy>SYSTEM</ReviewedBy>
<ValidatedBy>SYSTEM</ValidatedBy>
<ErrorMessage/>
<Value>HYPER SHIPPING'SDN BHD_First_Page</Value> //Value to be replaced here
<DocumentDisplayInfo/>
<DocumentLevelFields/>
<Pages>
<Page>
<Identifier>PG0</Identifier>
<OldFileName>HYPER-KL FEB-0001-0001.tif</OldFileName>
<NewFileName>BI2E7_0.tif</NewFileName>
<SourceFileID>1</SourceFileID>
<PageLevelFields>
<PageLevelField>
<Name>Search_Engine_Classification</Name>
<Value>Park Street '10 road</Value> //Value to be replaced here
<Type/>
<Confidence>66.23</Confidence>
<LearnedFileName>HYPER KL-JUN-0001.tif</LearnedFileName>
<OcrConfidenceThreshold>0.0</OcrConfidenceThreshold>
<OcrConfidence>0.0</OcrConfidence>
<FieldOrderNumber>0</FieldOrderNumber>
<ForceReview>false</ForceReview>
</PageLevelField>
</PageLevelFields>
</Page>
</Pages>
</Document>
Output updated-document.xml/updated-document-jdom.xml:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<Document>
<Identifier>DOC1</Identifier>
<Type>HYPER SHIPPING SDN BHD</Type>
<Description>HYPER SHIPPING SDN BHD</Description>
<Confidence>33.12</Confidence>
<ConfidenceThreshold>10.0</ConfidenceThreshold>
<Valid>true</Valid>
<Reviewed>true</Reviewed>
<ReviewedBy>SYSTEM</ReviewedBy>
<ValidatedBy>SYSTEM</ValidatedBy>
<ErrorMessage/>
<Value>HYPER SHIPPING"SDN BHD_First_Page</Value><DocumentDisplayInfo/>
<DocumentLevelFields/>
<Pages>
<Page>
<Identifier>PG0</Identifier>
<OldFileName>HYPER-KL FEB-0001-0001.tif</OldFileName>
<NewFileName>BI2E7_0.tif</NewFileName>
<SourceFileID>1</SourceFileID>
<PageLevelFields>
<PageLevelField>
<Name>Search_Engine_Classification</Name>
<Value>Park Street "10 road</Value><Type/>
<Confidence>66.23</Confidence>
<LearnedFileName>HYPER KL-JUN-0001.tif</LearnedFileName>
<OcrConfidenceThreshold>0.0</OcrConfidenceThreshold>
<OcrConfidence>0.0</OcrConfidence>
<FieldOrderNumber>0</FieldOrderNumber>
<ForceReview>false</ForceReview>
</PageLevelField>
</PageLevelFields>
</Page>
</Pages>
</Document>
For more detailed code, visit this repo.
You need to escape the single quote and the double quote with a backslash inside the Java string literals:
value =value.replace("\'","\"");
Just replace the removeQuote method with
/**
 * Replaces every single quote with a double quote in the text of the {@code Value} child
 * of each {@code Documents/Document} element of {@code batchXml}.
 *
 * @param batchXml batch document to update in place
 */
private static void removeQuote(Document batchXml) throws JDOMException, Exception {
    Element root = batchXml.getRootElement();
    List<Element> docs = root.getChild("Documents").getChildren("Document");
    for (Element doc : docs) {
        Element valueElement = doc.getChild("Value");
        if (valueElement == null) {
            continue; // this Document has no Value element to rewrite
        }
        String value = valueElement.getText();
        if (value.contains("'")) {
            // Write the result back: the earlier version computed a new string but never
            // stored it in the element (and used an undeclared variable).
            valueElement.setText(value.replace('\'', '"'));
        }
    }
}

how to convert xml to avro without ignoring !CDATA content?

I have the following source XML file named customers.xml:
<?xml version="1.0" encoding="utf-8"?>
<p:CustomerElement xmlns:p="http://www.dog.com/customer" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:schemaLocation="http://www.dog.com/customer Customer.xsd">
<Customer>
<Sender>
<transmitDate>2016-02-21T00:00:00</transmitDate>
<transmitter>Dog ETL v2.0</transmitter>
<dealerCode><![CDATA[P020]]></dealerCode>
<DMSSystem><![CDATA[DBS]]></DMSSystem>
<DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
</Sender>
<Identifier>
<updateInd><![CDATA[A]]></updateInd>
<dealerCustNumber><![CDATA[AMADOR]]></dealerCustNumber>
<dealerCustName><![CDATA[AMADOR COMPUTERS]]></dealerCustName>
<phoneNumber><![CDATA[800 111 4444]]></phoneNumber>
<faxNumber><![CDATA[780 111 4444]]></faxNumber>
<email xsi:nil="true" />
<customerType><![CDATA[R]]></customerType>
<activeCustomerInd>false</activeCustomerInd>
<parentCustomerNumber xsi:nil="true" />
<primaryStoreNumber><![CDATA[00]]></primaryStoreNumber>
<preferredLanguage><![CDATA[ENG]]></preferredLanguage>
<dealerDateInSystem>2000-01-11T00:00:00</dealerDateInSystem>
<dealerLastUpdatedDate>2015-02-05T00:00:00</dealerLastUpdatedDate>
</Identifier>
<Location>
<address2><![CDATA[ACCOUNT FLAGGED FOR DELETION]]></address2>
<address3><![CDATA[AS PER BILL FEB AA/15]]></address3>
<city><![CDATA[CHICAGO]]></city>
<postalCode><![CDATA[Q5S 1E5]]></postalCode>
<state><![CDATA[AB]]></state>
<country><![CDATA[CA]]></country>
<location><![CDATA[FLAGGED FOR DELETION]]></location>
<addressType><![CDATA[M]]></addressType>
</Location>
<Division>
<divisionCode><![CDATA[G]]></divisionCode>
<divisionName><![CDATA[CAR]]></divisionName>
<IndustryCode>
<industryCode><![CDATA[AQ99]]></industryCode>
<primaryIndustryCodeInd>true</primaryIndustryCodeInd>
</IndustryCode>
<SalesRep>
<number><![CDATA[XXX]]></number>
<name><![CDATA[KILL ACCOUNT IN PROCESS]]></name>
<type><![CDATA[M]]></type>
<par>0</par>
<email xsi:nil="true" />
<phoneNumber><![CDATA[000 000 0000]]></phoneNumber>
</SalesRep>
</Division>
</Customer>
<Customer>
<Sender>
<transmitDate>2016-02-21T00:00:00</transmitDate>
<transmitter>Dog ETL v2.0</transmitter>
<dealerCode><![CDATA[P000]]></dealerCode>
<DMSSystem><![CDATA[DBS]]></DMSSystem>
<DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
</Sender>
<Identifier>
<updateInd><![CDATA[A]]></updateInd>
<dealerCustNumber><![CDATA[UU20888]]></dealerCustNumber>
<dealerCustName><![CDATA[ ADVERTISING AND PR]]></dealerCustName>
<phoneNumber xsi:nil="true" />
<faxNumber xsi:nil="true" />
<email xsi:nil="true" />
<customerType><![CDATA[I]]></customerType>
<activeCustomerInd>true</activeCustomerInd>
<parentCustomerNumber xsi:nil="true" />
<primaryStoreNumber><![CDATA[M2]]></primaryStoreNumber>
<preferredLanguage><![CDATA[ENG]]></preferredLanguage>
<dealerDateInSystem>2015-11-18T00:00:00</dealerDateInSystem>
<dealerLastUpdatedDate>2015-11-19T00:00:00</dealerLastUpdatedDate>
</Identifier>
<Location>
<address2><![CDATA[EQUIP]]></address2>
<city><![CDATA[ADER]]></city>
<country><![CDATA[CA]]></country>
<addressType><![CDATA[M]]></addressType>
</Location>
<Division>
<divisionCode><![CDATA[A]]></divisionCode>
<divisionName><![CDATA[AGRO]]></divisionName>
<IndustryCode>
<industryCode><![CDATA[EQ00]]></industryCode>
<primaryIndustryCodeInd>true</primaryIndustryCodeInd>
</IndustryCode>
</Division>
</Customer>
</p:CustomerElement>
I have the following java code, which parses customers.xml into individual "Customer" entities, and then attempts to convert each of them into an AVRO format:
package com.dogsoft.data.xmltoavro;
import java.io.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import jdk.nashorn.internal.runtime.regexp.joni.constants.NodeType;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.avro.Protocol;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
public class ParseXmlFile {
private static Protocol protocol;
public static void xmlToAvro(File xmlFile, File avroFile) throws IOException, SAXException {
try {
InputStream stream = new FileInputStream("/tmp/xml.avsc");
if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");
protocol = Protocol.parse(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
Schema schema = protocol.getType("Element");
Document doc = parse(xmlFile);
DatumWriter<GenericRecord> datumWriter = new SpecificDatumWriter<>(schema);
try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(datumWriter)) {
fileWriter.create(schema, avroFile);
Object docElement = doc.getDocumentElement();
fileWriter.append(wrapElement(doc.getDocumentElement()));
}
}
private static GenericData.Record wrapElement(Element el) {
GenericData.Record record = new GenericData.Record(protocol.getType("Element"));
record.put("name", el.getNodeName());
NamedNodeMap attributeNodes = el.getAttributes();
List<GenericData.Record> attrRecords = new ArrayList<>();
for (int i = 0; i < attributeNodes.getLength(); i++) {
Attr attr = (Attr) attributeNodes.item(i);
attrRecords.add(wrapAttr(attr));
}
record.put("attributes", attrRecords);
List<Object> childArray = new ArrayList<>();
NodeList childNodes = el.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
Node node = childNodes.item(i);
Object nt = node.getNodeType();
if (node.getNodeType() == Node.ELEMENT_NODE)
childArray.add(wrapElement((Element) node));
if (node.getNodeType() == Node.TEXT_NODE)
childArray.add(node.getTextContent());
}
record.put("children", childArray);
return record;
}
private static GenericData.Record wrapAttr(Attr attr) {
GenericData.Record record = new GenericData.Record(protocol.getType("Attribute"));
record.put("name", attr.getName());
record.put("value", attr.getValue());
return record;
}
private static Document parse(File file) throws IOException, SAXException {
try {
DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
return builder.parse(file);
} catch (ParserConfigurationException e) {
throw new RuntimeException(e);
}
}
public static void avroToXml(File avroFile, File xmlFile) throws IOException {
try {
InputStream stream = new FileInputStream("/tmp/xml.avsc");
if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");
protocol = Protocol.parse(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(protocol.getType("Element"));
DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(avroFile, datumReader);
GenericRecord record = dataFileReader.next();
Document doc;
try {
doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
} catch (ParserConfigurationException e) {
throw new RuntimeException(e);
}
Element el = unwrapElement(record, doc);
doc.appendChild(el);
saveDocument(doc, xmlFile);
}
private static Element unwrapElement(GenericRecord record, Document doc) {
String name = "" + record.get("name");
Element el = doc.createElement(name);
#SuppressWarnings("unchecked")
GenericArray<GenericRecord> attrArray = (GenericArray<GenericRecord>) record.get("attributes");
for (GenericRecord attrRecord : attrArray)
el.setAttributeNode(unwrapAttr(attrRecord, doc));
#SuppressWarnings("unchecked")
GenericArray<Object> childArray = (GenericArray<Object>) record.get("children");
for (Object childObj : childArray) {
if (childObj instanceof GenericRecord)
el.appendChild(unwrapElement((GenericRecord) childObj, doc));
if (childObj instanceof Utf8)
el.appendChild(doc.createTextNode("" + childObj));
}
return el;
}
private static Attr unwrapAttr(GenericRecord record, Document doc) {
Attr attr = doc.createAttribute("" + record.get("name"));
attr.setValue("" + record.get("value"));
return attr;
}
private static void saveDocument(Document doc, File file) {
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.transform(new DOMSource(doc), new StreamResult(file));
} catch (TransformerException e) {
throw new RuntimeException(e);
}
}
public static void main(String[] args)
{
Object nodeObject = null;
Node myNode = null;
Transformer transformer = null;
try
{
try {
transformer =
TransformerFactory.newInstance().newTransformer();
} catch (TransformerConfigurationException e) {
e.printStackTrace();
}
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setNamespaceAware(true);
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse("/tmp/customers.xml");
System.out.printf("Version = %s%n", doc.getXmlVersion());
System.out.printf("Encoding = %s%n", doc.getXmlEncoding());
System.out.printf("Standalone = %b%n%n", doc.getXmlStandalone());
if (doc.hasChildNodes())
{
int customerNumber = 0;
NodeList nl = doc.getDocumentElement().getChildNodes();
for (int i = 0; i < nl.getLength(); i++) {
Node node = nl.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
System.out.println(node.toString());
customerNumber++;
File avroFile = new File("/tmp/customer" + customerNumber + ".avro");
File xmlFile = new File("/tmp/customer" + customerNumber + ".xml");
File xmlFile1 = new File("/tmp/customer" + customerNumber + "-foo.xml");
try {
transformer.transform(
new DOMSource(node), new StreamResult(xmlFile));
File outputFile = new File("/tmp/customer" + customerNumber + ".avro");
xmlToAvro(xmlFile, outputFile);
} catch (TransformerException e) {
e.printStackTrace();
}
}
}
}
}
catch (IOException ioe)
{
System.err.println("IOE: " + ioe);
}
catch (SAXException saxe)
{
System.err.println("SAXE: " + saxe);
}
catch (FactoryConfigurationError fce)
{
System.err.println("FCE: " + fce);
}
catch (ParserConfigurationException pce)
{
System.err.println("PCE: " + pce);
}
}
}
This code works overall, but it ignores any content that is enclosed in a <![CDATA[ ... ]]> section. As it happens, most of the actual useful data in the customers.xml file is enclosed in these sections.
Is there a way to modify this code, to make it not ignore the CDATA contents?
Instead of hand-writing parser code, you might want to split the problem into two parts: first, bind the XML to a POJO (using JAXB or the Jackson XML module); then write the POJO as Avro (using the Apache Avro library or the Jackson Avro module). All you need for that is a POJO definition that matches the expected structure of the data in both XML and Avro. The result should be less code: you basically specify what needs to happen, not how to do it.

Read XML using dom4j or mycila

I have the following XML as a String, but I am having trouble reading the "PrdInfoTable" and "OrdInfoTable" elements in a loop: their number is dynamic, so I need to read them into an ArrayList or something similar. I have tried several approaches but still cannot get it to work. How can I do this?
<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetCardResponse xmlns="http://tempuri.org/">
<GetCardResult>
<ReturnResult>
<Return>
<ReturnMsgNo>1</ReturnMsgNo>
<ReturnMsg>交易成功</ReturnMsg>
</Return>
<GetCardResult>
<OrdTable>
<Facno>1234</Facno>
<TrdDate>2015/5/6 11:04:20</TrdDate>
<TrdSeq>ABCD1234</TrdSeq>
<TrdBarCode>123456789</TrdBarCode>
</OrdTable>
<PrdTable>
<GameFacName></GameFacName>
<PrdName>abc123</PrdName>
<CardId>ABCD012345</CardId>
<CardPwd>KKSDHASBDH</CardPwd>
<ExpDate>2015/02/12</ExpDate>
</PrdTable>
<PrdInfoTable>
<PrdNote>* 測12346666666666666666666666666666666</PrdNote>
</PrdInfoTable>
<PrdInfoTable>
<PrdNote>* 測56787777777777777777777</PrdNote>
</PrdInfoTable>
<PrdInfoTable>
<PrdNote>* 測12345611111111111111111</PrdNote>
</PrdInfoTable>
<OrdInfoTable>
<TxetContent>測1111111111111111111111111111111111111111</TxetContent>
</OrdInfoTable>
<OrdInfoTable>
<TxetContent>22222測22222222222222222222222222222222222</TxetContent>
</OrdInfoTable>
<OrdInfoTable>
<TxetContent>3333333333333333333333測333333333333333333</TxetContent>
</OrdInfoTable>
<OrdInfoTable>
<TxetContent>4444444測444444444444444444444444444444444</TxetContent>
</OrdInfoTable>
<OrdInfoTable>
<TxetContent>55555555555555555555555555555測55555555555</TxetContent>
</OrdInfoTable>
<FreeSnTable />
</GetCardResult>
</ReturnResult>
</GetCardResult>
</GetCardResponse>
</soap:Body>
Below is the code:
// Post the SOAP request and read the whole response body as UTF-8 text.
HttpClient httpClient = new HttpClient();
PostMethod post = new PostMethod(url);
post.setRequestEntity(new StringRequestEntity(xmlRequest.toString()));
post.setRequestHeader("Content-type", "application/soap+xml; charset=utf-8");
// NOTE(review): length() counts chars, not encoded bytes, so this is presumably wrong for
// non-ASCII payloads — confirm whether HttpClient already derives the length from the entity.
post.setRequestHeader("Content-Length", xmlRequest.length()+"");
responseCode = httpClient.executeMethod(post);
InputStream in = post.getResponseBodyAsStream();
String line;
StringBuilder responseBuffer = new StringBuilder();
// try-with-resources closes the reader (and the underlying stream) even if reading fails.
try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
    while( ( line = reader.readLine() ) != null ) {
        responseBuffer.append(line);
    }
}
String xmlresponse = responseBuffer.toString();
System.out.println("XML Response:\n");
// The payload arrives XML-escaped inside the SOAP body; turn the escaped angle brackets back
// into markup. (Replacing "<" with "<" and ">" with ">" was a no-op.)
xmlresponse = xmlresponse.replace("&lt;", "<");
xmlresponse = xmlresponse.replace("&gt;", ">");
System.out.println(xmlresponse+"\n");
XMLTag xmlTag = XMLDoc.from(xmlresponse, true);
System.out.println("\n" + xmlTag.gotoRoot().gotoChild().gotoChild().gotoTag("GetCardResult/ReturnResult/GetCardResult/PrdInfoTable").getCurrentTagName());
System.out.println("PrdNote: "+ xmlTag.gotoRoot().gotoChild().gotoChild().getText("GetCardResult/ReturnResult/GetCardResult/PrdInfoTable/PrdNote[1]"));
System.out.println("\n" + xmlTag.gotoRoot().gotoChild().gotoChild().gotoTag("GetCardResult/ReturnResult/GetCardResult/PrdInfoTable").getCurrentTagName());
System.out.println("PrdNote: "+ xmlTag.gotoRoot().gotoChild().gotoChild().getText("GetCardResult/ReturnResult/GetCardResult/PrdInfoTable/PrdNote[2]"));
Another easy way to achieve this is using XPath expressions.
This code reads an XML file (tested with your XML code) and looks for the PrdNote and TxetContent elements. The method extractNodesValues returns an ArrayList<String> with their values:
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Reads an XML file and prints the text of every PrdNote and TxetContent
 * element it contains, selecting them with XPath expressions.
 */
public class Test{

    private static final String RESULT_PATH =
            "/Envelope/Body/GetCardResponse/GetCardResult/ReturnResult/GetCardResult";

    /**
     * Extracts both value lists from src/test.xml and prints them. Any
     * parsing or XPath failure is reported by printing its message.
     */
    public static void main(String[] args){
        try {
            File source = new File("src/test.xml");
            // Both extractions run before anything is printed.
            ArrayList<String> notes =
                    extractNodesValues(source, RESULT_PATH + "/PrdInfoTable/PrdNote/text()");
            ArrayList<String> contents =
                    extractNodesValues(source, RESULT_PATH + "/OrdInfoTable/TxetContent/text()");
            printSection("PrdNotesValues:", notes);
            System.out.println("");
            printSection("TxetContentValues:", contents);
        }
        catch(XPathExpressionException e){ System.out.println(e.getMessage()); }
        catch(IOException e){ System.out.println(e.getMessage()); }
        catch(SAXException e){ System.out.println(e.getMessage()); }
        catch(ParserConfigurationException e){ System.out.println(e.getMessage()); }
    }

    /** Prints a heading followed by one value per line. */
    private static void printSection(String heading, ArrayList<String> values) {
        System.out.println(heading);
        for (int idx = 0; idx < values.size(); idx++) {
            System.out.println(values.get(idx));
        }
    }

    /**
     * Parses the given file and evaluates an XPath expression against it.
     *
     * @param f                the XML file to parse
     * @param xpath_expression the XPath expression selecting the wanted nodes
     * @return the node value of every matching node, in document order;
     *         empty when nothing matches
     * @throws XPathExpressionException     if the expression cannot be compiled or evaluated
     * @throws IOException                  if the file cannot be read
     * @throws SAXException                 if the file is not well-formed XML
     * @throws ParserConfigurationException if no DOM parser can be created
     */
    public static ArrayList<String> extractNodesValues(File f, String xpath_expression) throws XPathExpressionException, IOException, SAXException, ParserConfigurationException {
        DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = parser.parse(f);
        XPath evaluator = XPathFactory.newInstance().newXPath();
        ArrayList<String> values = new ArrayList<String>();

        NodeList matches = (NodeList) evaluator.compile(xpath_expression)
                .evaluate(dom, XPathConstants.NODESET);
        if (matches == null) {
            return values;
        }
        // Collect the value of every node matched by the expression.
        int count = matches.getLength();
        for (int idx = 0; idx < count; idx++) {
            values.add(matches.item(idx).getNodeValue());
        }
        return values;
    }
}
Output:
PrdNotesValues:
* 測12346666666666666666666666666666666
* 測56787777777777777777777
* 測12345611111111111111111
TxetContentValues:
測1111111111111111111111111111111111111111
22222測22222222222222222222222222222222222
3333333333333333333333測333333333333333333
4444444測444444444444444444444444444444444
55555555555555555555555555555測55555555555
Hope this helps.

Updating an XML String

From the given XML string, I have to update the EndDate value.
Even though I'm updating the XML in the updateNodeValue() method, my final output XML is the same as the input XML.
Can someone tell me what the mistake in this code is?
import java.io.StringReader;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
 * Parses a small XML string, replaces the text of its EndDate element and
 * prints the document before and after the change.
 */
public class MyClass{
    static String strXml = "<INFO><BeginDate>2013-12-02</BeginDate><EndDate>2014-01-31</EndDate></INFO>";

    /** Prints the input XML, updates EndDate and prints the resulting XML. */
    public static void main(String[] args) throws Exception {
        System.out.println(strXml);
        Document doc = StringToDocument(strXml);
        updateNodeValue(doc);
        String newxml = DocumentToString(doc);
        System.out.println(newxml);
    }

    /**
     * Sets the text of the first EndDate child of the root element to
     * "2013-12-12". Does nothing when the document has no root element or
     * the root has no such child.
     *
     * @param doc the document to modify in place
     */
    public static void updateNodeValue(Document doc) {
        // getDocumentElement() skips comments or processing instructions
        // that may precede the root element.
        Node rootNode = doc.getDocumentElement();
        if (rootNode == null) {
            return;
        }
        NodeList list = rootNode.getChildNodes();
        for (int i = 0; i < list.getLength(); i++) {
            Node node = list.item(i);
            // Children may also be text nodes (e.g. whitespace between
            // tags), so check the node type instead of casting blindly.
            if (node.getNodeType() == Node.ELEMENT_NODE && "EndDate".equals(node.getNodeName())) {
                // An element's node value is always null and setting it has
                // no effect; the visible text lives in its child text node,
                // which setTextContent replaces.
                node.setTextContent("2013-12-12");
                return;
            }
        }
    }

    /**
     * Serializes a document to a string without the XML declaration.
     *
     * @param doc the document to serialize
     * @return the XML text of the document
     * @throws Exception if the transformer cannot be created or fails
     */
    public static String DocumentToString(Document doc) throws Exception {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        StringWriter writer = new StringWriter();
        transformer.transform(new DOMSource(doc), new StreamResult(writer));
        return writer.toString();
    }

    /**
     * Parses an XML string into a DOM document.
     *
     * @param strXml the XML text to parse
     * @return the parsed document
     * @throws Exception if the parser cannot be created or the text is not
     *                   well-formed XML
     */
    public static Document StringToDocument(String strXml) throws Exception {
        // Failures propagate to the caller instead of also being printed
        // here, so each error is reported only once.
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        return builder.parse(new InputSource(new StringReader(strXml)));
    }
}
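Use element.setTextContent(...) in your updateNodeValue method.
The method you should use is not setNodeValue() but setTextContent(): for an Element node the node value is defined to be null, so calling setNodeValue() on it has no effect.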
See http://docs.oracle.com/javase/1.5.0/docs/api/org/w3c/dom/Node.html#setNodeValue(java.lang.String)

Categories

Resources