How to compare and get difference between two xml files using java? - java

I have two xml files as below.
Compare.xml
<?xml version="1.0"?>
<class>
<student rollno="393">
<firstname>dinkar</firstname>
<lastname>kad</lastname>
<nickname>dinkar</nickname>
<marks>85</marks>
</student>
<student rollno="493">
<firstname>Vaneet</firstname>
<lastname>Gupta</lastname>
<nickname>vinni</nickname>
<marks>95</marks>
</student>
</class>
Reference.xml
<?xml version="1.0"?>
<class>
<student rollno="393">
<firstname>ila</firstname>
<lastname>kad</lastname>
<nickname>dinkar</nickname>
<marks>85</marks>
</student>
<student rollno="493">
<firstname>Vaneet</firstname>
<lastname>Gupta</lastname>
<nickname>vinni</nickname>
<marks>95</marks>
</student>
</class>
Now I need to compare and get the difference between these two xml files. I also need the output to be exported as a log file.
Below my code:
package DomParserDemo;
import java.io.File;
import java.util.logging.FileHandler;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
public class DomParserDemo {
public static void main(String[] args){
try {
File inputFile = new File("input.txt");
DocumentBuilderFactory dbFactory
= DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(inputFile);
doc.getDocumentElement().normalize();
System.out.println("Root element :"
+ doc.getDocumentElement().getNodeName());
NodeList nList = doc.getElementsByTagName("student");
System.out.println("----------------------------");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
System.out.println("\nCurrent Element :"
+ nNode.getNodeName());
if (nNode.getNodeType() == Node.ELEMENT_NODE) {
Element eElement = (Element) nNode;
System.out.println("Student roll no : "
+ eElement.getAttribute("rollno"));
System.out.println("First Name : "
+ eElement
.getElementsByTagName("firstname")
.item(0)
.getTextContent());
System.out.println("Last Name : "
+ eElement
.getElementsByTagName("lastname")
.item(0)
.getTextContent());
System.out.println("Nick Name : "
+ eElement
.getElementsByTagName("nickname")
.item(0)
.getTextContent());
System.out.println("Marks : "
+ eElement
.getElementsByTagName("marks")
.item(0)
.getTextContent());
}
}
FileHandler handler = new FileHandler("logfile.log");
// Add to the desired logger
Logger logger = Logger.getLogger("");
logger.addHandler(handler);
System.out.println("Log Generated Successfully...!!!");
} catch (Exception e) {
System.out.println("Log Generation Failed..!!!");
e.printStackTrace();
}
}
}
Now I can see the xml files in output with out tags and i need its difference and the output should be exported as a log file.
Kindly help me to finish this and thanks in advance.

Above Solution Worked for me with a slight change :
private static List<String> compareXMLLineByLine(File file1, File file2) throws Exception {
List<String> list1 = Files.lines(Paths.get(file1.getPath())).collect(Collectors.toList());
List<String> list2 = Files.lines(Paths.get(file2.getPath())).collect(Collectors.toList());
return list1.stream().filter(s->!list2.contains(s)).peek(s -> System.out.println("mismatched value: " + s)).collect(Collectors.toList());
}

You can use java-diff-utils:
https://code.google.com/p/java-diff-utils/wiki/SampleUsage

Thank you.
My present code:
package compare5;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import org.custommonkey.xmlunit.DetailedDiff;
import org.custommonkey.xmlunit.Diff;
import org.custommonkey.xmlunit.Difference;
import org.xml.sax.SAXException;
public class ComparisonTest {
final static Logger logger = Logger.getLogger(ComparisonTest.class);
public static void main(String[] args) {
File f1 = new File("D:/reference.xml");
File f2= new File("D:/comparison.xml");
File f3= new File("D:/Activity log.log");
try {
//create a new file if it doesn't
f3.createNewFile();
} catch (IOException e) {
e.printStackTrace();
}
FileReader fr1 = null;
FileReader fr2 = null;
try {
fr1 = new FileReader(f1);
fr2 = new FileReader(f2);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
try {
Diff diff = new Diff(fr1, fr2);
System.out.println("Similar? " + diff.similar());
System.out.println("Identical? " + diff.identical());
DetailedDiff detDiff = new DetailedDiff(diff);
List differences = detDiff.getAllDifferences();
for (Object object : differences) {
Difference difference = (Difference)object;
System.out.println("***********************");
System.out.println(difference);
System.out.println("***********************");
}
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
ComparisonTest obj = new ComparisonTest();
obj.runMe("ila");
}
private void runMe(String parameter){
if(logger.isDebugEnabled()){
logger.debug("This is debug : " + parameter);
}
if(logger.isInfoEnabled()){
logger.info("This is info : " + parameter);
}
logger.warn("This is warn : " + parameter);
logger.error("This is error : " + parameter);
logger.fatal("This is fatal : " + parameter);
}
}

try this
private void compare(File file1, File file2) throws Exception {
List<String> list1 = Files.lines(Paths.get(file1.getPath())).collect(Collectors.toList());
List<String> list2 = Files.lines(Paths.get(file2.getPath())).collect(Collectors.toList());
list1.stream().filter(s -> !list2.contains(s)).peek(System.out::println).count() != 0;
list2.stream().filter(s -> !list1.contains(s)).peek(System.out::println).count() != 0;
}

Related

Concatenate Elements nodes consist of same state name into one element node

input XML file
<ContactD>
<addr>
<name>jack</name>
<street>south street</street>
<state>Tamilnadu</state>
<country>India</country>
<pin>621716</pin>
</addr>
<addr>
<name>Benjamin</name>
<street>north street</street>
<state>Tamilnadu</state>
<country>India</country>
<pin>621706</pin>
</addr>
<addr>
<name>Ryan</name>
<street>East street</street>
<state>Kerala</state>
<country>India</country>
<pin>67322</pin>
</addr>
</ContactD>
The output should like this:
<ContactD>
<addr>
<name>jack,Benjamin</name>
<street>south street,north street</street>
<state>Tamilnadu</state>
<country>India</country>
<pin>621716,621706</pin>
</addr>
<addr>
<name>Ryan</name>
<street>East street</street>
<state>Kerala</state>
<country>India</country>
<pin>67322</pin>
</addr>
</ContactD>
I tried using Java code I tried to match the state element after that I don't how to concatenate those two into one
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
//import com.sun.org.apache.xml.internal.security.utils.XPathFactory;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
public class Jparser {
private static final String FILENAME = "books.xml";
public static void main(String[] args) {
// Instantiate the Factory
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
try {
// optional, but recommended
// process XML securely, avoid attacks like XML External Entities (XXE)
dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
// parse XML file
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse(new File(FILENAME));
// optional, but recommended
// http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work
doc.getDocumentElement().normalize();
System.out.println("Root Element :" + doc.getDocumentElement().getNodeName());
System.out.println("------");
// get <staff>
NodeList list = doc.getElementsByTagName("addr");
String[] hell= new String[3];
for (int temp = 0; temp < list.getLength(); temp++) {
Node node = list.item(temp);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element element = (Element) node;
String name = element.getElementsByTagName("name").item(0).getTextContent();
String street = element.getElementsByTagName("street").item(0).getTextContent();
hell[temp]= element.getElementsByTagName("state").item(0).getTextContent();
String country = element.getElementsByTagName("country").item(0).getTextContent();
String pin = element.getElementsByTagName("pin").item(0).getTextContent();
System.out.println("Current Element :" + node.getNodeName());
System.out.println("name : " + name);
System.out.println("street : " + street);
System.out.println("state : " + hell[temp]);
System.out.println("country : " + country);
System.out.printf("pin :"+ pin);
}
}
for (int t = 0; t < list.getLength()-1; t++) {
if(hell[t].equals(hell[t+1])) {
/////here i need to concatenate the two element nodes which has same state in one xml data what to do here!!
System.out.println("same");
}
}
} catch (ParserConfigurationException | SAXException | IOException e) {
e.printStackTrace();
}
}
}
I got stuck when I tried to contact that two element node which I identified as the same into one...I finished up matching those state nodes...need help for concatenation!!
input
<ContactD>
<addr>
<name>jack</name>
<street>south street</street>
<state>Tamilnadu</state>
<country>India</country>
<pin>621716</pin>
</addr>
<addr>
<name>Benjamin</name>
<street>north street</street>
<state>Tamilnadu</state>
<country>India</country>
<pin>621706</pin>
</addr>
<addr>
<name>Ryan</name>
<street>East street</street>
<state>Kerala</state>
<country>India</country>
<pin>67322</pin>
</addr>
<addr>
<name>yan</name>
<street>East street</street>
<state>Kerala</state>
<country>India</country>
<pin>67322</pin>
</addr>
</ContactD>
Expected output :
<?xml version="1.0" encoding="UTF-8" standalone="no"?><ContactD>
<addr>
<name>jack,Benjamin</name>
<street>south street,north street</street>
<state>Tamilnadu</state>
<country>India</country>
<pin>621716,621706</pin>
</addr>
<addr>
<name>Ryan,yan</name>
<street>East street</street>
<state>Kerala</state>
<country>India</country>
<pin>67322</pin>
</addr>
</ContactD>
Javacode for this concatenation is below : (sorry for too many reduntant comments and uneven code)
import java.io.File;
// j a v a 2 s .co m
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
//import com.sun.org.apache.xml.internal.security.utils.XPathFactory;
import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import java.util.ArrayList;
import java.io.IOException;
public class Jparser {
private static final String FILENAME = "books.xml";
private static void toString(Document newDoc) throws Exception{
TransformerFactory tranFactory = TransformerFactory.newInstance();
Transformer aTransformer = tranFactory.newTransformer();
Source src = new DOMSource(newDoc);
Result dest = new StreamResult(new File("books.xml"));
aTransformer.transform(src, dest);
}
public static void main(String[] args) throws Exception {
// Instantiate the Factory
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
try {
// optional, but recommended
// process XML securely, avoid attacks like XML External Entities (XXE)
dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
// parse XML file
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse(new File(FILENAME));
// optional, but recommended
// http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work
doc.getDocumentElement().normalize();
System.out.println("Root Element :" + doc.getDocumentElement().getNodeName());
System.out.println("------");
// get <staff>
NodeList list = doc.getElementsByTagName("addr");
//String[] hell= new String[10];
ArrayList<String> hell = new ArrayList<String>();
ArrayList<Element> Ements = new ArrayList<Element>();
for (int temp = 0; temp < list.getLength(); temp++) {
Node node = list.item(temp);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element element = (Element) node;
Ements.add(element);
String name = element.getElementsByTagName("name").item(0).getTextContent();
String street = element.getElementsByTagName("street").item(0).getTextContent();
hell.add(element.getElementsByTagName("state").item(0).getTextContent());
String country = element.getElementsByTagName("country").item(0).getTextContent();
String pin = element.getElementsByTagName("pin").item(0).getTextContent();
System.out.println("Current Element :" + node.getNodeName());
System.out.println("name : " + name);
System.out.println("street : " + street);
System.out.println("state : " + hell.get(temp));
System.out.println("country : " + country);
System.out.printf("pin :"+ pin);
}
}
System.out.println("The size of the ArrayList is: " + Ements.size());
// System.out.println(Ements.get(0).name);
for (int t = 0; t < Ements.size()-1; t++) {
// for(int s=1;s<Ements.size()-1;s++) {
//for(int d=1; d<list.getLength()-1;d++) {
if(hell.get(t).equals(hell.get(t+1))) {
/////here i need to concatenate the two element nodes which has same state in one xml data what to do here!!
// int z=t;
String[] he= {"name","street","state","country","pin"};
for(int a=0; a<he.length; a++) {
//System.out.printf(Ements.get[a]);
String zz0=Ements.get(t).getElementsByTagName(he[a]).item(0).getTextContent();
String zz1=Ements.get(t+1).getElementsByTagName(he[a]).item(0).getTextContent();
if(!zz0.equals(zz1)){
Ements.get(t).getElementsByTagName(he[a]).item(0).setTextContent(zz0+","+zz1);
}
//String zz=Ements.get(t).getElementsByTagName(he[a]).item(0).getTextContent()+","+Ements.get(t+1).getElementsByTagName(he[a]).item(0).getTextContent();
// Ements.get(t).getElementsByTagName(he[a]).item(0).setTextContent(zz);
// System.out.println(Ements.get(t).getElementsByTagName(he[a]).item(0).getTextContent());
}
doc.getDocumentElement().removeChild(Ements.get(t+1));
Ements.remove(t+1);
hell.remove(t+1);
System.out.println(Ements.size());
}
}
System.out.println(Ements.size());
// DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
// DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
//Document newDoc = domBuilder.newDocument();
//Element rootElement = newDoc.createElement("parent");
// NodeList toremove=doc.getDocumentElement().getChildNodes();
//for(int i=0;i<toremove.getLength();i++) {
// doc.getDocumentElement().removeChild(toremove.item(i));
//}
//doc.getDocumentElement().appendChild(Ements.get(0));
//doc.getDocumentElement().appendChild(Ements.get(1));
toString(doc);
} catch (ParserConfigurationException | SAXException | IOException e) {
e.printStackTrace();
}
}
}

I got following error when I compile following program for xml to excel to conversion

Though I import all packages it can't work. I am new in java.
Exception in thread "main"
java.lang.NoClassDefFoundError:org/apache/commons/collections4/ListValuedMap
at demo.ReadAndPrintXMLFile.main(ReadAndPrintXMLFile.java:101) Caused
by:java.lang.ClassNotFoundException: org.apache.commons.collections4.
package demo;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.w3c.dom.*;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
public class ReadAndPrintXMLFile {
private static XSSFWorkbook workbook;
public static void main(String argv[]) {
ArrayList<String> firstNames = new ArrayList<String>();
ArrayList<String> lastNames = new ArrayList<String>();
ArrayList<String> ages = new ArrayList<String>();
try {
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
Document doc = docBuilder.parse(new File("D:/amit/book.xml"));
// normalize text representation
doc.getDocumentElement().normalize();
System.out.println("Root element of the doc is :\" "+ doc.getDocumentElement().getNodeName() + "\"");
NodeList listOfPersons = doc.getElementsByTagName("person");
int totalPersons = listOfPersons.getLength();
System.out.println("Total no of people : " + totalPersons);
for (int s = 0; s < listOfPersons.getLength(); s++)
{
Node firstPersonNode = listOfPersons.item(s);
if (firstPersonNode.getNodeType() == Node.ELEMENT_NODE)
{
Element firstPersonElement = (Element) firstPersonNode;
NodeList firstNameList = firstPersonElement.getElementsByTagName("first");
Element firstNameElement = (Element) firstNameList.item(0);
NodeList textFNList = firstNameElement.getChildNodes();
System.out.println("First Name : "+ ((Node) textFNList.item(0)).getNodeValue().trim());
firstNames.add(((Node) textFNList.item(0)).getNodeValue().trim());
NodeList lastNameList = firstPersonElement.getElementsByTagName("last");
Element lastNameElement = (Element) lastNameList.item(0);
NodeList textLNList = lastNameElement.getChildNodes();
System.out.println("Last Name : "+ ((Node) textLNList.item(0)).getNodeValue().trim());
lastNames.add(((Node) textLNList.item(0)).getNodeValue().trim());
NodeList ageList = firstPersonElement.getElementsByTagName("age");
Element ageElement = (Element) ageList.item(0);
NodeList textAgeList = ageElement.getChildNodes();
System.out.println("Age : "+ ((Node) textAgeList.item(0)).getNodeValue().trim());
ages.add(((Node) textAgeList.item(0)).getNodeValue().trim());
}// end of if clause
}// end of for loop with s var
for(String firstName:firstNames)
{
System.out.println("firstName : "+firstName);
}
for(String lastName:lastNames)
{
System.out.println("lastName : "+lastName);
}
for(String age:ages)
{
System.out.println("age : "+age);
}
}
catch (SAXParseException err)
{
System.out.println("** Parsing error" + ", line "+ err.getLineNumber() + ", uri " + err.getSystemId());
System.out.println(" " + err.getMessage());
}
catch (SAXException e)
{
Exception x = e.getException();
((x == null) ? e : x).printStackTrace();
}
catch (Throwable t)
{
t.printStackTrace();
}
workbook = new XSSFWorkbook();
XSSFSheet sheet = workbook.createSheet("Sample sheet");
Map<String, Object[]> data = new HashMap<String, Object[]>();
for(int i=0;i<firstNames.size();i++)
{
data.put(i+"",new Object[]{firstNames.get(i),lastNames.get(i),ages.get(i)});
}
Set<String> keyset = data.keySet();
int rownum = 0;
for (String key : keyset) {
Row row = sheet.createRow(rownum++);
Object[] objArr = data.get(key);
int cellnum = 0;
for (Object obj : objArr) {
Cell cell = row.createCell(cellnum++);
if (obj instanceof Date)
cell.setCellValue((Date) obj);
else if (obj instanceof Boolean)
cell.setCellValue((Boolean) obj);
else if (obj instanceof String)
cell.setCellValue((String) obj);
else if (obj instanceof Double)
cell.setCellValue((Double) obj);
}
}
try {
FileOutputStream out = new FileOutputStream(new File("D:/amit/book1.xlsx"));
workbook.write(out);
out.close();
System.out.println("Excel written successfully..");
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}// end of main
}
You are missing a dependency to org.apache.commons.collections4. This might be because of your other dependencies (which should have the collections4 dependency in theirs, but apparently there is still some sort of error).
You can try to add the JAR to the path, or if you use any build tools (Maven, Gradle, Ivy, etc.) you can add it to the dependencies list. Try and add this dependency: https://mvnrepository.com/artifact/org.apache.commons/commons-collections4/4.1

how to convert xml to avro without ignoring !CDATA content?

I have the following source XML file named customers.xml:
<?xml version="1.0" encoding="utf-8"?>
<p:CustomerElement xmlns:p="http://www.dog.com/customer" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:schemaLocation="http://www.dog.com/customer Customer.xsd">
<Customer>
<Sender>
<transmitDate>2016-02-21T00:00:00</transmitDate>
<transmitter>Dog ETL v2.0</transmitter>
<dealerCode><![CDATA[P020]]></dealerCode>
<DMSSystem><![CDATA[DBS]]></DMSSystem>
<DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
</Sender>
<Identifier>
<updateInd><![CDATA[A]]></updateInd>
<dealerCustNumber><![CDATA[AMADOR]]></dealerCustNumber>
<dealerCustName><![CDATA[AMADOR COMPUTERS]]></dealerCustName>
<phoneNumber><![CDATA[800 111 4444]]></phoneNumber>
<faxNumber><![CDATA[780 111 4444]]></faxNumber>
<email xsi:nil="true" />
<customerType><![CDATA[R]]></customerType>
<activeCustomerInd>false</activeCustomerInd>
<parentCustomerNumber xsi:nil="true" />
<primaryStoreNumber><![CDATA[00]]></primaryStoreNumber>
<preferredLanguage><![CDATA[ENG]]></preferredLanguage>
<dealerDateInSystem>2000-01-11T00:00:00</dealerDateInSystem>
<dealerLastUpdatedDate>2015-02-05T00:00:00</dealerLastUpdatedDate>
</Identifier>
<Location>
<address2><![CDATA[ACCOUNT FLAGGED FOR DELETION]]></address2>
<address3><![CDATA[AS PER BILL FEB AA/15]]></address3>
<city><![CDATA[CHICAGO]]></city>
<postalCode><![CDATA[Q5S 1E5]]></postalCode>
<state><![CDATA[AB]]></state>
<country><![CDATA[CA]]></country>
<location><![CDATA[FLAGGED FOR DELETION]]></location>
<addressType><![CDATA[M]]></addressType>
</Location>
<Division>
<divisionCode><![CDATA[G]]></divisionCode>
<divisionName><![CDATA[CAR]]></divisionName>
<IndustryCode>
<industryCode><![CDATA[AQ99]]></industryCode>
<primaryIndustryCodeInd>true</primaryIndustryCodeInd>
</IndustryCode>
<SalesRep>
<number><![CDATA[XXX]]></number>
<name><![CDATA[KILL ACCOUNT IN PROCESS]]></name>
<type><![CDATA[M]]></type>
<par>0</par>
<email xsi:nil="true" />
<phoneNumber><![CDATA[000 000 0000]]></phoneNumber>
</SalesRep>
</Division>
</Customer>
<Customer>
<Sender>
<transmitDate>2016-02-21T00:00:00</transmitDate>
<transmitter>Dog ETL v2.0</transmitter>
<dealerCode><![CDATA[P000]]></dealerCode>
<DMSSystem><![CDATA[DBS]]></DMSSystem>
<DMSReleaseNumber><![CDATA[5.0]]></DMSReleaseNumber>
</Sender>
<Identifier>
<updateInd><![CDATA[A]]></updateInd>
<dealerCustNumber><![CDATA[UU20888]]></dealerCustNumber>
<dealerCustName><![CDATA[ ADVERTISING AND PR]]></dealerCustName>
<phoneNumber xsi:nil="true" />
<faxNumber xsi:nil="true" />
<email xsi:nil="true" />
<customerType><![CDATA[I]]></customerType>
<activeCustomerInd>true</activeCustomerInd>
<parentCustomerNumber xsi:nil="true" />
<primaryStoreNumber><![CDATA[M2]]></primaryStoreNumber>
<preferredLanguage><![CDATA[ENG]]></preferredLanguage>
<dealerDateInSystem>2015-11-18T00:00:00</dealerDateInSystem>
<dealerLastUpdatedDate>2015-11-19T00:00:00</dealerLastUpdatedDate>
</Identifier>
<Location>
<address2><![CDATA[EQUIP]]></address2>
<city><![CDATA[ADER]]></city>
<country><![CDATA[CA]]></country>
<addressType><![CDATA[M]]></addressType>
</Location>
<Division>
<divisionCode><![CDATA[A]]></divisionCode>
<divisionName><![CDATA[AGRO]]></divisionName>
<IndustryCode>
<industryCode><![CDATA[EQ00]]></industryCode>
<primaryIndustryCodeInd>true</primaryIndustryCodeInd>
</IndustryCode>
</Division>
</Customer>
</p:CustomerElement>
I have the following java code, which parses customers.xml into individual "Customer" entities, and then attempts to convert each of them into an AVRO format:
package com.dogsoft.data.xmltoavro;
import java.io.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import jdk.nashorn.internal.runtime.regexp.joni.constants.NodeType;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.avro.Protocol;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
public class ParseXmlFile {
private static Protocol protocol;
public static void xmlToAvro(File xmlFile, File avroFile) throws IOException, SAXException {
try {
InputStream stream = new FileInputStream("/tmp/xml.avsc");
if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");
protocol = Protocol.parse(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
Schema schema = protocol.getType("Element");
Document doc = parse(xmlFile);
DatumWriter<GenericRecord> datumWriter = new SpecificDatumWriter<>(schema);
try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(datumWriter)) {
fileWriter.create(schema, avroFile);
Object docElement = doc.getDocumentElement();
fileWriter.append(wrapElement(doc.getDocumentElement()));
}
}
private static GenericData.Record wrapElement(Element el) {
GenericData.Record record = new GenericData.Record(protocol.getType("Element"));
record.put("name", el.getNodeName());
NamedNodeMap attributeNodes = el.getAttributes();
List<GenericData.Record> attrRecords = new ArrayList<>();
for (int i = 0; i < attributeNodes.getLength(); i++) {
Attr attr = (Attr) attributeNodes.item(i);
attrRecords.add(wrapAttr(attr));
}
record.put("attributes", attrRecords);
List<Object> childArray = new ArrayList<>();
NodeList childNodes = el.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
Node node = childNodes.item(i);
Object nt = node.getNodeType();
if (node.getNodeType() == Node.ELEMENT_NODE)
childArray.add(wrapElement((Element) node));
if (node.getNodeType() == Node.TEXT_NODE)
childArray.add(node.getTextContent());
}
record.put("children", childArray);
return record;
}
private static GenericData.Record wrapAttr(Attr attr) {
GenericData.Record record = new GenericData.Record(protocol.getType("Attribute"));
record.put("name", attr.getName());
record.put("value", attr.getValue());
return record;
}
private static Document parse(File file) throws IOException, SAXException {
try {
DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
return builder.parse(file);
} catch (ParserConfigurationException e) {
throw new RuntimeException(e);
}
}
public static void avroToXml(File avroFile, File xmlFile) throws IOException {
try {
InputStream stream = new FileInputStream("/tmp/xml.avsc");
if (stream == null) throw new IllegalStateException("Classpath should include xml.avsc");
protocol = Protocol.parse(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(protocol.getType("Element"));
DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(avroFile, datumReader);
GenericRecord record = dataFileReader.next();
Document doc;
try {
doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
} catch (ParserConfigurationException e) {
throw new RuntimeException(e);
}
Element el = unwrapElement(record, doc);
doc.appendChild(el);
saveDocument(doc, xmlFile);
}
private static Element unwrapElement(GenericRecord record, Document doc) {
String name = "" + record.get("name");
Element el = doc.createElement(name);
#SuppressWarnings("unchecked")
GenericArray<GenericRecord> attrArray = (GenericArray<GenericRecord>) record.get("attributes");
for (GenericRecord attrRecord : attrArray)
el.setAttributeNode(unwrapAttr(attrRecord, doc));
#SuppressWarnings("unchecked")
GenericArray<Object> childArray = (GenericArray<Object>) record.get("children");
for (Object childObj : childArray) {
if (childObj instanceof GenericRecord)
el.appendChild(unwrapElement((GenericRecord) childObj, doc));
if (childObj instanceof Utf8)
el.appendChild(doc.createTextNode("" + childObj));
}
return el;
}
private static Attr unwrapAttr(GenericRecord record, Document doc) {
Attr attr = doc.createAttribute("" + record.get("name"));
attr.setValue("" + record.get("value"));
return attr;
}
private static void saveDocument(Document doc, File file) {
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.transform(new DOMSource(doc), new StreamResult(file));
} catch (TransformerException e) {
throw new RuntimeException(e);
}
}
public static void main(String[] args)
{
Object nodeObject = null;
Node myNode = null;
Transformer transformer = null;
try
{
try {
transformer =
TransformerFactory.newInstance().newTransformer();
} catch (TransformerConfigurationException e) {
e.printStackTrace();
}
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setNamespaceAware(true);
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse("/tmp/customers.xml");
System.out.printf("Version = %s%n", doc.getXmlVersion());
System.out.printf("Encoding = %s%n", doc.getXmlEncoding());
System.out.printf("Standalone = %b%n%n", doc.getXmlStandalone());
if (doc.hasChildNodes())
{
int customerNumber = 0;
NodeList nl = doc.getDocumentElement().getChildNodes();
for (int i = 0; i < nl.getLength(); i++) {
Node node = nl.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
System.out.println(node.toString());
customerNumber++;
File avroFile = new File("/tmp/customer" + customerNumber + ".avro");
File xmlFile = new File("/tmp/customer" + customerNumber + ".xml");
File xmlFile1 = new File("/tmp/customer" + customerNumber + "-foo.xml");
try {
transformer.transform(
new DOMSource(node), new StreamResult(xmlFile));
File outputFile = new File("/tmp/customer" + customerNumber + ".avro");
xmlToAvro(xmlFile, outputFile);
} catch (TransformerException e) {
e.printStackTrace();
}
}
}
}
}
catch (IOException ioe)
{
System.err.println("IOE: " + ioe);
}
catch (SAXException saxe)
{
System.err.println("SAXE: " + saxe);
}
catch (FactoryConfigurationError fce)
{
System.err.println("FCE: " + fce);
}
catch (ParserConfigurationException pce)
{
System.err.println("PCE: " + pce);
}
}
}
This code works overall, but it ignores any content, which is enclosed into
![CDATA[
tag. As it happens, most of the actual useful data in the customers.xml files is enclosed into these tags.
Is there a way to modify this code, to make it not ignore the CDATA contents?
Instead of hand-writing parser code, you might want to split the problem in two parts: first, bind XML into POJO (using JAXB or Jackson XML module); and then write POJO as Avro (using Apache Avro lib, or Jackson Avro module). All you need for that would be POJO definition that matches expected structure for data as XML and Avro. Result should be less code, and basically specifying what needs to happen and now how to do it.

splitting Sitemap into more sitemaps if it has more than maxnumber of urls

I would like to split my Sitemap into Sitemaps, if it has more than maxURLs. The following example should split the Sitemap, if it has more than one url.
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.CharacterData;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
public class SiteMapSplitter {
public static void main(String[] args){
String sitemapStr = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n" +
"<url>\n" +
"<loc>test1.html</loc>\n" +
"<lastmod>today</lastmod>\n" +
"<changefreq>daily</changefreq>\n" +
"<priority>1.0</priority>\n" +
"</url>\n" +
"<url>\n" +
"<loc>test2.html</loc>\n" +
"<lastmod>yesterday</lastmod>\n" +
"<changefreq>daily</changefreq>\n" +
"<priority>1.0</priority>\n" +
"</url></urlset>";
try {
splitSitemap(sitemapStr);
} catch (ParserConfigurationException e) {
e.printStackTrace();
}
}
static private void splitSitemap(String sitemapStr) throws ParserConfigurationException {
DocumentBuilder db = null;
try {
db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
} catch (ParserConfigurationException e) {
e.printStackTrace();
}
InputSource is = new InputSource();
is.setCharacterStream(new StringReader(sitemapStr));
Document doc = null;
try {
doc = db.parse(is);
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
NodeList nodes = doc.getElementsByTagName("url");
int maxURLs = 1;
Set<String> smURLsSet= new HashSet<String>();
if (nodes.getLength()>maxURLs){
for (int i = 0; i < nodes.getLength(); i++) {
StringBuilder smURLsBuilder = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");
for (int k = 0; k<maxURLs; k++){
Element element = (Element) nodes.item(i);
smURLsBuilder.append(element);
}
smURLsSet.add(smURLsBuilder.toString());
}
Iterator i = smURLsSet.iterator();
while(i.hasNext()){
System.out.println(i.next());
}
}
}
}
The problem is that Element element = (Element) nodes.item(i); smURLsBuilder.append(element);
does not append the whole element (in this case the url and its childreen) to the smURLsBuilder. How to do this?
You should consider using an object oriented approach to the sitemap. Either with data binding (JAXB) or even shorter using data projection (Disclosure: I'm affiliated with that project). This way you do not need to create the XML by string concatenation.
public class SitemapSplitter {
static String sitemapStr = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n" +
"<url>\n" +
"<loc>test1.html</loc>\n" +
"<lastmod>today</lastmod>\n" +
"<changefreq>daily</changefreq>\n" +
"<priority>1.0</priority>\n" +
"</url>\n" +
"<url>\n" +
"<loc>test2.html</loc>\n" +
"<lastmod>yesterday</lastmod>\n" +
"<changefreq>daily</changefreq>\n" +
"<priority>1.0</priority>\n" +
"</url></urlset>";
public interface Sitemap {
#XBWrite("/urlset/url")
Sitemap setUrls(List<? extends Node> urls);
}
public static void main(String... args) {
XBProjector projector = new XBProjector(Flags.TO_STRING_RENDERS_XML);
// Get all urls from existing sitemap.
List<Node> urlNodes = projector.onXMLString(sitemapStr).evalXPath("/xbdefaultns:urlset/xbdefaultns:url").asListOf(Node.class);
for (Node urlNode: urlNodes) {
// Create a new sitemap, here with only one url
Sitemap newSitemap = projector.onXMLString(sitemapStr).createProjection(Sitemap.class).setUrls(Collections.singletonList(urlNode));
System.out.println(newSitemap);
}
}
}
This program prints out
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>test1.html</loc>
<lastmod>today</lastmod>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
</urlset>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>test2.html</loc>
<lastmod>yesterday</lastmod>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
</urlset>

How to embed xml file into java package and access it?

I have a XML file with data that is used in both my C# and Java version of a library.
Ideally I want to embed this XML file in a package in that library.
I only need to access it from within my library, so I was wondering: is that possible?
In Java, you could include the XML file itself in the JAR file. You can then use something like this:
InputStream istream = getClass().getResourceAsStream("/resource/path/to/some.xml");
And parse your InputStream as normal.
The above getResourceAsStream() looks in the current classpath, which would include the contents of any JAR files.
book.xml
<book>
<person>
<first>Kiran</first>
<last>Pai</last>
<age>22</age>
</person>
<person>
<first>Bill</first>
<last>Gates</last>
<age>46</age>
</person>
<person>
<first>Steve</first>
<last>Jobs</last>
<age>40</age>
</person>
<person>
<first>kunal</first>
<last>kumar</last>
<age>25</age>
</person>
</book>
create a xml file book.xml
made a jar file book.xml.jar and
palce it in war/web-inf/lib folder of your project..
then it will work..
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import javax.servlet.http.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
#SuppressWarnings("serial")
public class XMLParser extends HttpServlet {
InputStream istream =getClass().getResourceAsStream("/book.xml");
public void doGet(HttpServletRequest req, HttpServletResponse resp)throws IOException
{
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = null;
try {
docBuilder = docBuilderFactory.newDocumentBuilder();
} catch (ParserConfigurationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Document doc = null;
try {
doc = docBuilder.parse (istream);
} catch (SAXException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// normalize text representation
doc.getDocumentElement ().normalize ();
System.out.println ("Root element of the doc is " +
doc.getDocumentElement().getNodeName());
NodeList listOfPersons = doc.getElementsByTagName("person");
int totalPersons = listOfPersons.getLength();
System.out.println("Total no of people : " + totalPersons);
for(int s=0; s<listOfPersons.getLength() ; s++){
Node firstPersonNode = listOfPersons.item(s);
if(firstPersonNode.getNodeType() == Node.ELEMENT_NODE){
Element firstPersonElement = (Element)firstPersonNode;
//-------
NodeList firstNameList = firstPersonElement.getElementsByTagName("first");
Element firstNameElement = (Element)firstNameList.item(0);
NodeList textFNList = firstNameElement.getChildNodes();
System.out.println("First Name : " +
((Node)textFNList.item(0)).getNodeValue().trim());
//-------
NodeList lastNameList = firstPersonElement.getElementsByTagName("last");
Element lastNameElement = (Element)lastNameList.item(0);
NodeList textLNList = lastNameElement.getChildNodes();
System.out.println("Last Name : " +
((Node)textLNList.item(0)).getNodeValue().trim());
//----
NodeList ageList = firstPersonElement.getElementsByTagName("age");
Element ageElement = (Element)ageList.item(0);
NodeList textAgeList = ageElement.getChildNodes();
System.out.println("Age : " +
((Node)textAgeList.item(0)).getNodeValue().trim());
//------
}//end of if clause
}//end of for loop with s var
//System.exit (0);
}//end of main
}

Categories

Resources