Conditional XML Parsing using Java

Conditional XML Parsing using Java - java

I am looking for a suitable parser to parse through the given XML. I want to parse through the full XML only when tag - 'employee', attribute - 'validated=false' else stop parsing. How we can perform this conditional XML parsing using SAX, STAX or any other parsers ?
<?xml version="1.0"?>
<database>
<employee validated="False">
<name>Lars </name>
<street validated="False"> Test </street>
<telephone number= "0123"/>
</employee>
<employee validated="True">
<name>Baegs </name>
<street validated="True"> Test </street>
<telephone number= "0123"/>
</employee>
</database>
I have tried the below SAX parser code
List<XmlObjects> xmlObjects;
String espXmlFileName;
String tmpValue;
XmlObjects xmlObjectsTmp;
public SaxParser(String espXmlFileName) {
this.espXmlFileName = espXmlFileName;
xmlObjects = new ArrayList<XmlObjects>();
parseDocument();
printDatas();
}
private void printDatas() {
for (XmlObjects tmpB : xmlObjects) {
System.out.println(tmpB.toString());
}
}
private void parseDocument() {
SAXParserFactory factory = SAXParserFactory.newInstance();
try {
SAXParser parser = factory.newSAXParser();
parser.parse(espXmlFileName, this);
} catch (ParserConfigurationException e) {
System.out.println("ParserConfig error");
} catch (SAXException e) {
System.out.println("SAXException : xml not well formed");
} catch (IOException e) {
System.out.println("IO error");
}
}
public void startElement(String uri, String localName, String qName,
org.xml.sax.Attributes attributes) throws SAXException {
if (qName.equalsIgnoreCase("employee")) {
String value1 = attributes.getValue("validated");
if (value1.equalsIgnoreCase("FALSE")) {
if (qName.equalsIgnoreCase("name")) {
String value2 = attributes.getValue("validated");
xmlObjectsTmp.setName(attributes
.getValue("name"));
}
}
public void endElement(String uri, String localName, String qName)
throws SAXException {
if (qName.equalsIgnoreCase("employee")) {
xmlObjects.add(xmlObjectsTmp);
}
if (qName.equalsIgnoreCase("name")) {
xmlObjectsTmp.setName(tmpValue);
}
}
public static void main(String argv[]) {
new SaxParser("C:\\xml\\xml2.xml");
}

In the startElement method of your ContentHandler you can simply throw a SAXException to abort parsing when your validated attribute has the value True.
For example:
#Override
public void startElement(final String uri, final String localName,
final String qName, final Attributes attributes) throws SAXException {
if(localName.equalsIgnoreCase("employee") || localName.equalsIgnoreCase("street")) {
final String validated = attributes.getValue("validated");
if(validated != null && !validated.equals("False")) {
throw new SAXException(localName + " has already been validated");
} else {
//your processing logic here
}
}
}
You can register an ErrorHandler to deal with the error in your own way if you wish.

Related

Parse value containing special character "/" gives wrong output using SAX parser

I am have below xml structure
<fs:AsReportedItem>
<fs:BookMark>/BODY[1]/DIV[3135]/DIV[0]/TABLE[0]/TBODY[0]/TR[32]/TD[5]/DIV[0]/FONT[0]/substr(1,2)
</fs:BookMark>
</fs:AsReportedItem>
I am parsing using SAX and reading tax value in the endElement() method
Here is my sample code
private void parseDocument() {
// parse
SAXParserFactory factory = SAXParserFactory.newInstance();
try {
SAXParser parser = factory.newSAXParser();
parser.parse(FileName, this);
} catch (ParserConfigurationException e) {
System.out.println("ParserConfig error");
} catch (SAXException e) {
System.out.println("SAXException : xml not well formed");
} catch (IOException e) {
System.out.println("IO error");
}
}
public void startElement(String s, String s1, String elementName, Attributes attributes) throws SAXException {
if (OrgDataPartitonObj != null && "fs:FinancialStatementLineItemDataItem".equals(OrgDataPartitonObj.getType())) {
FinancialStatementLineItemParser.startFinancialStatementLineItemParser(OrgDataPartitonObj,financialStatementLineItemObj, elementName, attributes);
}
}
public void endElement(String s, String s1, String element) throws SAXException {
if (OrgDataPartitonObj != null && "fs:FinancialStatementLineItemDataItem".equals(OrgDataPartitonObj.getType())) {
FinancialStatementLineItemParser.getEndElementFinancialStatementLineItemParser(financialStatementLineItemObj, element, tmpValue);
}
}
public static void getEndElementFinancialStatementLineItemParser(FinancialStatementLineItem financialStatementLineItemObj, String element, String tmpValue) {
if (element.equals("fs:BookMark")) {
financialStatementLineItemObj.setBookMark(tmpValue);
}
}
#Override
public void characters(char[] buffer, int start, int length) {
tmpValue = new String(buffer, start, length);
}
When i debug then i can see only this value /substr(1,2) all value with "/" is escaped
I dont know why i am not getting full value /BODY[1]/DIV[3135]/DIV[0]/TABLE[0]/TBODY[0]/TR[32]/TD[5]/DIV[0]/FONT[0]/substr(1,2)
If any escape character is used then where i have to use .

Here the sourcecode for a DefaultHandler which collects the text:
private static DefaultHandler getHandler() {
return new DefaultHandler() {
String text;
#Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
if ("BookMark".equals(qName)) {
text = "";
}
}
#Override
public void characters(char[] ch, int start, int length) throws SAXException {
text += new String(ch).trim();
}
#Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if ("BookMark".equals(qName)) {
System.out.println("endElement: " + text);
}
}
};
}

You need to change your characters()-method to
#Override
public void characters(char[] buffer, int start, int length) {
tmpValue += new String(buffer, start, length);
}
and you must reset tmpValue in the startElement()-method to "".

Just change character() method
#Override
public void characters(char[] buffer, int start, int length) {
tmpValue += new String(buffer, start, length);
}
And add this at last line in the endElement method .
public void endElement(String s, String s1, String element) throws SAXException {
if (OrgDataPartitonObj != null && "fs:FinancialStatementLineItemDataItem".equals(OrgDataPartitonObj.getType())) {
FinancialStatementLineItemParser.getEndElementFinancialStatementLineItemParser(financialStatementLineItemObj, element, tmpValue);
}
tmpValue="";
}

Parsing a big xml file Java

I have big xml files (~1GB) with this structure:
<?xml version="1.0" encoding="UTF-8"?>
<GenoExchange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.ncbi.nlm.nih.gov/SNP/geno" xsi:schemaLocation="http://www.ncbi.nlm.nih.gov/SNP/geno ftp://ftp.ncbi.nlm.nih.gov/snp/specs/genoex_1_5.xsd" dbSNPBuildNo="146" reportId="MT" reportType="chromosome">
<Population popId="638" handle="TSC-CSHL" locPopId="TSC_42_AA">
<popClass self="NORTH AMERICA"/>
</Population>
<SnpInfo rsId="1041870" observed="C/T">
<SnpLoc genomicAssembly="107:GRCh38.p2" geneId="4512" geneSymbol="COX1" chrom="MT" start="6150" locType="2" rsOrientToChrom="fwd" contigAllele="T" contig="NC_012920:1"/>
<SsInfo ssId="1508548" locSnpId="TSC0349089" ssOrientToRs="fwd">
<ByPop popId="1303" sampleSize="184">
<AlleleFreq allele="T" freq="1"/>
<AlleleFreq allele="C" freq="0"/>
</ByPop>
</SsInfo>
</SnpInfo>
<SnpInfo rsId="1029293" observed="C/T">
<SnpLoc genomicAssembly="107:GRCh38.p2" geneId="4512" geneSymbol="COX1" chrom="MT" start="6307" locType="2" rsOrientToChrom="fwd" contigAllele="C" contig="NC_012920:1"/>
<SsInfo ssId="1494519" locSnpId="TSC0254145" ssOrientToRs="fwd">
<ByPop popId="639" sampleSize="82">
<AlleleFreq allele="T" freq="0"/>
<AlleleFreq allele="C" freq="1"/>
</ByPop>
<ByPop popId="1303" sampleSize="184">
<AlleleFreq allele="T" freq="0"/>
<AlleleFreq allele="C" freq="1"/>
</ByPop>
</SsInfo>
</SnpInfo>
I want to find a specific rsID, for example rsID="1029293" and extract all the information inside that node. I don't want to run all the file. I only want to find that ID, extract that information and end the iteration.
From what I read it's better if I use SAX or Stax parsers. I'm using SAX, this is my code:
class UserHandler extends DefaultHandler {
String rsID = null;
String i = "1029293";
#Override
public void startElement(String uri,
String localName, String qName, Attributes attributes) throws SAXException {
if (qName.equalsIgnoreCase("SnpInfo")) {
rsID = attributes.getValue("rsId");
//System.out.println("value: " + rsID);
}
if((i).equals(rsID) &&
qName.equalsIgnoreCase("SnpInfo")){
System.out.println("Start Element: " + qName + " " + rsID);
}
if ((i).equals(rsID) && qName.equalsIgnoreCase("SsInfo")) {
String a = attributes.getValue("ssId");
System.out.println("SSID: " + a);
}
if ((i).equals(rsID) && qName.equalsIgnoreCase("ByPop")) {
String p = attributes.getValue("popId");
System.out.println("POPID: " + p);
}
if ((i).equals(rsID) && qName.equalsIgnoreCase("AlleleFreq")) {
String p = attributes.getValue("allele");
String f = attributes.getValue("freq");
System.out.println("ALLELE: " + p + " FREQ: " + f);
}
if ((i).equals(rsID) && qName.equalsIgnoreCase("GTypeFreq")) {
String p = attributes.getValue("gtype");
String f = attributes.getValue("freq");
System.out.println("GTYPE: " + p + " FREQ: " + f);
}
}
#Override
public void endElement(String uri,
String localName, String qName) throws SAXException {
if (qName.equalsIgnoreCase("SnpInfo")) {
if((i).equals(rsID)
&& qName.equalsIgnoreCase("SnpInfo"))
System.out.println("End Element: " + qName);
}
}
}
public class XMLParser {
public static void main(String argv[]) {
try {
InputStream fileStream = new FileInputStream("/home/xml/gt_chr10.xml.gz");
InputStream gzipStream = new GZIPInputStream(fileStream);
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
UserHandler userhandler = new UserHandler();
saxParser.parse(gzipStream, userhandler);
} catch (Exception e) {
e.printStackTrace();
}
}
My problem is that my code searches the whole file for the ID and that takes more than 2 minutes each time. I can't have a code that takes so long.
Is there a better approach for this?

Using STAX gives you more control when parsing XML, since you actively pull elements from the stream. This way you can pull the next event, handle it and once you found your data, simply terminate the loop (using a flag or even a return statement if you must)
InputStream in = ...
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(in);
boolean found = false;
while (!found && eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
// your logic here
// once you found your element, you can terminate the loop
found = true;
break;
case XMLStreamConstants.END_ELEMENT:
// your logic here
break;
}
}
(omitted exception and resource handling for brevity)
On a side note, you will gain some performance by combining your if ((i).equals(rsID) && ... into a single one, with detail checks in nested ifs
if ((i).equals(rsID)) {
if(qName.equalsIgnoreCase("GTypeFreq")) {
...
}
}

You can throw an exception in your end element handler, to indicate to the parser that it aborts parsing (http://www.ibm.com/developerworks/library/x-tipsaxstop/):
#Override
public void endElement(String uri,
String localName, String qName) throws SAXException {
if (qName.equalsIgnoreCase("SnpInfo")) {
if((i).equals(rsID)
&& qName.equalsIgnoreCase("SnpInfo"))
System.out.println("End Element: " + qName);
throw SAXException("Element found.");
}
}

The only way to avoid parsing the whole file every time you run this is to put the data in an XML database. Parsing a 1Gb file is going to take about a minute, plus or minus depending on the speed of your machine and what processing you do on each node.
A streamed XSLT 3.0 solution is simply:
<xsl:transform version="3.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xpath-default-namespace="http://www.ncbi.nlm.nih.gov/SNP/geno">
<xsl:template name="xsl:initial-template">
<xsl:stream href="input.xml">
<xsl:copy-of select="/GenoExchange/SnpInfo[#rsId='1041870'][1]"/>
</xsl:stream>
</xsl:template>
</xsl:transform>
No need to write all that pesky SAX or StAX code.
I put the "[1]" predicate in to allow the processor to abandon the search when it has found the first hit.

The best approach is to use vtd-xml and xpath... 1GB xml file takes about 1.5GB heap space and < 10 sec in a 3~4 year old intel processor.see code example below.. One more thing, if you want to eliminate parsing entirely, you can create a vtd+XML file format so any subsequent query can directly access the vtd index portion, which could easily triple or quadruple your app performance...
import com.ximpleware.*;
public class simpleXpathSearch{
public static void main(String s[]) throws VTDException,java.io.UnsupportedEncodingException,java.io.IOException{
VTDGen vg = new VTDGen();
vg.setLCLevel(5);
if (!vg.parseFile("input.xml", false))
return;
VTDNav vn = vg.getNav();
AutoPilot ap = new AutoPilot(vn);
ap.selectXPath("/*/*[#rsID='1029293']");
int i=0;
while((i=ap.evalXPath())!=-1){
// your code logic here
}

//Main class
public static void main(String[] args) {
SAXReader.read();
}
//SAXReader
public static void read(){
try {
XMLReader processor = XMLReaderFactory.createXMLReader();
processor.setContentHandler(new SAXController());
processor.parse(new InputSource("MyXML.xml"));
} catch (SAXException | IOException e) {
System.err.println(e.getMessage());
}
}
//SAXController
// The SAXController extends DefaultHandler
private int tab = 0;
private void tabulation() {
for (int i=0; i<tab; i++)
System.out.print(" ");
}
#Override
public void startDocument() {
tabulation();
System.out.println("Starting XML Document");
tab++;
}
#Override
public void endDocument() {
tab--;
tabulation();
System.out.println("Ending XML Document");
}
#Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
tabulation();
System.out.print(localName);
if (attributes.getLength()>0) {
for (int i=0; i<attributes.getLength(); i++) {
System.out.print(attributes.getLocalName(i)+": "+attributes.getValue(i));
}
}
System.out.println();
tab++;
}
#Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
tab--;
tabulation();
System.out.println(localName);
}
#Override
public void characters(char[] ch, int start, int length)
throws SAXException {
String content= new String(ch, start, length);
content= content.replaceAll("[\t\n]", "").trim();
if (!content.equals("")) {
tabulation();
System.out.println(content);
}
}

Android: get HTML text from XML

I have implemented in my app reading a XML. It works fine. But I want to format the text. I've tried in the XML:
<monumento>
<horarios><b>L-V:</b> 10 a 20<br/>S-D: 11 a 15</horarios>
<tarifas>4000</tarifas>
</monumento>
But the only thing I get if I put HTML character is that the text does not display in my app.
I'll have many xml so that I will not always know where to place <b>, <br/>...
How I can do?
Main
StringBuilder builder = new StringBuilder();
for (HorariosTarifasObj post : helper.posts) {
builder.append(post.getHorarios());
}
horario2.setText(builder.toString());
builder = new StringBuilder();
for (HorariosTarifasObj post : helper.posts) {
builder.append(post.getTarifas());
}
tarifa2.setText(builder.toString());
XMLReader
public void get() {
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser parser = factory.newSAXParser();
XMLReader reader = parser.getXMLReader();
reader.setContentHandler(this);
InputStream inputStream = new URL(URL + monumento + ".xml").openStream();
reader.parse(new InputSource(inputStream));
} catch (Exception e) {
}
}
#Override
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
currTag = true;
currTagVal = "";
if (localName.equals("monumento")) {
post = new HorariosTarifasObj();
}
}
#Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
currTag = false;
if(localName.equalsIgnoreCase("horarios")) {
post.setHorarios(currTagVal);
} else if(localName.equalsIgnoreCase("tarifas")) {
post.setTarifas(currTagVal);
} else if (localName.equalsIgnoreCase("monumento")) {
posts.add(post);
}
}
#Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if (currTag) {
currTagVal = currTagVal + new String(ch, start, length);
currTag = false;
}
}

Try CDATA:
<monumento>
<horarios><![CDATA[<b>L-V:</b> 10 a 20<br/>S-D: 11 a 15]]></horarios>
<tarifas>4000</tarifas>
</monumento>

In XML, < and > characters are reserved for XML tags. You will need to protect them by replacing them with special encoding characters.
You can use > for > and < for <
(edit) Eomm answer is right, CDATA does this as well, and more simple
Also, to use HTML coding in TextView, you will need to use Html.fromHtml() method
For instance :
tarifa2.setText(Html.fromHtml(builder.toString()));

Error in output of a simple SAX parser

I saw an example in mykong - http://www.mkyong.com/java/how-to-read-xml-file-in-java-sax-parser/
I tried to make it work for the xml file (below) by making the following modifications to the code in the above page -
1 - Have only two if blocks in startElement() and characters() methods.
2 - Change the print statements in above methods, ie
FIRSTNAME and First Name = passenger id
LASTNAME and Last Name = name
The problem is - In the output, I see the word passenger instead of the value of passenger id. How do i fix that ?
<?xml version="1.0" encoding="utf-8"?>
<root xmlns:android="www.google.com">
<passenger id="001">
<name>Tom Cruise</name>
</passenger>
<passenger id="002">
<name>Tom Hanks</name>
</passenger>
</root>
Java Code
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class ReadXMLFileSAX{
public static void main(String argv[]) {
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
DefaultHandler handler = new DefaultHandler() {
boolean bfname = false;
boolean blname = false;
public void startElement(String uri, String localName,String qName,
Attributes attributes) throws SAXException {
System.out.println("Start Element :" + qName);
if (qName.equalsIgnoreCase("passenger id")) {
bfname = true;
}
if (qName.equalsIgnoreCase("name")) {
blname = true;
}
}
public void endElement(String uri, String localName,
String qName) throws SAXException {
System.out.println("End Element :" + qName);
}
public void characters(char ch[], int start, int length) throws SAXException {
if (bfname) {
System.out.println("passenger id : " + new String(ch, start, length));
bfname = false;
}
if (blname) {
System.out.println("name : " + new String(ch, start, length));
blname = false;
}
}
};
saxParser.parse("c:\\flight.xml", handler);
} catch (Exception e) {
e.printStackTrace();
}
}
}

In the startElement, when it's for "passenger", Attributes argument you get will have that value.
public void startElement(String uri, String localName,
String qName, Attributes attributes)
throws SAXException {
if (qName.equalsIgnoreCase("passenger") && attributes != null){
System.out.println(attributes.getValue("id"));
}
}

Parsing XML with TagSoup : bug with long attributes?

I'm trying to parse ugly HTML with TagSoup to extract value of a given tag.
Here is the tag :
<input type="hidden" name="hash_check" value="ffc39410ed8da309408a9382450ddc85" />
I want to retrieve value of attribute "value" ("ffc39410ed8da309408a9382450ddc85")
And here is my code, in my SAX handler :
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException
{
if (localName.equals("input"))
{
Log.v(TAG, Integer.toString(atts.getLength()));
if (atts.getValue("name").equals("hash_check")
{
in_input = true;
Log.v(TAG, atts.getValue("name"));
if (atts.getValue("value") != null)
Log.v(TAG,atts.getValue("value");
}
}
}
Logs are here for debugging purposes. Logcat correctly gives me "hash_check" for atts.getValue("name"), but an empty string for atts.getValue("value") although the parser is positionned to the right "input" (the one and only of my html document).
What's wrong ? Bug in TagSoup ?
Thanks
edit #bkail : thank you for your comment. Here are more details and code.
First, the URL that I'm trying to parse : http://forum.hardware.fr/hfr/Programmation/Divers-6/experts-puissant-internet-sujet_37483_1.htm
And the code used to instanciate the parser :
private static final String FORUM_URI = "http://forum.hardware.fr/hfr/Programmation/Divers-6/experts-puissant-internet-sujet_37483_1.htm";
URL hfrUrl = new URL(FORUM_URI);
Parser parser = new Parser();
HfrSAXHandler sh = new HfrSAXHandler();
parser.setContentHandler(sh);
parser.parse(new InputSource(hfrUrl.openStream()));
And finally, the whole code for my SAX parser :
public class HfrSAXHandler extends DefaultHandler
{
private boolean in_input = false;
private static final String TAG = "hfr4droid";
#Override
public void startDocument() throws SAXException
{
Log.v(TAG, "start of parsing");
}
#Override
public void endDocument() throws SAXException
{
}
#Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException
{
if (localName.equals("input"))
{
Log.v(TAG, Integer.toString(atts.getLength()));
if (atts.getValue("name") != null)
{
in_input = true;
Log.v(TAG, atts.getValue("name"));
if (atts.getValue("value") != null)
Log.v(TAG, Integer.toString(atts.getValue("value")));
}
}
}
#Override
public void endElement(String namespaceURI, String localName, String qName) throws SAXException
{
if (localName.equals("input"))
in_input = false;
}
}
Thanks for giving it a try.

Using Integer.toString() is the problem. Change this:
Log.v(TAG, Integer.toString(atts.getValue("value")));
to this:
Log.v(TAG, atts.getValue("value") );

Develop Reference

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Conditional XML Parsing using Java - java

Related

Parse value containing special character "/" gives wrong output using SAX parser

Parsing a big xml file Java

Android: get HTML text from XML

Error in output of a simple SAX parser

Parsing XML with TagSoup : bug with long attributes?

Categories

Resources