Jsoup WhiteList to allow comments

Jsoup WhiteList to allow comments - java

I am using jsoup 1.7.3 with Whitelist custom configuration.
Apparently it sanitizes all the HTML comments (<!-- ... -->) inside the document.
It also sanitizes the <!DOCTYPE ...> element.
How can I get jsoup Whitelist to allow comments as is?
How can I define the !DOCTYPE element as allowed element with any attribute?

This is not possible with the standard jsoup classes, and it does not depend on the Whitelist: the behaviour comes from org.jsoup.safety.Cleaner. The cleaner uses a node traverser that keeps only elements and text nodes. Also, only the body is processed, so the head and the doctype are ignored completely. To achieve this you'll have to create a custom cleaner. For example, if you have HTML like
<!DOCTYPE html>
<html>
<head>
<!-- This is a script -->
<script type="text/javascript">
function newFun() {
alert(1);
}
</script>
</head>
<body>
<map name="diagram_map">
<area id="area1" />
<area id="area2" />
</map>
<!-- This is another comment. -->
<div>Test</div>
</body>
</html>
You will first create a custom cleaner by copying the original one. Please note that the package should be org.jsoup.safety, as the cleaner uses some of the protected methods of Whitelist. Also, there is no point in extending Cleaner, as almost all of its methods are private and the inner node traverser is final.
package org.jsoup.safety;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
public class CustomCleaner {
private Whitelist whitelist;
public CustomCleaner(Whitelist whitelist) {
Validate.notNull(whitelist);
this.whitelist = whitelist;
}
public Document clean(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
Document clean = Document.createShell(dirtyDocument.baseUri());
copyDocType(dirtyDocument, clean);
if (dirtyDocument.head() != null)
copySafeNodes(dirtyDocument.head(), clean.head());
if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
copySafeNodes(dirtyDocument.body(), clean.body());
return clean;
}
private void copyDocType(Document dirtyDocument, Document clean) {
dirtyDocument.traverse(new NodeVisitor() {
public void head(Node node, int depth) {
if (node instanceof DocumentType) {
clean.prependChild(node);
}
}
public void tail(Node node, int depth) { }
});
}
public boolean isValid(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
Document clean = Document.createShell(dirtyDocument.baseUri());
int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
return numDiscarded == 0;
}
private final class CleaningVisitor implements NodeVisitor {
private int numDiscarded = 0;
private final Element root;
private Element destination; // current element to append nodes to
private CleaningVisitor(Element root, Element destination) {
this.root = root;
this.destination = destination;
}
public void head(Node source, int depth) {
if (source instanceof Element) {
Element sourceEl = (Element) source;
if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
ElementMeta meta = createSafeElement(sourceEl);
Element destChild = meta.el;
destination.appendChild(destChild);
numDiscarded += meta.numAttribsDiscarded;
destination = destChild;
} else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
numDiscarded++;
}
} else if (source instanceof TextNode) {
TextNode sourceText = (TextNode) source;
TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
destination.appendChild(destText);
} else if (source instanceof Comment) {
Comment sourceComment = (Comment) source;
Comment destComment = new Comment(sourceComment.getData(), source.baseUri());
destination.appendChild(destComment);
} else if (source instanceof DataNode) {
DataNode sourceData = (DataNode) source;
DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri());
destination.appendChild(destData);
} else { // else, we don't care about comments, xml proc instructions, etc
numDiscarded++;
}
}
public void tail(Node source, int depth) {
if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
destination = destination.parent(); // would have descended, so pop destination stack
}
}
}
private int copySafeNodes(Element source, Element dest) {
CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
traversor.traverse(source);
return cleaningVisitor.numDiscarded;
}
private ElementMeta createSafeElement(Element sourceEl) {
String sourceTag = sourceEl.tagName();
Attributes destAttrs = new Attributes();
Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
int numDiscarded = 0;
Attributes sourceAttrs = sourceEl.attributes();
for (Attribute sourceAttr : sourceAttrs) {
if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
destAttrs.put(sourceAttr);
else
numDiscarded++;
}
Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
destAttrs.addAll(enforcedAttrs);
return new ElementMeta(dest, numDiscarded);
}
private static class ElementMeta {
Element el;
int numAttribsDiscarded;
ElementMeta(Element el, int numAttribsDiscarded) {
this.el = el;
this.numAttribsDiscarded = numAttribsDiscarded;
}
}
}
Once you have both you could do cleaning as normal. Like
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.CustomCleaner;
import org.jsoup.safety.Whitelist;
/**
 * Small command-line demo: parses {@code t2.html}, runs it through {@link CustomCleaner}
 * with the relaxed whitelist extended by the {@code script} tag, and prints the result.
 */
public class CustomJsoupSanitizer {
    public static void main(String[] args) {
        Document dirty;
        try {
            dirty = Jsoup.parse(new File("t2.html"), "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        Whitelist allowed = Whitelist.relaxed().addTags("script");
        Document sanitized = new CustomCleaner(allowed).clean(dirty);
        System.out.println(sanitized.html());
    }
}
This will give you the sanitized output for above html as
<!DOCTYPE html>
<html>
<head>
<!-- This is a script -->
<script>
function newFun() {
alert(1);
}
</script>
</head>
<body>
<!-- This is another comment. -->
<div>
Test
</div>
</body>
</html>
You can customize the cleaner to match your requirement. i.e to avoid head node or script tag etc...

The Jsoup Cleaner doesn't give you a chance here (l. 100):
} else { // else, we don't care about comments, xml proc instructions, etc
numDiscarded++;
}
Only instances of Element and TextNode may remain in the cleaned HTML.
Your only chance may be something horrible like parsing the document, replacing the comments and the doctype with a special whitelisted tag, cleaning the document and then parsing and replacing the special tags again.

Related

Liferay 7 Extending EditableFragmentEntryProcessor

I want to extend functionality of EditableFragmentEntryProcessor in Liferay 7.4 (<lfr-editable> tags in fragments) by searching in text syntaxes like {user.name} and replacing it with value from response from my external API.
e.x.
I type something like
This is super fragment and you are {user.name}.
And result should be
This is super fragment and you are Steven.
I achieve that with creating my own FragmentEntryProcessor, but I did this by putting fragment configuration variable in my custom tag
<my-data-api> ${configuration.testVariable} </my-data-api>
I tried something like this before
<my-data-api>
<lfr-editable id="some-id" type="text">
some text to edit
</lfr-editable>
</my-data-api>
And it doesn't work (and I know why).
So I want to get something like this. Appreciate any help or hints.
EDIT:
Here my custom FragmentEntryProcessor:
package com.example.fragmentEntryProcessorTest.portlet;
import com.example.test.api.api.TestPortletApi;
import com.liferay.fragment.exception.FragmentEntryContentException;
import com.liferay.fragment.model.FragmentEntryLink;
import com.liferay.fragment.processor.FragmentEntryProcessor;
import com.liferay.fragment.processor.FragmentEntryProcessorContext;
import com.liferay.portal.kernel.exception.PortalException;
import com.liferay.portal.kernel.util.Validator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;
import java.io.IOException;
/**
* #author kabatk
*/
#Component(
immediate = true, property = "fragment.entry.processor.priority:Integer=100",
service = FragmentEntryProcessor.class
)
public class FragmentEntryProcessorApiDataCopy implements FragmentEntryProcessor {
private static final String _TAG = "my-data-api";
#Reference
private TestPortletApi _api;
#Override
public String processFragmentEntryLinkHTML(
FragmentEntryLink fragmentEntryLink, String html,
FragmentEntryProcessorContext fragmentEntryProcessorContext)
throws PortalException {
Document document = _getDocument(html);
Elements elements = document.getElementsByTag(_TAG);
elements.forEach(
element -> {
String text = element.text();
String attrValue = element.attr("dataType");
String classValues = element.attr("classes");
Element myElement = null;
String result;
try {
result = _api.changeContent(text);
} catch (IOException e) {
e.printStackTrace();
result = "";
}
if(attrValue.equals("img")){
myElement = document.createElement("img");
myElement.attr("class", classValues);
myElement.attr("src", result);
}else if(attrValue.equals("text")){
myElement = document.createElement("div");
myElement.attr("class", classValues);
myElement.html(result);
}
if(myElement != null)
element.replaceWith(myElement);
else
element.replaceWith(
document.createElement("div").text("Error")
);
});
Element bodyElement = document.body();
return bodyElement.html();
}
#Override
public void validateFragmentEntryHTML(String html, String configuration)
throws PortalException {
Document document = _getDocument(html);
Elements elements = document.getElementsByTag(_TAG);
for (Element element : elements) {
if (Validator.isNull(element.attr("dataType"))) {
throw new FragmentEntryContentException("Missing 'dataType' attribute!");
}
}
}
private Document _getDocument(String html) {
Document document = Jsoup.parseBodyFragment(html);
Document.OutputSettings outputSettings = new Document.OutputSettings();
outputSettings.prettyPrint(false);
document.outputSettings(outputSettings);
return document;
}
}

Java Stax for Complex / Large XML

I have an XML file that is 4.2 GB! Obviously parsing the entire DOM is not practical. I have been looking at SAX and STAX to accomplish parsing this gigantic XML file. However all the examples I've seen are simple. The XML file I am dealing with has nested on nested on nested. There are areas where it goes 10+ levels.
I found this tutorial but not sure if its a viable solution.
http://www.javacodegeeks.com/2013/05/parsing-xml-using-dom-sax-and-stax-parser-in-java.html (bottom example using STAX)
I'm not really sure how to handle nested objects.
I have created Java objects to mimic the structure of the XML. Here are a few, too many to display.
Record.java
/**
 * Top-level holder for one record of the feed.
 * NOTE(review): presumably corresponds to one REC element of the sample XML - confirm.
 */
public class Record implements Serializable {
// record identifier; presumably the UID value of the record - TODO confirm
String uid;
// presumably mirrors the static_data element - TODO confirm
StaticData staticData;
// NOTE(review): no matching element appears in the sample XML excerpt - confirm source
DynamicData dynamicData;
}
Summary.java
/**
 * Holder for the parts of a record summary.
 * NOTE(review): the fields appear to mirror the children of the summary element in the
 * sample XML (EWUID, pub_info, titles, names, doctypes, publishers) - confirm.
 */
public class Summary {
EWUID ewuid;
PubInfo pubInfo;
Titles titles;
Names names;
DocTypes docTypes;
Publishers publishers;
}
EWUID.java
/** Holder for the values found under the EWUID element of a record summary. */
public class EWUID {
// presumably the coll_id attribute of the WUID child element - TODO confirm
String collId;
// presumably the value attribute of the edition child element - TODO confirm
String edition;
}
PubInfo.java
/**
 * Holder for publication details, all kept as plain strings.
 * NOTE(review): the fields appear to map to the attributes of the pub_info element in the
 * sample XML (coverdate, has_abstract, issue, pubmonth, pubtype, pubyear, sortdate, vol) -
 * confirm.
 */
public class PubInfo {
String coverDate;
String hasAbstract;
String issue;
String pubMonth;
String pubType;
String pubYear;
String sortDate;
String volume;
}
This is the code I've come up with so far.
/**
 * Streams through an XML file with StAX and prints the {@code UID} attribute of every
 * {@code record} start element.
 *
 * <p>The parser holds an open file; call {@link #close()} (or use try-with-resources) when done.
 */
public class TRWOSParser implements AutoCloseable {
    XMLEventReader eventReader;
    XMLInputFactory inputFactory;
    InputStream inputStream;

    /**
     * Opens {@code file} and prepares an event reader on it.
     *
     * @param file path of the XML file to read
     * @throws FileNotFoundException if the file cannot be opened
     * @throws XMLStreamException if the event reader cannot be created
     */
    public TRWOSParser(String file) throws FileNotFoundException, XMLStreamException {
        inputFactory = XMLInputFactory.newInstance();
        inputStream = new FileInputStream(file);
        try {
            eventReader = inputFactory.createXMLEventReader(inputStream);
        } catch (XMLStreamException | RuntimeException e) {
            // do not leak the file handle when the reader cannot be created
            try {
                inputStream.close();
            } catch (java.io.IOException ignored) {
                // the original failure is the one worth reporting
            }
            throw e;
        }
    }

    /**
     * Reads the document to the end, printing {@code "UID: <value>"} for each {@code UID}
     * attribute found on a {@code record} element.
     *
     * @throws XMLStreamException if the document is not well-formed or cannot be read
     */
    public void parse() throws XMLStreamException {
        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();
            if (!event.isStartElement()) {
                continue;
            }
            StartElement startElement = event.asStartElement();
            if (!startElement.getName().getLocalPart().equals("record")) {
                continue;
            }
            Iterator<Attribute> attributes = startElement.getAttributes();
            while (attributes.hasNext()) {
                Attribute attribute = attributes.next();
                if (attribute.getName().toString().equals("UID")) {
                    System.out.println("UID: " + attribute.getValue());
                }
            }
        }
    }

    /**
     * Closes the event reader and the underlying file.
     *
     * @throws XMLStreamException if closing the reader or the file fails
     */
    @Override
    public void close() throws XMLStreamException {
        try {
            eventReader.close();
        } finally {
            try {
                inputStream.close();
            } catch (java.io.IOException e) {
                throw new XMLStreamException("Could not close input stream", e);
            }
        }
    }
}
Update:
The data in the XML is licensed so I cannot show the full file. This is a very very small segment in which I have scrambled the data.
<?xml version="1.0" encoding="UTF-8"?>
<records>
<REC>
<UID>WOS:000310438600004</UID>
<static_data>
<summary>
<EWUID>
<WUID coll_id="WOS" />
<edition value="WOS.SCI" />
</EWUID>
<pub_info coverdate="NOV 2012" has_abstract="N" issue="5" pubmonth="NOV" pubtype="Journal" pubyear="2012" sortdate="2012-11-01" vol="188">
<page begin="1662" end="1663" page_count="2">1662-1663</page>
</pub_info>
<titles count="6">
<title type="source">JOURNAL OF UROLOGY</title>
<title type="source_abbrev">J UROLOGY</title>
<title type="abbrev_iso">J. Urol.</title>
<title type="abbrev_11">J UROL</title>
<title type="abbrev_29">J UROL</title>
<title type="item">Something something</title>
</titles>
<names count="1">
<name addr_no="1 2 3" reprint="Y" role="author" seq_no="1">
<display_name>John Doe</display_name>
<full_name>John Doe</full_name>
<wos_standard>Doe, John</wos_standard>
<first_name>John</first_name>
<last_name>Doe</last_name>
</name>
</names>
<doctypes count="1">
<doctype>Editorial Material</doctype>
</doctypes>
<publishers>
<publisher>
<address_spec addr_no="1">
<full_address>360 PARK AVE SOUTH, NEW YORK, NY 10010-1710 USA</full_address>
<city>NEW YORK</city>
</address_spec>
<names count="1">
<name addr_no="1" role="publisher" seq_no="1">
<display_name>ELSEVIER SCIENCE INC</display_name>
<full_name>ELSEVIER SCIENCE INC</full_name>
</name>
</names>
</publisher>
</publishers>
</summary>
</static_data>
</REC>
</records>

A similar solution to lscoughlin's answer is to use DOM4J, which has mechanisms to deal with this scenario: http://dom4j.sourceforge.net/
In my opinion it is more straightforward and easier to follow. It might not support namespaces, though.

I'm making two assumptions 1) that there is an early level of repetition, and 2) that you can do something meaningful with a partial document.
Let's assume you can move some level of nesting in, and then handle the document multiple times, removing the nodes at the working level each time you "handle" the document. This means that only a single working subtree will be in memory at any given time.
Here's a working code snippet:
package bigparse;
import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.END_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Processes a large XML file one subtree at a time.
 *
 * <p>The first {@code maxBranchLevel} levels of the document are kept as a skeleton DOM. Each
 * element below that level is read completely into the skeleton, handed to
 * {@link #handleDocument}, and then removed again, so only one working subtree is attached to
 * the skeleton at any time.
 */
public class BigParse {

    /**
     * Reads {@code src/main/resources/test.xml} and handles every working subtree in turn.
     *
     * @throws RuntimeException wrapping any parser, I/O or stream failure
     */
    public static void main(String... args) {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
        // A byte stream (not a Reader) lets the XML parser honour the encoding declared in the file.
        try (InputStream input = new FileInputStream("src/main/resources/test.xml")) {
            XMLStreamReader streamReader = factory.createXMLStreamReader(input);
            try {
                DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
                Document document = documentBuilder.newDocument();
                Element rootElement = null;
                Element currentElement = null;
                int branchLevel = 0;
                final int maxBranchLevel = 1;
                while (streamReader.hasNext()) {
                    switch (streamReader.next()) {
                        case START_ELEMENT:
                            if (branchLevel < maxBranchLevel) {
                                // still building the skeleton above the working level
                                Element workingElement = readElementOnly(streamReader, document);
                                if (rootElement == null) {
                                    document.appendChild(workingElement);
                                    rootElement = document.getDocumentElement();
                                    currentElement = rootElement;
                                } else {
                                    currentElement.appendChild(workingElement);
                                    currentElement = workingElement;
                                }
                                branchLevel++;
                            } else {
                                workingLoop(streamReader, document, currentElement);
                            }
                            break;
                        case CHARACTERS:
                            appendText(streamReader, document, currentElement);
                            break;
                        case END_ELEMENT:
                            if (currentElement != rootElement) {
                                currentElement = (Element) currentElement.getParentNode();
                                branchLevel--;
                            }
                            break;
                        default: // START_DOCUMENT, END_DOCUMENT and everything else: nothing to do
                            break;
                    }
                }
            } finally {
                streamReader.close();
            }
        } catch (ParserConfigurationException
                | IOException
                | XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    /** Creates an element for the current START_ELEMENT event, copying its attributes only. */
    private static Element readElementOnly(XMLStreamReader streamReader, Document document) {
        Element workingElement = document.createElement(streamReader.getLocalName());
        for (int attributeIndex = 0; attributeIndex < streamReader.getAttributeCount(); attributeIndex++) {
            workingElement.setAttribute(
                    streamReader.getAttributeLocalName(attributeIndex),
                    streamReader.getAttributeValue(attributeIndex));
        }
        return workingElement;
    }

    /**
     * Appends the text of the current CHARACTERS event to {@code target}.
     *
     * <p>The text is appended as a new text node instead of calling {@code setTextContent},
     * which would delete every child already attached to {@code target} and would overwrite
     * earlier chunks when the parser delivers one run of text in several events.
     * Whitespace-only text (typically the indentation between elements) is skipped.
     */
    private static void appendText(XMLStreamReader streamReader, Document document, Element target) {
        if (target == null || streamReader.isWhiteSpace()) {
            return;
        }
        target.appendChild(document.createTextNode(streamReader.getText()));
    }

    /**
     * Reads the element at the current START_ELEMENT event, with its whole subtree, into
     * {@code fragmentRoot}, passes the document to {@link #handleDocument}, and detaches the
     * subtree again before returning.
     */
    private static void workingLoop(final XMLStreamReader streamReader, final Document document, final Element fragmentRoot)
            throws XMLStreamException {
        Element startElement = readElementOnly(streamReader, document);
        fragmentRoot.appendChild(startElement);
        Element currentElement = startElement;
        while (streamReader.hasNext()) {
            switch (streamReader.next()) {
                case START_ELEMENT:
                    Element workingElement = readElementOnly(streamReader, document);
                    currentElement.appendChild(workingElement);
                    currentElement = workingElement;
                    break;
                case CHARACTERS:
                    appendText(streamReader, document, currentElement);
                    break;
                case END_ELEMENT:
                    if (currentElement == startElement) {
                        // the working subtree is complete: process it, then free it
                        handleDocument(document, startElement);
                        fragmentRoot.removeChild(startElement);
                        return;
                    }
                    currentElement = (Element) currentElement.getParentNode();
                    break;
                default:
                    break;
            }
        }
    }

    // THIS FUNCTION DOES SOMETHING MEANINGFUL
    /** Handles one working subtree; this sample just prints the whole current document. */
    private static void handleDocument(Document document, Element startElement) {
        System.out.println(stringify(document));
    }

    /** Serializes {@code document} to an indented XML string. */
    private static String stringify(Document document) {
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            StreamResult result = new StreamResult(new StringWriter());
            DOMSource source = new DOMSource(document);
            transformer.transform(source, result);
            return result.getWriter().toString();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}
EDIT: I made an incredibly silly mistake. It's fixed now. It's working but imperfect -- should be enough to lead you in a useful direction.

Consider using an XSLT 3.0 streaming transformation of the form:
<xsl:template name="main">
<xsl:stream href="bigInput.xml">
<xsl:for-each select="copy-of(/records/REC)">
<!-- process one record -->
</xsl:for-each>
</xsl:stream>
</xsl:template>
You can process this using Saxon-EE 9.6.
The "process one record" logic could use the Saxon SQL extension, or it could invoke an extension function: the context node will be a REC element with its contained tree, fully navigable within the subtree, but with no ability to navigate outside the REC element currently being processed.

How to retrieve a list of included client libs from a component in CQ?

Is it possible to determine, what client libs have been loaded prior to a component?
We are running multiple site backed by different Javascript frameworks. In order to run a single component across the board, it's not sufficient to just use
<cq:includeClientLib categories="blah"/>
We need to identify the respective framework (i.e. AngularJS, Vanilla, jQuery, blah) in order to facilitate the integration.
We are looking for a decent server side solution.

I haven't actually done this, but it would presumably be possible if you are buffering your output to clone the JspWriter buffer or examine it to see what it already contains. That sounds ugly to me, though. But this is decompiled code for how the cq:includeClientLib tag adds libraries to the output, which may show you how you can read back what was previously written:
package com.adobe.granite.ui.tags;
import com.day.cq.widget.HtmlLibraryManager;
import java.io.IOException;
import javax.servlet.ServletRequest;
import javax.servlet.jsp.JspException;
import javax.servlet.jsp.JspWriter;
import javax.servlet.jsp.PageContext;
import javax.servlet.jsp.tagext.TagSupport;
import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.api.scripting.SlingBindings;
import org.apache.sling.scripting.jsp.util.TagUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class IncludeClientLibraryTag extends TagSupport {
private static final long serialVersionUID = -3068291967085012331L;
private static final Logger log = LoggerFactory.getLogger(IncludeClientLibraryTag.class);
private String categories;
private String js;
private String css;
private String theme;
private Boolean themed;
public IncludeClientLibraryTag() {
}
public void setPageContext(PageContext pageContext) {
super.setPageContext(pageContext);
this.categories = null;
this.js = null;
this.css = null;
this.theme = null;
this.themed = null;
}
public void setCategories(String categories) {
this.categories = categories;
}
public void setJs(String js) {
this.js = js;
}
public void setCss(String css) {
this.css = css;
}
public void setTheme(String theme) {
this.theme = theme;
}
public void setThemed(boolean themed) {
this.themed = Boolean.valueOf(themed);
}
public int doEndTag() throws JspException {
SlingHttpServletRequest request = TagUtil.getRequest(this.pageContext);
HtmlLibraryManager libManager = this.getHtmlLibraryManager(request);
if(libManager == null) {
log.warn("<ui:includeClientLib>: Could not retrieve HtmlLibraryManager service, skipping inclusion.");
return 6;
} else {
JspWriter out = this.pageContext.getOut();
try {
if(this.categories != null) {
libManager.writeIncludes(request, out, toArray(this.categories));
} else if(this.theme != null) {
libManager.writeThemeInclude(request, out, toArray(this.theme));
} else if(this.js != null) {
if(this.themed != null) {
libManager.writeJsInclude(request, out, this.themed.booleanValue(), toArray(this.js));
} else {
libManager.writeJsInclude(request, out, toArray(this.js));
}
} else if(this.css != null) {
if(this.themed != null) {
libManager.writeCssInclude(request, out, this.themed.booleanValue(), toArray(this.css));
} else {
libManager.writeCssInclude(request, out, toArray(this.css));
}
}
return 6;
} catch (IOException var6) {
String libs = this.categories != null?"categories: " + this.categories:(this.theme != null?"theme: " + this.theme:(this.js != null?"js: " + this.js:(this.css != null?"css: " + this.css:"")));
throw new JspException("Could not include client library: " + libs, var6);
}
}
}
private HtmlLibraryManager getHtmlLibraryManager(ServletRequest request) {
SlingBindings bindings = (SlingBindings)request.getAttribute(SlingBindings.class.getName());
return (HtmlLibraryManager)bindings.getSling().getService(HtmlLibraryManager.class);
}
private static String[] toArray(String commaSeparatedList) {
if(commaSeparatedList == null) {
return new String[0];
} else {
String[] split = commaSeparatedList.split(",");
for(int i = 0; i < split.length; ++i) {
split[i] = split[i].trim();
}
return split;
}
}
}
I think the best solution may be to use the client library dependencies or embed attributes in your library, though, or let the client-side JavaScript test if a library is present (e.g. test if the jQuery object is undefined) and then take appropriate action. In other words, let the client side determine the final rendering based on what libraries exist on the client. It sounds like this may not be possible for your situation, though.
dependencies: This is a list of other client library categories on
which this library folder depends. For example, given two
cq:ClientLibraryFolder nodes F and G, if a file in F requires another
file in G in order to function properly, then at least one of the
categories of G should be among the dependencies of F.
embed: Used to embed code from other libraries. If node F embeds nodes G and H, the
resulting HTML will be a concatenation of content from nodes G and H.

Multiple NameSpace in Xml Xpath value

I am new to XPath parsing of XML in Java. I learnt it and it worked pretty well until the issue below: I am not sure how to traverse to the next node. Please find the code below and let me know what needs to be corrected.
package test;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
public class CallTestcall {
public static void main(String[] args) throws Exception {
DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
domFactory.setNamespaceAware(true);
DocumentBuilder builder = domFactory.newDocumentBuilder();
String responsePath1 = "C:/Verizon/webserviceTestTool/generatedResponse/example.xml";
Document doc1 = builder.parse(responsePath1);
String responsePath0 = "C:/Verizon/webserviceTestTool/generatedResponse/response.xml";
Document doc0 = builder.parse(responsePath0);
example0(doc0);
example1(doc1);
}
private static void example0(Document example)
throws XPathExpressionException, TransformerException {
System.out.println("\n*** First example - namespacelookup hardcoded ***");
XPath xPath = XPathFactory.newInstance().newXPath();
xPath.setNamespaceContext(new HardcodedNamespaceResolver());
String result = xPath.evaluate("s:Envelope/s:Body/ns1:UpdateSessionResponse",
example);
// I tried all the Values to traverse further to UpdateSessionResult but am not able to I used the following xpath expressions
result = xPath.evaluate("s:Envelope/s:Body/ns1:UpdateSessionResponse/a:UpdateSessionResult",
example);
result = xPath.evaluate("s:Envelope/s:Body/ns1:UpdateSessionResponse/i:UpdateSessionResult",
example);
System.out.println("example0 : "+result);
}
private static void example1(Document example)
throws XPathExpressionException, TransformerException {
System.out.println("\n*** First example - namespacelookup hardcoded ***");
XPath xPath = XPathFactory.newInstance().newXPath();
xPath.setNamespaceContext(new HardcodedNamespaceResolver());
String result = xPath.evaluate("books:booklist/technical:book/:author",
example);
System.out.println("example1 : "+result);
}
}
Please find the class that implements nameSpaceContext where I have added the prefixes
package test;
import java.util.Iterator;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
/**
 * {@link NamespaceContext} with a fixed prefix-to-URI table, for evaluating XPath
 * expressions against the sample documents.
 */
public class HardcodedNamespaceResolver implements NamespaceContext {

    /**
     * Returns the URI bound to {@code prefix}. Wherever possible it uses
     * {@link XMLConstants}.
     *
     * @param prefix the prefix to look up; the empty string is the default namespace
     * @return the bound URI, or {@link XMLConstants#NULL_NS_URI} for an unknown prefix
     * @throws IllegalArgumentException if {@code prefix} is null
     */
    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            throw new IllegalArgumentException("No prefix provided!");
        }
        switch (prefix) {
            case XMLConstants.DEFAULT_NS_PREFIX:
                return "http://univNaSpResolver/book";
            case "books":
                return "http://univNaSpResolver/booklist";
            case "fiction":
                return "http://univNaSpResolver/fictionbook";
            case "technical":
                return "http://univNaSpResolver/sciencebook";
            case "s":
                return "http://schemas.xmlsoap.org/soap/envelope/";
            case "a":
                return "http://channelsales.corp.cox.com/vzw/v1/data/";
            case "i":
                return "http://www.w3.org/2001/XMLSchema-instance";
            case "ns1":
                return "http://channelsales.corp.cox.com/vzw/v1/";
            default:
                return XMLConstants.NULL_NS_URI;
        }
    }

    /**
     * Reverse lookup is not implemented.
     *
     * @return always {@code null}
     */
    @Override
    public String getPrefix(String namespaceURI) {
        // Not needed in this context.
        return null;
    }

    /**
     * Reverse lookup is not implemented.
     *
     * @return always {@code null}
     */
    @Override
    public Iterator<String> getPrefixes(String namespaceURI) {
        // Not needed in this context.
        return null;
    }
}
Please find my Xml ::::
String XmlString = "<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/"><s:Body><UpdateSessionResponse xmlns="http://channelsales.corp.cox.com/vzw/v1/"><UpdateSessionResult xmlns:a="http://channelsales.corp.cox.com/vzw/v1/data/" xmlns:i="http://www.w3.org/2001/XMLSchema-instance">
<a:ResponseHeader>
<a:SuccessFlag>true</a:SuccessFlag>
<a:ErrorCode i:nil="true"/>
<a:ErrorMessage i:nil="true"/>
<a:Timestamp>2012-12-05T15:28:35.5363903-05:00</a:Timestamp>
</a:ResponseHeader>
<a:SessionId>cd3ce09e-eb33-48e8-b628-ecd406698aee</a:SessionId>
<a:CacheKey i:nil="true"/>

Try the following. It works for me.
result = xPath.evaluate("/s:Envelope/s:Body/ns1:UpdateSessionResponse/ns1:UpdateSessionResult",
example);
Since you are searching from the root of the document, precede the xpath expression with a forward slash (/)
Also, in the XML fragment below, the string xmlns="http... means you are setting that to be the default namespace. In your namespace resolver you are giving this the prefix ns1. So even though UpdateSessionResult is defining two namespace prefixes a and i, it does not use those prefixes itself (for example <a:UpdateSessionResult...) therefore it belongs to the default namespace (named 'ns1')
<UpdateSessionResponse xmlns="http://channelsales.corp.cox.com/vzw/v1/">
<UpdateSessionResult xmlns:a="http://channelsales.corp.cox.com/vzw/v1/data/" xmlns:i="http://www.w3.org/2001/XMLSchema-instance">
That's why you need to use ns1:UpdateSessionResult instead of either a:UpdateSessionResult or i:UpdateSessionResult

how to get HTML DOM path by text content?

a HTML file:
<html>
<body>
<div class="main">
<p id="tID">content</p>
</div>
</body>
</html>
I have a String equal to "content", and
I want to use "content" to get the HTML DOM path:
html body div.main p#tID
Chrome developer tools have this feature (Elements tab, bottom bar); I want to know how to do it in Java.
thanks for your help :)

Have fun :)
JAVA CODE
import java.io.File;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
/**
 * Finds the element whose text is {@code content} in {@code test.xml} and prints a
 * CSS-like path from the document root down to it.
 */
public class Teste {

    public static void main(String[] args) {
        try {
            // read and clean document
            TagNode cleaned = new HtmlCleaner().clean(new File("test.xml"));
            Document dom = new DomSerializer(new CleanerProperties()).createDOM(cleaned);

            // use XPath to find target node
            XPath finder = XPathFactory.newInstance().newXPath();
            Node current = (Node) finder.evaluate("//*[text()='content']", dom, XPathConstants.NODE);

            // assembles jquery/css selector, walking up from the match and prepending each step
            String selector = "";
            for (; current != null && current.getParentNode() != null; current = current.getParentNode()) {
                selector = readPath(current) + " " + selector;
            }
            System.out.println(selector);
            // returns html body div#myDiv.foo.bar p#tID
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Node name followed by its id ("#id") and its classes (".a.b"), when present
    private static String readPath(Node node) {
        NamedNodeMap attrs = node.getAttributes();
        String idPart = readAttribute(attrs.getNamedItem("id"), "#");
        String classPart = readAttribute(attrs.getNamedItem("class"), ".");
        return node.getNodeName() + idPart + classPart;
    }

    // Attribute value prefixed with token, with each space replaced by token; "" if absent
    private static String readAttribute(Node node, String token) {
        return node == null
                ? ""
                : token + node.getTextContent().replace(" ", token);
    }
}
XML EXAMPLE
<html>
<body>
<br>
<div id="myDiv" class="foo bar">
<p id="tID">content</p>
</div>
</body>
</html>
EXPLANATIONS
Object document points to evaluated XML.
The XPath //*[text()='content'] matches every element whose text is 'content'; the first match is used as the target node.
The while loop walks up to the first (root) node, collecting the id and classes of each element on the way.
MORE EXPLANATIONS
In this new solution I'm using HtmlCleaner. So, you can have <br>, for example, and cleaner will replace with <br/>.
To use HtmlCleaner, just download the newest jar here.

Develop Reference

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Jsoup WhiteList to allow comments - java

Related

Liferay 7 Extending EditableFragmentEntryProcessor

Java Stax for Complex / Large XML

How to retrieve a list of included client libs from a component in CQ?

Multiple NameSpace in Xml Xpath value

how to get HTML DOM path by text content?

Categories

Resources