how to get HTML DOM path by text content? - java

a HTML file:
<html>
<body>
<div class="main">
<p id="tID">content</p>
</div>
</body>
</html>
I have a String equal to "content",
and I want to use "content" to get the HTML DOM path of the element that contains it:
html body div.main p#tID
Chrome Developer Tools has this feature (Elements tab, bottom bar); I want to know how to do it in Java.
thanks for your help :)

Have fun :)
JAVA CODE
import java.io.File;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
/**
 * Prints a CSS/jQuery-style selector path (for example
 * {@code html body div#myDiv.foo.bar p#tID}) for the first element of
 * {@code test.xml} whose text is exactly {@code content}.
 */
public class Teste {
    public static void main(String[] args) {
        try {
            // read and clean document
            TagNode tagNode = new HtmlCleaner().clean(new File("test.xml"));
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            // use XPath to find target node
            XPath xpath = XPathFactory.newInstance().newXPath();
            Node node = (Node) xpath.evaluate("//*[text()='content']", document, XPathConstants.NODE);
            if (node == null) {
                // Say so explicitly instead of printing an empty path.
                System.err.println("No element with text 'content' was found");
                return;
            }
            // assembles jquery/css selector, walking from the match up to the root element;
            // the Document node itself (the only node without a parent) is not part of the path
            String result = "";
            while (node != null && node.getParentNode() != null) {
                String segment = readPath(node);
                // join with single spaces, without leaving a trailing separator
                result = result.isEmpty() ? segment : segment + " " + result;
                node = node.getParentNode();
            }
            System.out.println(result);
            // prints html body div#myDiv.foo.bar p#tID
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the selector segment for one node: tag name, then {@code #id}, then {@code .class}
     * for every class name.
     *
     * @param node node to describe
     * @return the selector segment; just the node name when the node carries no attributes
     */
    private static String readPath(Node node) {
        NamedNodeMap attributes = node.getAttributes();
        if (attributes == null) {
            // getAttributes() returns null for every node type except Element
            return node.getNodeName();
        }
        String id = readAttribute(attributes.getNamedItem("id"), "#");
        String clazz = readAttribute(attributes.getNamedItem("class"), ".");
        return node.getNodeName() + id + clazz;
    }

    /**
     * Renders an attribute value as selector tokens, e.g. {@code "foo bar"} with token
     * {@code "."} becomes {@code ".foo.bar"}.
     *
     * @param node  attribute node, or {@code null} when the attribute is absent
     * @param token prefix placed before every whitespace-separated value
     * @return the rendered tokens, or an empty string when the attribute is absent or blank
     */
    private static String readAttribute(Node node, String token) {
        if (node == null) {
            return "";
        }
        String value = node.getTextContent().trim();
        if (value.isEmpty()) {
            return "";
        }
        // Collapse any run of whitespace so "foo   bar" does not become ".foo...bar".
        return token + value.replaceAll("\\s+", java.util.regex.Matcher.quoteReplacement(token));
    }
}
XML EXAMPLE
<html>
<body>
<br>
<div id="myDiv" class="foo bar">
<p id="tID">content</p>
</div>
</body>
</html>
EXPLANATIONS
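The document object holds the DOM built from the cleaned markup.
The XPath //*[text()='content'] matches every element that has a text node equal to 'content'; evaluating it with XPathConstants.NODE returns the first match.
The while loop walks up from that node to the root element, collecting the tag name, id and classes of each element along the way.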
MORE EXPLANATIONS
In this new solution I'm using HtmlCleaner. So, you can have <br>, for example, and cleaner will replace with <br/>.
To use HtmlCleaner, just download the newest jar here.

Related

JAVA & XML : com.sun.org.apache.xpath.internal.XPathException: Can not convert #STRING to a NodeList

I am writing a java function that is parsing an xml element & extracting the given xpath expression. Below is the function :
/**
 * Evaluates an XPath expression against {@code payload} and returns the first matching node.
 *
 * NOTE(review): the log in the question prints the expression wrapped in double quotes. If
 * those quotes are part of {@code xpathStr}, XPath treats the whole thing as a string literal,
 * which cannot be converted to a node - confirm against the caller.
 *
 * @param payload    context element the expression is evaluated against
 * @param namespaces prefix-to-URI bindings used to resolve prefixes in the expression
 * @param xpathStr   XPath expression to compile and evaluate
 * @return the first matching node, or {@code null} when nothing matches or when an
 *         {@link XPathExpressionException} or {@link IOException} was caught (stack trace printed)
 */
public static Node getDataNode(Element payload, final HashMap<String, String> namespaces, String xpathStr) {
    Node node = null;
    try {
        // Create a namespace context based on the namespaces passed in.
        NamespaceContext ctx = new NamespaceContext() {
            @Override
            public String getNamespaceURI(String prefix) {
                return namespaces.get(prefix);
            }

            @Override
            public Iterator<String> getPrefixes(String val) {
                // Reverse lookup is not supported; hand back an empty iterator rather than null.
                return java.util.Collections.<String>emptyList().iterator();
            }

            @Override
            public String getPrefix(String uri) {
                return null;
            }
        };
        XPathFactory xpathFact = XPathFactory.newInstance();
        XPath xpath = xpathFact.newXPath();
        xpath.setNamespaceContext(ctx);
        XPathExpression expr = xpath.compile(xpathStr);
        System.out.println("Got request to process node : " + payload.getLocalName() + " with " + xpathStr);
        System.out.println(xpathStr + " has been compiled successfully.");
        // XMLElement is not a JDK type, so only dump the payload when it really is one;
        // an unconditional cast throws ClassCastException for other DOM implementations.
        if (payload instanceof XMLElement) {
            ((XMLElement) payload).print(System.out);
        }
        node = (Node) expr.evaluate(payload, XPathConstants.NODE);
    } catch (XPathExpressionException ex) {
        ex.printStackTrace();
        return null;
    } catch (IOException io) {
        io.printStackTrace();
        return null;
    }
    return node;
}
Below is the logs for this part of function :
Got request to process node : Body with ".//soapenv:Body/pip:request"
".//soapenv:Body/pip:request" has been compiled successfully.
<soapenv:Body xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/">
<pip:request xmlns:pip="http://xmlns.oracle.com/ServiceBusApplication/UserInterfaceTest/Pipeline">textContent</pip:request>
</soapenv:Body>
I have tried different xpath expression like //soapenv:Body/pip:request, .//soapenv:Body/pip:request but still i am getting the error :
com.sun.org.apache.xpath.internal.XPathException: Can not convert #STRING to a NodeList!
at com.sun.org.apache.xpath.internal.objects.XObject.error(XObject.java:711)
at com.sun.org.apache.xpath.internal.objects.XObject.nodeset(XObject.java:441)
at com.sun.org.apache.xpath.internal.jaxp.XPathExpressionImpl.getResultAsType(XPathExpressionImpl.java:357)
at com.sun.org.apache.xpath.internal.jaxp.XPathExpressionImpl.eval(XPathExpressionImpl.java:101)
at com.sun.org.apache.xpath.internal.jaxp.XPathExpressionImpl.evaluate(XPathExpressionImpl.java:182)
Please let me know what is wrong in the code.Appreciate your help in resolving the issue . Thanks.
Unable to reproduce. Tested with MCVE code below on Oracle JDK 1.5 and on Oracle JDK 9.
Only change made to getDataNode method is commenting out the ((XMLElement) payload).print(System.out) statement, since Oracle JDK doesn't have an XMLElement type.
Test
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
// Sample SOAP body used as the payload for every lookup below.
String xml = "<soapenv:Body xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\">\n" +
        " <pip:request xmlns:pip=\"http://xmlns.oracle.com/ServiceBusApplication/UserInterfaceTest/Pipeline\">textContent</pip:request>\n" +
        " </soapenv:Body>";

// Parse it namespace-aware so prefixed names in the XPath can be resolved.
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
Document parsed = factory.newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
Element payload = parsed.getDocumentElement();

// Prefix bindings handed to getDataNode.
HashMap<String, String> prefixToUri = new HashMap<String, String>();
prefixToUri.put("soapenv", "http://schemas.xmlsoap.org/soap/envelope/");
prefixToUri.put("pip", "http://xmlns.oracle.com/ServiceBusApplication/UserInterfaceTest/Pipeline");

// Try each expression in turn and print the text of the match (or null).
String[] expressions = {
    ".//soapenv:Body/pip:request",
    "/soapenv:Body/pip:request",
    ".//pip:request"
};
for (String expression : expressions) {
    Node match = getDataNode(payload, prefixToUri, expression);
    System.out.println(match != null ? match.getTextContent() : null);
}
Output
Got request to process node : Body with .//soapenv:Body/pip:request
.//soapenv:Body/pip:request has been compiled successfully.
null
Got request to process node : Body with /soapenv:Body/pip:request
/soapenv:Body/pip:request has been compiled successfully.
textContent
Got request to process node : Body with .//pip:request
.//pip:request has been compiled successfully.
textContent
As you can see, code runs fine, but the .//soapenv:Body/pip:request XPath is not correct for the given XML, since there is no <soapenv:Body> tag inside the given payload element.

JSOUP HTML parsing from URL

I'm using JSOUP in Java to parse HTMLs like these two:
This and this.
In the first case, I get the output.
And I have a problem with the connection:
doc = Jsoup.connect(url).get();
There are some URLs which can easily be parsed, and I've got the output, but there are URLs too which produces empty output like this:
Title: [].
I can't understand what the problem is if both URLs are the same.
This is my code:
// Fetch the article and print its <title>; a failed request only prints the stack trace.
Document doc;
try {
    doc = Jsoup.connect("http://ekonomika.sme.sk/c/8047766/s-velkymi-chybami-stavali-aj-budovu-centralnej-banky.html").get();
    System.out.println("title : " + doc.title());
} catch (IOException e) {
    e.printStackTrace();
}
Take a look at what's in the head of the second url
Element h = doc.head();
System.out.println("head : " + h);
You'll see there are some meta refresh tags and an empty title:
<head>
<noscript>
<meta http-equiv="refresh" content="1;URL='/c/8047766/s-velkymi-chybami-stavali-aj-budovu-centralnej-banky.html?piano_d=1'">
</noscript>
<meta http-equiv="refresh" content="10;URL='/c/8047766/s-velkymi-chybami-stavali-aj-budovu-centralnej-banky.html?piano_t=1'">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title></title>
</head>
Which explains the empty title. You have to follow the redirect.
Here is my code for parsing, with this URL I have no output.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package commentparser;
import java.io.IOException;
import static java.lang.Boolean.FALSE;
import static java.lang.Boolean.TRUE;
import java.net.URL;
import static java.sql.JDBCType.NULL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import static javafx.beans.binding.Bindings.length;
import static jdk.nashorn.internal.objects.ArrayBufferView.length;
import static oracle.jrockit.jfr.events.Bits.length;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads an article page and prints its title, the discussion links, the author spans and
 * every link on the page.
 *
 * The server may answer with a stub page whose only purpose is a
 * {@code <meta http-equiv="refresh">} redirect. {@code followRedirects(true)} only covers HTTP
 * redirects, so meta-refresh targets are followed explicitly here.
 */
public class CommentParser {

    /** Upper bound on meta-refresh hops, so pages that keep refreshing cannot loop forever. */
    private static final int MAX_META_REFRESH_HOPS = 3;

    public static void main(String[] args) {
        try {
            Document doc = fetchFollowingMetaRefresh(
                    "http://ekonomika.sme.sk/c/8047766/s-velkymi-chybami-stavali-aj-budovu-centralnej-banky.html");
            String title = doc.title();
            System.out.println("title : " + title);
            //Link for discussions (iterating an empty selection simply prints nothing)
            Elements discussions = doc.select("a[href^=/diskusie/reaction_show]");
            for (Element link : discussions) {
                // get the value from href attribute
                System.out.println("Diskusie: " + link.attr("href"));
            }
            //Author of article
            Elements authors = doc.select("span[class^=autor]");
            for (Element author : authors) {
                System.out.println(author.text());
            }
            // get all links
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                // get the value from href attribute
                System.out.println("\nlink : " + link.attr("href"));
                System.out.println("text : " + link.text());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches {@code url} and follows up to {@link #MAX_META_REFRESH_HOPS} meta-refresh
     * redirects, returning the last document fetched.
     *
     * @param url absolute URL of the page to download
     * @return the document reached after following the meta-refresh chain
     * @throws IOException if a request fails or a refresh target is not a valid URL
     */
    private static Document fetchFollowingMetaRefresh(String url) throws IOException {
        Document doc = Jsoup.connect(url).followRedirects(true).get();
        for (int hop = 0; hop < MAX_META_REFRESH_HOPS; hop++) {
            String target = metaRefreshTarget(doc);
            if (target == null) {
                break;
            }
            // Refresh targets are usually relative; resolve them against the current page.
            String next = new URL(new URL(doc.baseUri()), target).toString();
            if (next.equals(doc.baseUri())) {
                break; // plain periodic reload of the same page, nothing new to fetch
            }
            doc = Jsoup.connect(next).followRedirects(true).get();
        }
        return doc;
    }

    /**
     * Extracts the URL of the first {@code <meta http-equiv="refresh" content="N;URL='...'">}.
     *
     * @param doc parsed page
     * @return the (possibly relative) refresh target, or {@code null} when the page has none
     */
    private static String metaRefreshTarget(Document doc) {
        Element meta = doc.select("meta[http-equiv=refresh]").first();
        if (meta == null) {
            return null;
        }
        String content = meta.attr("content");
        int urlAt = content.toLowerCase(java.util.Locale.ROOT).indexOf("url=");
        if (urlAt < 0) {
            return null;
        }
        String target = content.substring(urlAt + "url=".length()).trim();
        // The value may be wrapped in single or double quotes.
        if (target.length() >= 2) {
            char first = target.charAt(0);
            char last = target.charAt(target.length() - 1);
            if ((first == '\'' || first == '"') && first == last) {
                target = target.substring(1, target.length() - 1).trim();
            }
        }
        return target.isEmpty() ? null : target;
    }
}

Java Stax for Complex / Large XML

I have an XML file that is 4.2 GB! Obviously parsing the entire DOM is not practical. I have been looking at SAX and STAX to accomplish parsing this gigantic XML file. However all the examples I've seen are simple. The XML file I am dealing with has nested on nested on nested. There are areas where it goes 10+ levels.
I found this tutorial but not sure if its a viable solution.
http://www.javacodegeeks.com/2013/05/parsing-xml-using-dom-sax-and-stax-parser-in-java.html (bottom example, using StAX)
I'm not really sure how to handle nested objects.
I have created Java objects to mimic the structure of the XML. Here are a few, too many to display.
Record.java
/**
 * One record of the input: its identifier plus the static and dynamic data sections.
 * NOTE(review): presumably corresponds to a {@code <REC>} element of the sample XML - confirm.
 */
public class Record implements Serializable {
// Record identifier (presumably the <UID> value - confirm against the parser).
String uid;
// Static part of the record (presumably <static_data> - confirm).
StaticData staticData;
// Dynamic part of the record; not present in the posted sample.
DynamicData dynamicData;
}
Summary.java
/**
 * Groups the summary information of a record.
 * NOTE(review): the fields appear to mirror the children of {@code <summary>} in the sample
 * XML (EWUID, pub_info, titles, names, doctypes, publishers) - confirm.
 */
public class Summary {
EWUID ewuid;
PubInfo pubInfo;
Titles titles;
Names names;
DocTypes docTypes;
Publishers publishers;
}
EWUID.java
/**
 * Collection and edition identifiers of a record.
 * NOTE(review): presumably mirrors {@code <EWUID>} in the sample XML - confirm.
 */
public class EWUID {
// Presumably the coll_id attribute of <WUID> - confirm.
String collId;
// Presumably the value attribute of <edition> - confirm.
String edition;
}
PubInfo.java
/**
 * Publication details, all kept as raw strings.
 * NOTE(review): the fields appear to mirror the attributes of {@code <pub_info>} in the sample
 * XML (coverdate, has_abstract, issue, pubmonth, pubtype, pubyear, sortdate, vol) - confirm.
 */
public class PubInfo {
String coverDate;
String hasAbstract;
String issue;
String pubMonth;
String pubType;
String pubYear;
String sortDate;
// Note: the sample XML names this attribute "vol".
String volume;
}
This is the code I've come up with so far.
/**
 * Streaming (StAX) reader that prints the UID of every record in a large XML file.
 *
 * A record is an element named {@code record} or {@code REC}. Its UID is read either from a
 * {@code UID} attribute on that element or from a {@code <UID>} child element, which is the
 * form used by the sample document.
 *
 * The parser owns the underlying file stream. {@link #parse()} releases it when it finishes;
 * callers that never call {@code parse()} should call {@link #close()}.
 */
public class TRWOSParser implements AutoCloseable {
    XMLEventReader eventReader;
    XMLInputFactory inputFactory;
    InputStream inputStream;
    /** Set once the reader and stream have been released. */
    private boolean closed;

    /**
     * Opens {@code file} for streaming.
     *
     * @param file path of the XML file to read
     * @throws FileNotFoundException if the file cannot be opened
     * @throws XMLStreamException    if the StAX reader cannot be created
     */
    public TRWOSParser(String file) throws FileNotFoundException, XMLStreamException {
        inputFactory = XMLInputFactory.newInstance();
        inputStream = new FileInputStream(file);
        try {
            eventReader = inputFactory.createXMLEventReader(inputStream);
        } catch (XMLStreamException | RuntimeException e) {
            // Do not leak the file handle when the reader cannot be created.
            closeQuietly(inputStream);
            throw e;
        }
    }

    /**
     * Reads the whole document and prints {@code "UID: <value>"} for every record UID found.
     * The underlying resources are closed when this method returns or throws; calling it
     * again afterwards does nothing.
     *
     * @throws XMLStreamException if the document is not well-formed or cannot be read
     */
    public void parse() throws XMLStreamException {
        if (closed) {
            return;
        }
        try {
            boolean inRecord = false;
            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                if (event.isStartElement()) {
                    StartElement startElement = event.asStartElement();
                    String name = startElement.getName().getLocalPart();
                    if (isRecordElement(name)) {
                        inRecord = true;
                        printUidAttribute(startElement);
                    } else if (inRecord && name.equals("UID")) {
                        // <UID> is a text-only child; getElementText() also consumes its end tag.
                        System.out.println("UID: " + eventReader.getElementText());
                    }
                } else if (event.isEndElement()
                        && isRecordElement(event.asEndElement().getName().getLocalPart())) {
                    inRecord = false;
                }
            }
        } finally {
            close();
        }
    }

    /** Releases the event reader and the file stream; safe to call more than once. */
    @Override
    public void close() throws XMLStreamException {
        if (closed) {
            return;
        }
        closed = true;
        try {
            eventReader.close();
        } finally {
            // XMLEventReader.close() does not close the underlying stream.
            closeQuietly(inputStream);
        }
    }

    /** Returns whether {@code localName} names a record element. */
    private static boolean isRecordElement(String localName) {
        return localName.equals("record") || localName.equals("REC");
    }

    /** Prints the value of the UID attribute of {@code startElement}, if it has one. */
    private static void printUidAttribute(StartElement startElement) {
        // Iterator<?> compiles against both the raw and the generic signature of getAttributes().
        Iterator<?> attributes = startElement.getAttributes();
        while (attributes.hasNext()) {
            Attribute attribute = (Attribute) attributes.next();
            if (attribute.getName().getLocalPart().equals("UID")) {
                System.out.println("UID: " + attribute.getValue());
            }
        }
    }

    /** Closes {@code stream}, ignoring failures because nothing useful can be done about them. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (java.io.IOException ignored) {
            // best effort: the stream was only read from
        }
    }
}
Update:
The data in the XML is licensed so I cannot show the full file. This is a very very small segment in which I have scrambled the data.
<?xml version="1.0" encoding="UTF-8"?>
<records>
<REC>
<UID>WOS:000310438600004</UID>
<static_data>
<summary>
<EWUID>
<WUID coll_id="WOS" />
<edition value="WOS.SCI" />
</EWUID>
<pub_info coverdate="NOV 2012" has_abstract="N" issue="5" pubmonth="NOV" pubtype="Journal" pubyear="2012" sortdate="2012-11-01" vol="188">
<page begin="1662" end="1663" page_count="2">1662-1663</page>
</pub_info>
<titles count="6">
<title type="source">JOURNAL OF UROLOGY</title>
<title type="source_abbrev">J UROLOGY</title>
<title type="abbrev_iso">J. Urol.</title>
<title type="abbrev_11">J UROL</title>
<title type="abbrev_29">J UROL</title>
<title type="item">Something something</title>
</titles>
<names count="1">
<name addr_no="1 2 3" reprint="Y" role="author" seq_no="1">
<display_name>John Doe</display_name>
<full_name>John Doe</full_name>
<wos_standard>Doe, John</wos_standard>
<first_name>John</first_name>
<last_name>Doe</last_name>
</name>
</names>
<doctypes count="1">
<doctype>Editorial Material</doctype>
</doctypes>
<publishers>
<publisher>
<address_spec addr_no="1">
<full_address>360 PARK AVE SOUTH, NEW YORK, NY 10010-1710 USA</full_address>
<city>NEW YORK</city>
</address_spec>
<names count="1">
<name addr_no="1" role="publisher" seq_no="1">
<display_name>ELSEVIER SCIENCE INC</display_name>
<full_name>ELSEVIER SCIENCE INC</full_name>
</name>
</names>
</publisher>
</publishers>
</summary>
</static_data>
</REC>
</records>
A similar solution to lscoughlin's answer is to use DOM4J, which has mechanisms to deal with this scenario: http://dom4j.sourceforge.net/
In my opinion it is more straightforward and easier to follow. It might not support namespaces, though.
I'm making two assumptions 1) that there is an early level of repetition, and 2) that you can do something meaningful with a partial document.
Let's assume you can move some level of nesting in, and then handle the document multiple times, removing the nodes at the working level each time you "handle" the document. This means that only a single working subtree will be in memory at any given time.
Here's a working code snippet:
package bigparse;
import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.END_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.END_ELEMENT;
import static javax.xml.stream.XMLStreamConstants.START_DOCUMENT;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Streams a large XML file with StAX while keeping only one working subtree in memory.
 *
 * The first {@code maxBranchLevel} levels of nesting are kept as a skeleton DOM. Every element
 * below that level is built into a complete subtree, handed to
 * {@link #handleDocument(Document, Element)} together with the skeleton, and then removed
 * again before the next one is read.
 *
 * Whitespace-only character data is skipped. Element and attribute names are copied by local
 * name only, so namespace prefixes are dropped.
 */
public class BigParse {
    /** Input used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "src/main/resources/test.xml";

    /**
     * @param args optional: path of the XML file to read (defaults to {@link #DEFAULT_INPUT})
     */
    public static void main(String... args) {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        XMLInputFactory factory = XMLInputFactory.newInstance();
        DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
        // A byte stream (rather than a Reader) lets the parser honour the encoding declared in
        // the XML prolog; try-with-resources guarantees the file is closed.
        try (java.io.InputStream input = new java.io.FileInputStream(inputPath)) {
            XMLStreamReader streamReader = factory.createXMLStreamReader(input);
            try {
                DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
                Document document = documentBuilder.newDocument();
                Element rootElement = null;
                Element currentElement = null;
                int branchLevel = 0;
                int maxBranchLevel = 1;
                while (streamReader.hasNext()) {
                    switch (streamReader.next()) {
                        case START_ELEMENT:
                            if (branchLevel < maxBranchLevel) {
                                // Still inside the skeleton: keep this element in the document.
                                Element workingElement = readElementOnly(streamReader, document);
                                if (rootElement == null) {
                                    document.appendChild(workingElement);
                                    rootElement = document.getDocumentElement();
                                    currentElement = rootElement;
                                } else {
                                    currentElement.appendChild(workingElement);
                                    currentElement = workingElement;
                                }
                                branchLevel++;
                            } else {
                                // Below the skeleton: build, handle and discard one subtree.
                                workingLoop(streamReader, document, currentElement);
                            }
                            break;
                        case CHARACTERS:
                            appendText(streamReader, document, currentElement);
                            break;
                        case END_ELEMENT:
                            if (currentElement != rootElement) {
                                currentElement = (Element) currentElement.getParentNode();
                                branchLevel--;
                            }
                            break;
                        default:
                            // START_DOCUMENT, END_DOCUMENT, comments, etc. need no handling.
                            break;
                    }
                }
            } finally {
                streamReader.close();
            }
        } catch (ParserConfigurationException
                | java.io.IOException
                | XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates a DOM element for the start tag the reader is positioned on, copying its
     * attributes but none of its content.
     */
    private static Element readElementOnly(XMLStreamReader streamReader, Document document) {
        Element workingElement = document.createElement(streamReader.getLocalName());
        for (int attributeIndex = 0; attributeIndex < streamReader.getAttributeCount(); attributeIndex++) {
            workingElement.setAttribute(
                    streamReader.getAttributeLocalName(attributeIndex),
                    streamReader.getAttributeValue(attributeIndex));
        }
        return workingElement;
    }

    /**
     * Appends the reader's current character data to {@code target} as a new text node.
     *
     * A text node is appended instead of calling {@code setTextContent}, because
     * {@code setTextContent} removes every child that was already added to the element. It also
     * means character data delivered in several chunks accumulates instead of being overwritten.
     * Whitespace-only data (indentation between elements) is skipped.
     */
    private static void appendText(XMLStreamReader streamReader, Document document, Element target) {
        if (target == null || streamReader.isWhiteSpace()) {
            return;
        }
        target.appendChild(document.createTextNode(streamReader.getText()));
    }

    /**
     * Builds the complete subtree of the element the reader is positioned on under
     * {@code fragmentRoot}, passes the document to {@link #handleDocument}, and detaches the
     * subtree again so it can be garbage collected.
     *
     * @throws XMLStreamException if the underlying stream cannot be read
     */
    private static void workingLoop(final XMLStreamReader streamReader, final Document document, final Element fragmentRoot)
            throws XMLStreamException {
        Element startElement = readElementOnly(streamReader, document);
        fragmentRoot.appendChild(startElement);
        Element currentElement = startElement;
        while (streamReader.hasNext()) {
            switch (streamReader.next()) {
                case START_ELEMENT:
                    Element workingElement = readElementOnly(streamReader, document);
                    currentElement.appendChild(workingElement);
                    currentElement = workingElement;
                    break;
                case CHARACTERS:
                    appendText(streamReader, document, currentElement);
                    break;
                case END_ELEMENT:
                    if (currentElement != startElement) {
                        currentElement = (Element) currentElement.getParentNode();
                        break;
                    }
                    // The subtree is complete: process it, then drop it from the document.
                    handleDocument(document, startElement);
                    fragmentRoot.removeChild(startElement);
                    return;
                default:
                    break;
            }
        }
    }

    // THIS FUNCTION DOES SOMETHING MEANINGFUL
    /** Called once per completed subtree; here it prints the whole in-memory document. */
    private static void handleDocument(Document document, Element startElement) {
        System.out.println(stringify(document));
    }

    /** Serializes {@code document} to an indented XML string. */
    private static String stringify(Document document) {
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            StreamResult result = new StreamResult(new StringWriter());
            transformer.transform(new DOMSource(document), result);
            return result.getWriter().toString();
        } catch (javax.xml.transform.TransformerException e) {
            throw new RuntimeException(e);
        }
    }
}
EDIT: I made an incredibly silly mistake. It's fixed now. It's working but imperfect -- should be enough to lead you in a useful direction.
Consider using an XSLT 3.0 streaming transformation of the form:
<!-- Entry template: reads bigInput.xml through xsl:stream and iterates over a copy of each
     /records/REC element, so the per-record logic sees one REC subtree at a time. -->
<xsl:template name="main">
<xsl:stream href="bigInput.xml">
<xsl:for-each select="copy-of(/records/REC)">
<!-- process one record -->
</xsl:for-each>
</xsl:stream>
</xsl:template>
You can process this using Saxon-EE 9.6.
The "process one record" logic could use the Saxon SQL extension, or it could invoke an extension function: the context node will be a REC element with its contained tree, fully navigable within the subtree, but with no ability to navigate outside the REC element currently being processed.

Jsoup WhiteList to allow comments

I am using jsoup 1.7.3 with Whitelist custom configuration.
Apparently it sanitizes all the HTML comments (<!-- ... -->) inside the document.
It also sanitizes the <!DOCTYPE ...> element.
How can I get jsoup Whitelist to allow comments as is?
How can I define the !DOCTYPE element as allowed element with any attribute?
This is not possible with the standard jsoup classes, and it does not depend on the Whitelist: the limitation is in org.jsoup.safety.Cleaner. The cleaner uses a node traverser that allows only elements and text nodes. Also, only the body is processed, so the head and the doctype are ignored completely. To achieve this you'll have to create a custom cleaner. For example, if you have an HTML document like
<!DOCTYPE html>
<html>
<head>
<!-- This is a script -->
<script type="text/javascript">
function newFun() {
alert(1);
}
</script>
</head>
<body>
<map name="diagram_map">
<area id="area1" />
<area id="area2" />
</map>
<!-- This is another comment. -->
<div>Test</div>
</body>
</html>
You will first create a custom cleaner by copying the original one. Note, however, that the package should be org.jsoup.safety, because the cleaner uses some protected methods of the associated Whitelist. Also, there is no point in extending Cleaner, as almost all of its methods are private and the inner node traverser is final.
package org.jsoup.safety;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
/**
 * Variant of jsoup's {@code Cleaner} that also keeps the doctype, the {@code <head>} content,
 * comments and data nodes (e.g. script bodies), while still filtering elements and attributes
 * through a {@link Whitelist}.
 *
 * It lives in {@code org.jsoup.safety} because it calls Whitelist methods that are not public.
 */
public class CustomCleaner {
    private final Whitelist whitelist;

    /**
     * @param whitelist rules deciding which tags and attributes survive; must not be null
     */
    public CustomCleaner(Whitelist whitelist) {
        Validate.notNull(whitelist);
        this.whitelist = whitelist;
    }

    /**
     * Builds a cleaned copy of {@code dirtyDocument}. The input document is left untouched.
     *
     * @param dirtyDocument document to clean; must not be null
     * @return a new document holding the doctype plus the whitelisted head and body content
     */
    public Document clean(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);
        Document clean = Document.createShell(dirtyDocument.baseUri());
        copyDocType(dirtyDocument, clean);
        if (dirtyDocument.head() != null)
            copySafeNodes(dirtyDocument.head(), clean.head());
        if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
            copySafeNodes(dirtyDocument.body(), clean.body());
        return clean;
    }

    /**
     * Puts a copy of the doctype of {@code dirtyDocument}, if it has one, in front of the
     * content of {@code clean}.
     *
     * The node is cloned first: appending a node that already has a parent moves it, which
     * would strip the doctype from the caller's document and disturb a traversal that is still
     * walking that document.
     */
    private void copyDocType(Document dirtyDocument, Document clean) {
        for (Node child : dirtyDocument.childNodes()) {
            if (child instanceof DocumentType) {
                clean.prependChild(child.clone());
                return;
            }
        }
    }

    /**
     * Checks whether cleaning the body of {@code dirtyDocument} would discard anything.
     *
     * @param dirtyDocument document to check; must not be null
     * @return true when no body element or attribute would be dropped
     */
    public boolean isValid(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);
        Document clean = Document.createShell(dirtyDocument.baseUri());
        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
        return numDiscarded == 0;
    }

    /**
     * Visitor that mirrors the safe part of a source tree into a destination element and
     * counts what it had to drop.
     */
    private final class CleaningVisitor implements NodeVisitor {
        private int numDiscarded = 0;
        private final Element root;
        private Element destination; // current element to append nodes to

        private CleaningVisitor(Element root, Element destination) {
            this.root = root;
            this.destination = destination;
        }

        public void head(Node source, int depth) {
            if (source instanceof Element) {
                Element sourceEl = (Element) source;
                if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
                    ElementMeta meta = createSafeElement(sourceEl);
                    Element destChild = meta.el;
                    destination.appendChild(destChild);
                    numDiscarded += meta.numAttribsDiscarded;
                    destination = destChild;
                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
                    numDiscarded++;
                }
            } else if (source instanceof TextNode) {
                TextNode sourceText = (TextNode) source;
                TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
                destination.appendChild(destText);
            } else if (source instanceof Comment) {
                // Unlike the stock cleaner, comments are preserved.
                Comment sourceComment = (Comment) source;
                Comment destComment = new Comment(sourceComment.getData(), source.baseUri());
                destination.appendChild(destComment);
            } else if (source instanceof DataNode) {
                // Data nodes (e.g. script content) are copied as-is.
                DataNode sourceData = (DataNode) source;
                DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri());
                destination.appendChild(destData);
            } else { // anything else (xml declarations, nested doctypes, etc.) is dropped
                numDiscarded++;
            }
        }

        public void tail(Node source, int depth) {
            if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
                destination = destination.parent(); // would have descended, so pop destination stack
            }
        }
    }

    /**
     * Copies the whitelisted content of {@code source} into {@code dest}.
     *
     * @return number of elements, attributes and other nodes that were discarded
     */
    private int copySafeNodes(Element source, Element dest) {
        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
        NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
        traversor.traverse(source);
        return cleaningVisitor.numDiscarded;
    }

    /**
     * Creates an empty copy of {@code sourceEl} carrying only its whitelisted attributes plus
     * the attributes the whitelist enforces for that tag.
     */
    private ElementMeta createSafeElement(Element sourceEl) {
        String sourceTag = sourceEl.tagName();
        Attributes destAttrs = new Attributes();
        Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
        int numDiscarded = 0;
        Attributes sourceAttrs = sourceEl.attributes();
        for (Attribute sourceAttr : sourceAttrs) {
            if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
                destAttrs.put(sourceAttr);
            else
                numDiscarded++;
        }
        Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
        destAttrs.addAll(enforcedAttrs);
        return new ElementMeta(dest, numDiscarded);
    }

    /** Pairs a cleaned element with the number of attributes dropped while creating it. */
    private static class ElementMeta {
        Element el;
        int numAttribsDiscarded;

        ElementMeta(Element el, int numAttribsDiscarded) {
            this.el = el;
            this.numAttribsDiscarded = numAttribsDiscarded;
        }
    }
}
Once you have both you could do cleaning as normal. Like
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.CustomCleaner;
import org.jsoup.safety.Whitelist;
/**
 * Demo driver: parses t2.html, cleans it with the relaxed whitelist extended by
 * {@code <script>}, and prints the cleaned HTML.
 */
public class CustomJsoupSanitizer {
    public static void main(String[] args) {
        try {
            Document dirty = Jsoup.parse(new File("t2.html"), "UTF-8");
            Whitelist rules = Whitelist.relaxed().addTags("script");
            Document sanitized = new CustomCleaner(rules).clean(dirty);
            System.out.println(sanitized.html());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
This will give you the sanitized output for above html as
<!DOCTYPE html>
<html>
<head>
<!-- This is a script -->
<script>
function newFun() {
alert(1);
}
</script>
</head>
<body>
<!-- This is another comment. -->
<div>
Test
</div>
</body>
</html>
You can customize the cleaner to match your requirement. i.e to avoid head node or script tag etc...
The Jsoup Cleaner doesn't give you a chance here (l. 100):
} else { // else, we don't care about comments, xml proc instructions, etc
numDiscarded++;
}
Only instances of Element and TextNode may remain in the cleaned HTML.
Your only chance may be something horrible like parsing the document, replacing the comments and the doctype with a special whitelisted tag, cleaning the document and then parsing and replacing the special tags again.

How to clean JTextPanes/JEditorPanes html content to string in Java?

I try to get pretty (cleaned) text content from JTextPane. Here is example code from JTextPane:
// Put an HTML fragment into the pane, then read it back:
// getText() returns the complete HTML document, not just the fragment.
JTextPane textPane = new JTextPane();
textPane.setContentType("text/html");
textPane.setText("This <b>is</b> a <b>test</b>.");
String text = textPane.getText();
System.out.println(text);
The text looks like this in the JTextPane:
This is a test.
I get this kind of print to console:
<html>
<head>
</head>
<body>
This <b>is</b> a <b>test</b>.
</body>
</html>
I've used substring() and/or replace() code, but it is uncomfortable to use:
String text = textPane.getText ().replace ("<html> ... <body>\n , "");
Is there any simple function to remove all other tags than <b> tags (content) from string?
Sometimes JTextPane add <p> tags around content so I want to get rid of them also.
Like this:
<html>
<head>
</head>
<body>
<p style="margin-top: 0">
hdfhdfgh
</p>
</body>
</html>
I want to get only text content with tags:
This <b>is</b> a <b>test</b>.
I subclassed HTMLWriter and overrode startTag and endTag to skip all tags outside of <body>.
I did not test much, it seems to work ok. One drawback is that the output string has quite a lot of whitespace. Getting rid of that shouldn't be too hard.
import java.io.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * Demo: writes the content of a JTextPane as HTML, leaving out every tag outside
 * {@code <body>} as well as the {@code <body>} tags themselves.
 */
public class Foo {
    public static void main(String[] args) throws Exception {
        JTextPane textPane = new JTextPane();
        textPane.setContentType("text/html");
        textPane.setText("<p>This</p> <b>is</b> a <b>test</b>.");
        StringWriter writer = new StringWriter();
        HTMLDocument doc = (HTMLDocument) textPane.getStyledDocument();
        HTMLWriter htmlWriter = new OnlyBodyHTMLWriter(writer, doc);
        htmlWriter.write();
        System.out.println(writer.toString());
    }

    /** HTMLWriter that only emits start and end tags for elements nested inside the body. */
    private static class OnlyBodyHTMLWriter extends HTMLWriter {
        /** True while the writer is between the body start tag and the body end tag. */
        private boolean inBody = false;

        public OnlyBodyHTMLWriter(Writer w, HTMLDocument doc) {
            super(w, doc);
        }

        /** Returns whether {@code elem} is the {@code <body>} element. */
        private boolean isBody(Element elem) {
            // copied from HTMLWriter.startTag()
            AttributeSet attr = elem.getAttributes();
            Object nameAttribute = attr
                    .getAttribute(StyleConstants.NameAttribute);
            HTML.Tag name = null;
            if (nameAttribute instanceof HTML.Tag) {
                name = (HTML.Tag) nameAttribute;
            }
            return name == HTML.Tag.BODY;
        }

        @Override
        protected void startTag(Element elem) throws IOException,
                BadLocationException {
            if (inBody) {
                super.startTag(elem);
            }
            // Flip the flag after the check so the <body> tag itself is not written.
            if (isBody(elem)) {
                inBody = true;
            }
        }

        @Override
        protected void endTag(Element elem) throws IOException {
            // Flip the flag before the check so the </body> tag itself is not written.
            if (isBody(elem)) {
                inBody = false;
            }
            if (inBody) {
                super.endTag(elem);
            }
        }
    }
}
You could use the HTML parser that the JEditorPane uses itself, HTMLEditorKit.ParserDelegator.
See this example, and the API docs.
I found a solution to this problem using the substring and replace methods:
// Get the textPane content as a string
String text = textPane.getText();
// Then take a substring to remove the wrapper tags (html, head, body).
// NOTE(review): 44 and 19 are hard-coded lengths of the leading and trailing wrapper markup;
// they only hold while JTextPane keeps emitting exactly that markup - confirm before reuse.
text = text.substring(44, text.length() - 19);
// Sometimes the pane wraps the content in <p style="margin-top: 0"> ... </p>, so remove those too.
// This step is optional.
text = text.replace("<p style=\"margin-top: 0\">\n ", "").replace("\n </p>", "");
// Convert HTML escape sequences back to plain characters, for example &amp; -> &
text = StringEscapeUtils.unescapeHtml(text);
There is link to StringEscapeUtils -libraries which convert escape characters back to normal view. Thanks to Ozhan Duz for the suggestion.
(commons-lang - download)
// Plain text of the document (no tags). The length must come from the Document itself:
// textPane.getText() returns the HTML markup, which is longer than the document content,
// so using its length makes getText(...) throw BadLocationException.
String text = textPane.getDocument().getText(0, textPane.getDocument().getLength());

Categories

Resources