How to get href using jsoup - java

I have some URL. I want to get all the hrefs from the HTML that the URL points to, and then all the hrefs from each of those hrefs, recursively. The point is that I want to set the depth of that "recursion".
For example, if depth = 1, I need only the hrefs from the HTML. If depth = 2, I need the hrefs from the HTML (call that list1) and the hrefs from each href in list1, and so on.
Here is what I have using jsoup:
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
public class Parser {

    private final static String FILE_PATH = "src/main/resources/href.txt";

    private List<String> result;
    private int currentDepth;
    private int maxDepth;

    public Parser(int maxDepth) {
        result = new ArrayList<String>();
        this.maxDepth = maxDepth;
    }

    public void parseURL(String url) throws IOException {
        url = url.toLowerCase();
        if (!result.contains(url)) {
            Connection connection = Jsoup.connect(url);
            Document document = connection.get();
            Elements links = document.select("a[href]");
            for (Element link : links) {
                String href = link.attr("href");
                result.add(href);
                parseURL(link.absUrl("href"));
                currentDepth++;
                if (currentDepth == maxDepth)
                    return;
            }
        }
    }
}
How should I fix the recursion condition to make it right?

I think you should check the depth first before calling the recursive function.
if (currentDepth >= maxDepth) {
    // do nothing
} else {
    parseURL(...)
}
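As a concrete illustration of that idea, here is a minimal sketch (my code, not the original poster's) that passes the current depth along as a parameter instead of bumping the shared currentDepth field inside the loop; the shared counter advances once per link rather than once per level and is never reset, which is why the cut-off misbehaves. It assumes the same result and maxDepth fields as in the question.

public void parseURL(String url) throws IOException {
    parseURL(url, 0); // start at depth 0
}

private void parseURL(String url, int depth) throws IOException {
    url = url.toLowerCase();
    if (depth >= maxDepth || result.contains(url)) {
        return; // stop before fetching once the allowed depth is reached
    }
    Document document = Jsoup.connect(url).get();
    for (Element link : document.select("a[href]")) {
        result.add(link.attr("href")); // same bookkeeping as in the question
        parseURL(link.absUrl("href"), depth + 1); // children are one level deeper
    }
}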

public void parseURL(String url) throws IOException {
    url = url.toLowerCase();
    if (!result.contains(url)) {
        Connection connection = Jsoup.connect(url);
        Document document = connection.get();
        Elements links = document.getElementsByAttribute("href");
        // Elements links = document.select("a[href]");
        for (Element link : links) {
            String href = link.attr("href");
            result.add(href);
            parseURL(link.absUrl("href"));
            currentDepth++;
            if (currentDepth == maxDepth)
                return;
        }
    }
}
You can try this in your code: the method getElementsByAttribute(String attribute) returns all elements that have the specified attribute.
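One detail worth keeping in mind when choosing between the two calls (a short illustrative snippet, assuming the same document variable as in the method above): getElementsByAttribute matches every element carrying the attribute, not just anchors.

// Matches <a href>, but also <link href>, <area href>, <base href>, etc.
Elements anyHref = document.getElementsByAttribute("href");

// Restricts the match to <a> tags only.
Elements anchorHref = document.select("a[href]");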

Related

read xml content from http endpoint like a map in Java

How could I get a value based on a key from an XML content HTTP endpoint? It is something like
<authority.result result="found 7 matches" startToken="xxxxxxx">
<TestEntry keyId="0right0" test="test" valueId="rightValue123" version="1"/>
<TestEntry keyId="0wrong" test="test" valueId="0wrongValue" version="1"/>
<TestEntry keyId="0wrong0" test="test" valueId="wrong" version="1"/>
</authority.result>
I would like to get the valueId only when keyId == "0right0". I previously wrote the following but could not get the value for a specific key.
URL url = new URL(endpoint);
XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(url.openStream());
String latest;
while (reader.hasNext()) {
    if (reader.next() == XMLStreamConstants.START_ELEMENT) {
        if (reader.getLocalName().equals("valueId")) {
            latest = reader.getElementText();
            return latest;
        }
    }
}
You need to distinguish an XML element from an attribute. To read an attribute's name and value you have to use the getAttributeName and getAttributeValue methods, respectively.
Below is example code showing how to read attributes:
import javax.xml.namespace.QName;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Optional;
public class XmlStreamApp {

    public static void main(String[] args) throws IOException, XMLStreamException {
        ...
        XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(stream);
        Optional<String> value = findValueForTestEntry(reader, "0right0");
        System.out.println(value);
    }

    private static Optional<String> findValueForTestEntry(XMLStreamReader reader, String keyValue) throws XMLStreamException {
        while (reader.hasNext()) {
            if (reader.next() == XMLStreamConstants.START_ELEMENT) {
                String localName = reader.getLocalName();
                if ("TestEntry".equals(localName)) {
                    Optional<String> optionalValue = getValueForKey(reader, keyValue);
                    if (optionalValue.isPresent()) {
                        return optionalValue;
                    }
                }
            }
        }
        return Optional.empty();
    }

    private static Optional<String> getValueForKey(XMLStreamReader reader, String keyValue) {
        String value = "";
        boolean found = false;
        for (int attr = reader.getAttributeCount() - 1; attr >= 0; attr--) {
            QName attributeName = reader.getAttributeName(attr);
            if (attributeName.getLocalPart().equals("keyId")) {
                found = keyValue.equals(reader.getAttributeValue(attr));
            }
            if (attributeName.getLocalPart().equals("valueId")) {
                value = reader.getAttributeValue(attr);
            }
        }
        return found ? Optional.of(value) : Optional.empty();
    }
}
Above code prints:
Optional[rightValue123]
You could also use an XPath expression to get at the desired value:
string(//TestEntry[@keyId="0right0"]/@valueId)
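For completeness, a minimal sketch of evaluating that expression with the JDK's built-in XPath API; the endpoint URL is a placeholder and the surrounding setup is my assumption, not taken from the question. Unlike the streaming StAX approach above, this parses the whole response into a DOM tree first, which is fine for small payloads.

import java.io.InputStream;
import java.net.URL;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class XPathExample {
    public static void main(String[] args) throws Exception {
        String endpoint = "http://example.com/authority"; // assumed placeholder URL
        try (InputStream in = new URL(endpoint).openStream()) {
            // Parse the whole response into a DOM tree.
            Document doc = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder()
                    .parse(in);
            XPath xpath = XPathFactory.newInstance().newXPath();
            // string(...) returns the attribute text, or "" if nothing matches.
            String valueId = xpath.evaluate(
                    "string(//TestEntry[@keyId='0right0']/@valueId)", doc);
            System.out.println(valueId); // expected: rightValue123
        }
    }
}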

Need to display the WebCrawler output in the form of tree-like structure

I have the below code to fetch the pages inside a given URL, but I am not sure how to display them in a tree-like structure.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.HashSet;

public class BasicWebCrawler {

    private HashSet<String> links;

    public BasicWebCrawler() {
        links = new HashSet<String>();
    }

    public void getPageLinks(String URL) {
        //4. Check if you have already crawled the URLs
        //(we are intentionally not checking for duplicate content in this example)
        if (!links.contains(URL)) {
            try {
                //4. (i) If not add it to the index
                if (links.add(URL)) {
                    System.out.println(URL);
                }
                //2. Fetch the HTML code
                Document document = Jsoup.connect(URL).get();
                //3. Parse the HTML to extract links to other URLs
                Elements linksOnPage = document.select("a[href^=\"" + URL + "\"]");
                //5. For each extracted URL... go back to Step 4.
                for (Element page : linksOnPage) {
                    getPageLinks(page.attr("abs:href"));
                }
            } catch (IOException e) {
                System.err.println("For '" + URL + "': " + e.getMessage());
            }
        }
    }

    public static void main(String[] args) {
        //1. Pick a URL from the frontier
        new BasicWebCrawler().getPageLinks("https://www.wikipedia.com/");
    }
}
Okay, I think I managed to do what you asked: when all links on the site are checked, or the site has no links, the recursion finishes. On the open internet that is not really doable, though; it's funny how far you can get from one site just by following the first unchecked link:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.HashSet;
public class BasicWebCrawler {

    private HashSet<String> links;

    public BasicWebCrawler() {
        links = new HashSet<String>();
    }

    public void getPageLinks(String URL, int level) {
        //4. Check if you have already crawled the URLs
        //(we are intentionally not checking for duplicate content in this example)
        if (!links.contains(URL)) {
            try {
                //4. (i) If not add it to the index
                if (links.add(URL)) {
                    for (int i = 0; i < level; i++) {
                        System.out.print("-");
                    }
                    System.out.println(URL);
                }
                //2. Fetch the HTML code
                Document document = Jsoup.connect(URL).get();
                //3. Parse the HTML to extract links to other URLs
                Elements linksOnPage = document.select("a[href]");
                //5. For each extracted URL... go back to Step 4.
                for (Element page : linksOnPage) {
                    getPageLinks(page.attr("abs:href"), level + 1);
                }
            } catch (IOException e) {
                System.err.println("For '" + URL + "': " + e.getMessage());
            }
        }
    }

    public static void main(String[] args) {
        //1. Pick a URL from the frontier
        new BasicWebCrawler().getPageLinks("http://mysmallwebpage.com/", 0);
    }
}
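Since an unbounded crawl of the open web never finishes, one follow-up you might add (my suggestion, not part of the answer above) is a depth cap. A sketch of the change, assuming it replaces getPageLinks in the class above and that MAX_LEVEL is an arbitrary cut-off:

private static final int MAX_LEVEL = 3; // hypothetical cut-off, tune as needed

public void getPageLinks(String URL, int level) {
    // links.add returns false if the URL was already crawled.
    if (level > MAX_LEVEL || !links.add(URL)) {
        return;
    }
    for (int i = 0; i < level; i++) {
        System.out.print("-");
    }
    System.out.println(URL);
    try {
        Document document = Jsoup.connect(URL).get();
        for (Element page : document.select("a[href]")) {
            getPageLinks(page.attr("abs:href"), level + 1);
        }
    } catch (IOException e) {
        System.err.println("For '" + URL + "': " + e.getMessage());
    }
}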

How to add an exception to not parse some types of files using jsoup in Java?

I am writing a web crawler program using the jsoup library. (Sorry, I cannot post my code because it is too long to post here.) I need to crawl only URLs that can lead me to new links, without crawling URLs (starting with http or https) that end in image, PDF, RAR or ZIP files. I only need to crawl URLs ending with .html, .htm, .jsp, .php, .asp, etc.
I have two questions regarding this issue:
1- How can I prevent the program from reading unneeded URLs (like images, PDFs or RARs)?
2- How can I improve this class so that it does not waste time loading the whole URL content into memory and then parsing URLs out of it?
This is my code below:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;
public class HTMLParser {

    private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
    private static HashMap<String, Integer> filecounter = new HashMap<>();

    public static List<LinkNodeLight> parse(LinkNode inputLink) {
        List<LinkNodeLight> outputLinks = new LinkedList<>();
        try {
            inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
            String url = inputLink.getUrl();
            if (inputLink.getIpAdress() != null) {
                url.replace(URLWeight.getHostName(url), inputLink.getIpAdress());
            }
            Document parsedResults = Jsoup
                .connect(url)
                .timeout(READ_TIMEOUT_IN_MILLISSECS)
                .userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
                .get();
            inputLink.setSize(parsedResults.html().length());
            /* IP address moved here in order to speed up the process */
            inputLink.setStatus(LinkNodeStatus.OK);
            inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));
            if (true) {
                /* save the file to the html */
                String filename = parsedResults.title(); //digestBig.toString(16) + ".html";
                if (filename.length() > 24) {
                    filename = filename.substring(0, 24);
                }
                filename = filename.replaceAll("[^\\w\\d\\s]", "").trim();
                filename = filename.replaceAll("\\s+", " ");
                if (!filecounter.containsKey(filename)) {
                    filecounter.put(filename, 1);
                } else {
                    Integer tmp = filecounter.remove(filename);
                    filecounter.put(filename, tmp + 1);
                }
                filename = filename + "-" + (filecounter.get(filename)).toString() + ".html";
                filename = Paths.get("downloads", filename).toString();
                inputLink.setFileName(filename);
                /* use md5 of url as file name */
                try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
                    out.println("<!--" + inputLink.getUrl() + "-->");
                    out.print(parsedResults.html());
                    out.flush();
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            String tag;
            Elements tagElements;
            List<LinkNode> result;

            tag = "a[href";
            tagElements = parsedResults.select(tag);
            result = toLinkNodeObject(inputLink, tagElements, tag);
            outputLinks.addAll(result);

            tag = "area[href";
            tagElements = parsedResults.select(tag);
            result = toLinkNodeObject(inputLink, tagElements, tag);
            outputLinks.addAll(result);
        } catch (IOException e) {
            inputLink.setParseException(e);
            inputLink.setStatus(LinkNodeStatus.ERROR);
        }
        return outputLinks;
    }

    static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
        List<LinkNode> links = new LinkedList<>();
        for (Element element : tagElements) {
            if (isFragmentRef(element)) {
                continue;
            }
            String absoluteRef = String.format("abs:%s", tag.contains("[") ? tag.substring(tag.indexOf("[") + 1, tag.length()) : "href");
            String url = element.attr(absoluteRef);
            if (url != null && url.trim().length() > 0) {
                LinkNode link = new LinkNode(url);
                link.setTag(element.tagName());
                link.setParentLink(parentLink);
                links.add(link);
            }
        }
        return links;
    }

    static boolean isFragmentRef(Element element) {
        String href = element.attr("href");
        return href != null && (href.trim().startsWith("#") || href.startsWith("mailto:"));
    }
}
To add another solution to Pshemo's, for your first question: you may want to build a regex to compare against, so that you don't even take the element and put it in the list.
In the method static List<LinkNode> toLinkNodeObject, maybe something like
"[http].+[^(pdf|rar|zip)]" and match your URL against the regex. This will also speed up the program because you won't even be adding those links to parse.
String url = element.attr(absoluteRef);
if (url != null && url.trim().length() > 0
        && url.matches("[http].+[^(pdf|rar|zip)]")) {
    LinkNode link = new LinkNode(url);
    link.setTag(element.tagName());
    link.setParentLink(parentLink);
    links.add(link);
}
As for speeding up the class as a whole, it would help to multithread the downloading and parsing, allowing multiple threads to fetch and validate the information.
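A caveat on the pattern above, plus a sketch for the second question. In Java regex syntax, [http] is a character class that matches a single letter and [^(pdf|rar|zip)] negates single characters, so the suggested pattern does not really filter by extension. Below is a minimal sketch of a corrected filter and of a cheap Content-Type probe using a HEAD request, so non-HTML responses can be skipped without downloading the whole body; the extension list and the helper names (looksCrawlable, isHtml) are my own assumptions, not from the original code.

// Keep http(s) URLs that do NOT end in one of the unwanted extensions.
// (?i) = case-insensitive, (?<!...) = negative lookbehind anchored at the end.
private static final java.util.regex.Pattern CRAWLABLE =
        java.util.regex.Pattern.compile("(?i)^https?://.+(?<!\\.(pdf|rar|zip|jpg|jpeg|png|gif))$");

static boolean looksCrawlable(String url) {
    return url != null && CRAWLABLE.matcher(url.trim()).matches();
}

// For question 2: a HEAD request fetches only the response headers, so you can
// reject non-HTML content before paying for the full download.
static boolean isHtml(String url) throws java.io.IOException {
    org.jsoup.Connection.Response response = org.jsoup.Jsoup.connect(url)
            .method(org.jsoup.Connection.Method.HEAD)
            .timeout(10000)
            .execute();
    String contentType = response.contentType();
    return contentType != null && contentType.startsWith("text/html");
}

Some servers answer HEAD requests poorly, so in practice you may still need a GET fallback (for example with ignoreContentType(true)) when the probe fails.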

How can I get all website links recursively?

I need to write code which will get all the links in a website recursively. Since I'm new to this, here is what I've got so far:
List<WebElement> no = driver.findElements(By.tagName("a"));
nooflinks = no.size();
for (WebElement pagelink : no) {
    String linktext = pagelink.getText();
    link = pagelink.getAttribute("href");
}
Now what I need to do is: if the list finds a link from the same domain, it should get all the links from that URL, then return to the previous loop and resume from the next link. This should go on until the last URL in the whole website is found. For example, the home page is the base URL and it has 5 URLs to other pages; after getting the first of the 5 URLs, the loop should get all the links of that first URL, return to the home page, and resume from the second URL. If the second URL has sub-sub-URLs, the loop should find links for those first, then resume with the second URL, and then go back to the home page and resume from the third URL.
Can anybody help me out here???
I saw this post recently. I don't know whether you are still looking for a solution to this problem; if you are, I thought this might be useful:
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.Iterator;
public class URLReading {

    public static void main(String[] args) {
        try {
            HashMap<String, String> h = new HashMap<>();
            String url = "https://abidsukumaran.wordpress.com/";
            Document doc = Jsoup.connect(url).get();

            // Page Title
            String title = doc.title();
            //System.out.println("title: " + title);

            // Links in page
            Elements links = doc.select("a[href]");
            List<String> url_array = new ArrayList<>();
            int i = 0;
            url_array.add(url);
            String root = url;
            h.put(url, title);
            Iterator<String> keySetIterator = h.keySet().iterator();
            while (i <= h.size()) {
                try {
                    url = url_array.get(i);
                    doc = Jsoup.connect(url).get();
                    title = doc.title();
                    links = doc.select("a[href]");
                    for (Element link : links) {
                        String res = h.putIfAbsent(link.attr("href"), link.text());
                        if (res == null) {
                            url_array.add(link.attr("href"));
                            System.out.println("\nURL: " + link.attr("href"));
                            System.out.println("CONTENT: " + link.text());
                        }
                    }
                } catch (Exception e) {
                    System.out.println("\n" + e);
                }
                i++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
You can use Set and HashSet. You may try something like this:
Set<String> getLinksFromSite(int Level, Set<String> Links) {
    if (Level < 5) {
        Set<String> locallinks = new HashSet<String>();
        for (String link : Links) {
            Set<String> new_links = /* left for the reader: extract the links found at `link` */;
            locallinks.addAll(getLinksFromSite(Level + 1, new_links));
        }
        return locallinks;
    } else {
        return Links;
    }
}
I would think the following idiom would be useful in this context:
Set<String> visited = new HashSet<>();
Deque<String> unvisited = new LinkedList<>();

unvisited.add(startingURL);
while (!unvisited.isEmpty()) {
    String current = unvisited.poll();
    visited.add(current);
    for /* each link in current */ {
        if (!visited.contains(link.url()))
            unvisited.add(link.url());
    }
}
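Fleshing that idiom out into a runnable sketch with jsoup (my own illustration, not from the answer above; the same-domain filter via startsWith is an assumption you may want to relax):

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkCollector {

    // Breadth-first crawl: the visited set prevents loops, the deque holds the frontier.
    public static Set<String> collectLinks(String startingURL) {
        Set<String> visited = new HashSet<>();
        Deque<String> unvisited = new ArrayDeque<>();
        unvisited.add(startingURL);

        while (!unvisited.isEmpty()) {
            String current = unvisited.poll();
            if (!visited.add(current)) {
                continue; // already crawled this URL
            }
            try {
                Document doc = Jsoup.connect(current).get();
                for (Element link : doc.select("a[href]")) {
                    String url = link.absUrl("href"); // resolve relative hrefs
                    // Assumed restriction: stay on the same site as the start URL.
                    if (!url.isEmpty() && url.startsWith(startingURL) && !visited.contains(url)) {
                        unvisited.add(url);
                    }
                }
            } catch (IOException e) {
                System.err.println("Skipping '" + current + "': " + e.getMessage());
            }
        }
        return visited;
    }

    public static void main(String[] args) {
        collectLinks("https://www.wikipedia.com/").forEach(System.out::println);
    }
}

Because the frontier is a queue rather than a call stack, this also sidesteps the StackOverflowError risk that deep recursion carries on large sites.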

Pulling all urls from any webpage, having trouble with indexOf [homework]

The indexOf always returns negative 7 no matter what I put. I will be using the website http://www.columbusstate.edu
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;

public class WebCrawler {

    private static int linkCount = 0;

    public static void main(String[] args) throws IOException {
        // instance variables
        ArrayList<String> links = new ArrayList<String>();
        System.out.println("Enter the website you would like to web crawl");
        Scanner input = new Scanner(System.in);
        String address = input.next();

        // goes to website
        URL locator = new URL(address);
        Scanner in = new Scanner(locator.openStream());
        String str = "";
        PrintWriter out = new PrintWriter("links.txt");

        // searches the webpage and pulls the links, or it should anyways.
        while (in.hasNextLine()) {
            str = in.next();
            if (str.contains("href=\"http://")) {
                linkCount++;
                int start = str.indexOf("ht");
                int end = str.indexOf("/\"");
                if (links.contains(str.substring(start, end))) {
                } else {
                    links.add("Line Number " + linkCount + "" + str.substring(start, end));
                }
            } else if (str.contains("href=\"https://")) {
                linkCount++;
                int start = str.indexOf("ht");
                int end = str.indexOf("://") + 15;
                if (links.contains(str.substring(start, end))) {
                } else {
                    links.add("Line Number " + linkCount + "" + str.substring(start, end));
                }
            }
        }
        int num = links.size();
        System.out.println(num);
        out.println("Number of links on this webpage is " + linkCount);
        out.println("Links are:");
        for (int i = links.size() - 1; i > 0; i--) {
            out.println(links.get(i));
        }
        out.close();
    }
}
If you are really looking for a way to extract links from a web page, then it's better to use a proper HTML parser than to try to do it manually. Here is an example with jsoup:
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HTMLUtils {

    private HTMLUtils() {}

    public static List<String> extractLinks(String url) throws IOException {
        final ArrayList<String> result = new ArrayList<String>();
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("a[href]");

        // href ...
        for (Element link : links) {
            result.add(link.attr("abs:href"));
            // result.add(link.text());
        }
        return result;
    }

    public final static void main(String[] args) throws Exception {
        String site = "http://www.columbusstate.edu";
        List<String> links = HTMLUtils.extractLinks(site);
        for (String link : links) {
            System.out.println(link);
        }
    }
}
