How can I re-implement this using a concurrent executor, or just a much better approach, meaning a ThreadPoolExecutor?
Basically I want the crawler to crawl the given URLs, and maybe later follow the URLs it finds to other websites, and so on.
package Mainpackge;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class main {
public static void main(String[] args) {
//List of URLs to collect data from
String[] urls = new String[]{
"http://www.answers.com/",
"http://www.britannica.com/",
"https://ie.yahoo.com/?p=us",
"https://en.wikipedia.org/wiki/Main_Page",
"http://ww w.worldbook.com/",
"http://www.computerlanguage.com/",
"http://www.howstuffworks.com/",
"http://www.dmoz.org/Computers/Computer_Science/"
};
// Create and start workers
List<Worker> workers = new ArrayList<>(urls.length);
for (String url : urls) {
Worker w = new Worker(url);
workers.add(w);
new Thread(w).start();
}
// Retrieve results
for (Worker w : workers) {
Elements results = w.waitForResults();
if (results != null)
for (Element result : results) {
System.out.println(w.getName()+": "+result.absUrl("href"));
}
else
System.err.println(w.getName()+" had some error!");
}
}
}
class Worker implements Runnable {
private String url;
private Elements results;
private String name;
private static int number = 0;
private final Object lock = new Object();
public Worker(String url) {
this.url = url;
this.name = "Worker-" + (number++);
}
public String getName() {
return name;
}
@Override
public void run() {
try {
Document doc = Jsoup.connect(this.url).get();
Elements links = doc.select("a");
// Update results
synchronized (lock) {
this.results = links;
lock.notifyAll();
}
} catch (IOException e) {
// You should implement better error handling here.
System.err.println("Error while parsing: "+this.url);
e.printStackTrace();
}
}
public Elements waitForResults() {
synchronized (lock) {
try {
while (this.results == null) {
lock.wait();
}
return this.results;
} catch (InterruptedException e) {
// Again better error handling
e.printStackTrace();
}
return null;
}
}
}
Full example using an ExecutorService and Callable implementation for your threads.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
public class ThreadPoolExample {
public static void main(String[] args) throws InterruptedException, ExecutionException {
List<String> urls = Arrays.asList(new String[]{
"http://www.answers.com/",
"http://www.britannica.com/",
"https://ie.yahoo.com/?p=us",
"https://en.wikipedia.org/wiki/Main_Page",
"http://ww w.worldbook.com/",
"http://www.computerlanguage.com/",
"http://www.howstuffworks.com/",
"http://www.dmoz.org/Computers/Computer_Science/"
});
ExecutorService ex = Executors.newFixedThreadPool(10);
List<Future<Element>> results = new ArrayList<>();
for (String string : urls) {
results.add(ex.submit(new Crawler(string)));
}
for (Future<Element> future : results) {
// Get will wait for the thread to be done
for (String url : future.get().urls) {
// Submit a new task for each URL you found!
ex.submit(new Crawler(url));
}
}
ex.shutdown();
ex.awaitTermination(2, TimeUnit.SECONDS); // give the queued tasks a moment to finish
}
public static class Crawler implements Callable<Element>{
String url;
public Crawler(String url) {
this.url = url;
}
@Override
public Element call() throws Exception {
// Implement your crawling logic and return your elements
return new Element(Arrays.asList(new String[]{"all new urls", "that you found while crawling"}));
}
}
public static class Element{
List<String> urls;
public Element(List<String> urls) {
this.urls = urls;
}
@Override
public String toString() {
return "Elements found : " + urls.size();
}
}
}
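Note that the loop above only drains the first level of futures; the follow-up tasks submitted inside it are never collected. Below is a sketch of one way to follow discovered links to completion, using an ExecutorCompletionService plus a visited set and an in-flight counter. It reuses the Crawler and Element placeholders from the example, so it assumes it lives in the same class:
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;

static void crawlAll(ExecutorService ex, List<String> seeds)
        throws InterruptedException, ExecutionException {
    CompletionService<Element> cs = new ExecutorCompletionService<>(ex);
    Set<String> visited = ConcurrentHashMap.newKeySet(); // skip already-seen URLs
    AtomicInteger inFlight = new AtomicInteger();
    for (String url : seeds) {
        if (visited.add(url)) {
            inFlight.incrementAndGet();
            cs.submit(new Crawler(url));
        }
    }
    while (inFlight.get() > 0) {
        Element found = cs.take().get(); // blocks until the next task finishes
        inFlight.decrementAndGet();
        for (String url : found.urls) {  // follow every newly discovered URL
            if (visited.add(url)) {
                inFlight.incrementAndGet();
                cs.submit(new Crawler(url));
            }
        }
    }
}
The visited set is what keeps the recursion finite; without it the crawler would resubmit the same pages forever.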
Why would this code be having memory issues? It runs fine once, and then when I try to run it again it hangs on "Enabling plugin". It then gives me an OutOfMemoryError such as
"Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "Worker-Main-10""
The code I am using, built on the Spigot API, is as follows:
import org.bukkit.Bukkit;
import org.bukkit.ChatColor;
import org.bukkit.entity.Bat;
import org.bukkit.entity.Entity;
import org.bukkit.entity.Player;
import org.bukkit.plugin.java.JavaPlugin;
import org.bukkit.scheduler.BukkitScheduler;
import java.io.*;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.UUID;
public class COVID19 extends JavaPlugin {
private static ArrayList<CovidInfection> infections;
@Override
public void onEnable() {
infections = new ArrayList<CovidInfection>();
System.out.println("1");
try {
readInfections();
} catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
}
System.out.println("2");
this.getCommand("getInfected").setExecutor(new CommandGetInfected());
BukkitScheduler scheduler = getServer().getScheduler();
scheduler.scheduleSyncRepeatingTask(this, new Runnable() {
@Override
public void run() {
batCovid();
}
}, 0, 10);
System.out.println(4);
}
@Override
public void onDisable() {
try {
writeInfections();
} catch (IOException ioe) {
ioe.printStackTrace();
}
}
public void batCovid() {
System.out.println(3);
for(Player player : Bukkit.getOnlinePlayers()) {
for(Entity nearby : player.getNearbyEntities(6, 6, 6)) {
if (nearby instanceof Bat) {
String name = player.getName();
UUID uuid = player.getUniqueId();
infections.add(new CovidInfection(uuid, name, 14));
}
}
}
}
public void readInfections() throws FileNotFoundException {
File file = new File("infected.txt");
if(file.length() == 0) {
return;
}
Scanner input = new Scanner(file);
String line = input.nextLine();
while (!(line.equals(""))) {
infections.add(parseInfectionLine(line));
}
input.close();
}
public void writeInfections() throws IOException {
//File will be written as UUID,Name,DaysRemaining
FileWriter writer = new FileWriter("infected.txt", false);
for(CovidInfection infection : infections) {
writer.write(infection.toString());
}
writer.close();
}
private CovidInfection parseInfectionLine(String line) {
String[] words = line.replace("\n","").split(",");
return new CovidInfection(UUID.fromString(words[0]), words[1], Integer.parseInt(words[2]));
}
public static String getInfected() {
String compiled = "";
for (CovidInfection infection : infections) {
compiled += infection.toString() + "\n";
}
return compiled;
}
}
import org.bukkit.ChatColor;
import org.bukkit.command.Command;
import org.bukkit.command.CommandExecutor;
import org.bukkit.command.CommandSender;
import org.bukkit.entity.Player;
public class CommandGetInfected implements CommandExecutor {
@Override
public boolean onCommand(CommandSender sender, Command cmd, String label, String[] args) {
String message = COVID19.getInfected();
if(!(message.equals(""))) {
sender.sendMessage(message);
} else {
sender.sendMessage("There are no infected!");
}
return(true);
}
}
import java.util.UUID;
public class CovidInfection {
private UUID uuid;
private String name;
private int days;
public CovidInfection(UUID uuid, String name, int days) {
this.uuid = uuid;
this.name = name;
this.days = days;
}
public int getDays() {
return days;
}
public String getName() {
return name;
}
public UUID getUuid() {
return uuid;
}
public void newDay() {
days--;
}
public String toString() {
return uuid.toString() + "," + name + "," + days + "\n";
}
}
Any help would be greatly appreciated, thank you!
Firstly, you are making a blocking I/O request on the main thread.
To fix this issue, use multithreading, as explained here or here.
Also note that the loop in readInfections never reads another line: it parses the same first line forever, adding infections until the heap fills up, which is the most likely source of the OutOfMemoryError.
Then, this:
Scanner input = new Scanner(file);
String line = input.nextLine();
can't be used in a server.
An input like that already exists: it's the console sender.
For that, I suggest you use ServerCommandEvent and Spigot's console.
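A minimal sketch of what moving the work off the main thread looks like for the file read in onEnable, using the Bukkit scheduler (note that anything the async task shares with the main thread, such as the infections list, would then also need to be made thread-safe):
// Sketch: move the blocking file I/O off the main server thread.
getServer().getScheduler().runTaskAsynchronously(this, () -> {
    try {
        readInfections();
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    }
});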
My TestNG test1 calls executor.submit(callable); that callable triggers an action, and I store the returned futures in a List<Future<T>> declared as a global variable. My second TestNG test should pick that list up, call Future.get(), and display the results.
Can anyone help me?
import com.google.common.collect.Lists;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
public class test1 {
public static final List<Future<Boolean>> callBackList = new ArrayList<>();
@DataProvider(name = "HappyTestCases")
protected Iterator<Object[]> getHappyTestCases(){
List<Integer> inputTestDataIntValue = new ArrayList<>();
inputTestDataIntValue.add(1);
inputTestDataIntValue.add(2);
inputTestDataIntValue.add(3);
inputTestDataIntValue.add(4);
inputTestDataIntValue.add(5);
inputTestDataIntValue.add(6);
inputTestDataIntValue.add(7);
inputTestDataIntValue.add(8);
inputTestDataIntValue.add(9);
inputTestDataIntValue.add(10);
List<Object[]> testCasesAsObjectArray = inputTestDataIntValue.stream().map(testCaseObj -> new Object[]{testCaseObj})
.collect(Collectors.toList());
return testCasesAsObjectArray.iterator();
}
@DataProvider(name = "callBackCases")
protected Iterator<Object[]> getcallBackCases(){
List<Object[]> testCasesAsObjectArray = callBackList.stream().map(testCaseObj -> new Object[]{testCaseObj})
.collect(Collectors.toList());
return testCasesAsObjectArray.iterator();
}
@org.testng.annotations.Test(dataProvider = "HappyTestCases")
public void runTestNgTest(Integer data) throws Exception {
RunInitiator(Arrays.asList(data));
}
@Test(dataProvider = "HappyTestCases")
public void runCallBackList(Future<Boolean> callback) throws Exception{
List<Boolean> resultList = runCallBack(Arrays.asList(callback));
for(Boolean b : resultList) {
if(b) {
System.out.println("TestCase passed for testCase Id: ");
}else {
System.out.println("TestCase failed for testCase Id: ");
}
}
}
public void RunInitiator(List<Integer> inputTestDataIntValue) throws Exception {
ExecutorService executor = Executors.newFixedThreadPool(10);
for(Integer integer : inputTestDataIntValue) {
System.out.println("intValue = " + integer);
System.out.println("executor is going to start");
Future<Boolean> callBack = executor.submit(new Task(integer));
System.out.println("Thread Name = "+ Thread.currentThread().getName());
callBackList.add(callBack);
}
System.out.println("successfully finished runIntiator");
}
public List<Boolean> runCallBack(List<Future<Boolean>> callBackList) throws Exception{
List<Boolean> resultList = Lists.newArrayList();
//iterate and give it to different test.
for(Future<Boolean> callBack : callBackList) {
Boolean result = callBack.get();
resultList.add(result);
System.out.println("result = "+ result);
}
return resultList;
}
static class Task implements Callable<Boolean> {
public Integer intValue;
public Task(int intValue) {
this.intValue = intValue;
}
@Override
public Boolean call() throws Exception {
System.out.println("Thread Name inside callable = " + Thread.currentThread().getName());
System.out.println("Sleeping for 2mins");
Thread.sleep(120000);
if (intValue < 5) {
return true;
} else {
return false;
}
}
}
}
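For what it's worth, here is a sketch of one way to wire the two tests together: as written, runCallBackList points at the HappyTestCases provider, which supplies Integers rather than futures, and nothing guarantees it runs after the initiator. The version below assumes the callBackCases provider is the intended one and adds dependsOnMethods so the futures exist before they are read:
// Sketch: consume the provider that yields Future<Boolean> objects,
// and only run once every runTestNgTest invocation has finished.
@Test(dataProvider = "callBackCases", dependsOnMethods = "runTestNgTest")
public void runCallBackList(Future<Boolean> callback) throws Exception {
    List<Boolean> resultList = runCallBack(Arrays.asList(callback));
    for (Boolean b : resultList) {
        System.out.println(b ? "TestCase passed" : "TestCase failed");
    }
}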
I am building a collection program in Java that gathers data from websites using their APIs. I am encountering a problem where it hangs on an HTTP call. I tried to work around it by executing the HTTP call through an executor service with a timeout, but that doesn't seem to work: it keeps timing out and retrying. I figured it might be something to do with the API, so after a retry I reinitialize a whole new object per website API. Still no solution. I am trying to identify the root cause but can't seem to put my finger on it.
Here is a look at my Flickr manager class that handles the calls to Flickr.
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.Collections;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.scribe.exceptions.OAuthConnectionException;
import com.flickr4java.flickr.Flickr;
import com.flickr4java.flickr.FlickrException;
import com.flickr4java.flickr.FlickrRuntimeException;
import com.flickr4java.flickr.REST;
import com.flickr4java.flickr.RequestContext;
import com.flickr4java.flickr.auth.Auth;
import com.flickr4java.flickr.auth.Permission;
import com.flickr4java.flickr.people.User;
import com.flickr4java.flickr.photos.Exif;
import com.flickr4java.flickr.photos.Extras;
import com.flickr4java.flickr.photos.Photo;
import com.flickr4java.flickr.photos.PhotoList;
import com.flickr4java.flickr.photos.SearchParameters;
import com.flickr4java.flickr.photos.Size;
import com.google.common.util.concurrent.RateLimiter;
public class FlickrManager {
private final static Logger LOG = Logger.getLogger(FlickrManager.class.getName());
private final static ExecutorService executorService = Executors.newSingleThreadExecutor();
private Flickr flickr;
private final int MAX_PER_PAGE = 500;
private final RateLimiter rateLimiter;
private String ApiKey;
private String ApiSecret;
private String authToken;
private String authTokenSecret;
private Integer hostPort;
private String hostAddress;
private String httpScheme;
public FlickrManager(Flickr flickr, double apiCallsPerSecond) throws FlickrException {
this.flickr = flickr;
flickr.getTestInterface().echo(Collections.emptyMap());
//get flickr info to reinitialize flickr object if necessary
this.ApiKey = flickr.getApiKey();
this.ApiSecret = flickr.getSharedSecret();
this.hostPort = flickr.getTransport().getPort();
this.hostAddress = flickr.getTransport().getHost();
this.httpScheme = flickr.getTransport().getScheme();
if(flickr.getAuth() != null){
this.authToken = flickr.getAuth().getToken();
this.authTokenSecret = flickr.getAuth().getTokenSecret();
}
this.rateLimiter = RateLimiter.create(apiCallsPerSecond);
}
private void initialize(){
this.flickr = null;
REST rest = new REST(this.hostAddress,this.hostPort);
rest.setScheme(this.httpScheme);
this.flickr = new Flickr(this.ApiKey, this.ApiSecret,rest);
if(this.authToken != null && this.authTokenSecret != null){
RequestContext requestContext = RequestContext.getRequestContext();
Auth auth = new Auth();
auth.setPermission(Permission.READ);
auth.setToken(this.authToken);
auth.setTokenSecret(this.authTokenSecret);
requestContext.setAuth(auth);
flickr.setAuth(auth);
}
}
public User getUserInfo(String flickrProfileId) throws FlickrException{
return doFlickrAction(new CallableFlickrTask<User>(){
@Override
public User execute() throws FlickrException {
return flickr.getPeopleInterface().getInfo(flickrProfileId);
}
});
}
public PhotoList<Photo> search(SearchParameters params, int page) throws FlickrException{
return doFlickrAction(new CallableFlickrTask<PhotoList<Photo>>(){
@Override
public PhotoList<Photo> execute() throws FlickrException {
return flickr.getPhotosInterface().search(params, MAX_PER_PAGE, page);
}
});
}
public PhotoList<Photo> getUserPhotos(String userNSID, int page) throws FlickrException{
return doFlickrAction(new CallableFlickrTask<PhotoList<Photo>>(){
@Override
public PhotoList<Photo> execute() throws FlickrException {
return flickr.getPeopleInterface().getPhotos(
userNSID,
null, null, null, null, null,
Flickr.CONTENTTYPE_PHOTO, null,
Extras.ALL_EXTRAS, 100, page);
}
});
}
//Catch the exception inside the function for a failure to get EXIF
public Collection<Exif> getPhotoExif(Photo photo) throws FlickrException, FlickrRuntimeException {
return doFlickrAction(new CallableFlickrTask<Collection<Exif>>(){
@Override
public Collection<Exif> execute() throws FlickrException {
return flickr.getPhotosInterface().getExif(photo.getId(),photo.getSecret());
}
});
}
public Collection<Size> getAvailablePhotoSizes(Photo photo) throws FlickrException{
return doFlickrAction(new CallableFlickrTask<Collection<Size>>(){
@Override
public Collection<Size> execute() throws FlickrException {
return flickr.getPhotosInterface().getSizes(photo.getId());
}
});
}
private abstract class CallableFlickrTask<T> {
public abstract T execute() throws FlickrException, FlickrRuntimeException;
}
private <T> T doFlickrAction(CallableFlickrTask<T> callable) throws FlickrException {
while(true){
rateLimiter.acquire();
Future<T> future = executorService.submit(new Callable<T>() {
@Override
public T call() throws Exception {
return callable.execute();
}});
try {
return future.get(5, TimeUnit.MINUTES);
} catch (InterruptedException e) {
LOG.log(Level.INFO,"Interrupted exception: {0}",e.getMessage());
initialize(); //initialize if it's been interrupted
} catch (ExecutionException e) {
Throwable cause = e.getCause();
if( cause instanceof UnknownHostException ||
cause instanceof SocketException ||
cause instanceof OAuthConnectionException ){
//sleep and retry
LOG.log(Level.INFO,"Unknown Host or Socket exception. Retry: {0}",e.getMessage());
try {
Thread.sleep(10000);
initialize();
} catch (InterruptedException ex) {
LOG.log(Level.INFO, "Thread sleep was interrupted exception: {0}", ex.getMessage());
}
}
//if it's not of the above exceptions, then rethrow
else if (cause instanceof FlickrException) {
throw (FlickrException) cause;
}
else {
throw new IllegalStateException(e);
}
} catch (TimeoutException e) {
LOG.log(Level.INFO,"Timeout Exception: {0}",e.getMessage());
initialize(); //initialize again after timeout
}
}
}
}
I also used jvisualvm to get a look at what the collection is doing while it's hanging. The thread dump is here: Thread dump
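One thing worth checking in doFlickrAction: the pool is a single-thread executor, and on TimeoutException the code abandons the Future without cancelling it, so a hung HTTP call keeps the only worker thread occupied and every retry simply queues behind it, timing out in turn. A sketch of the timeout branch with cancellation added (same structure as the catch block above):
} catch (TimeoutException e) {
    LOG.log(Level.INFO, "Timeout Exception: {0}", e.getMessage());
    // Interrupt the hung task so the single worker thread is freed;
    // otherwise every subsequent submit waits behind the stuck call.
    future.cancel(true);
    initialize(); // rebuild the flickr client before retrying
}
Interruption only helps if the underlying socket read honours it, so setting a read timeout on the REST transport (if the library exposes one) would be the more robust fix.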
I am developing a web crawler application. When I run the program I get the error messages below.
I got these errors after running the program for more than 3 hours. I tried to allocate more memory by changing the eclipse.ini setting to 2048 MB of RAM, as was answered in this topic, but I still get the same errors after 3 hours or less. I need to run the program non-stop for more than 2-3 days to analyse the results.
Can you tell me what I am missing that causes the errors below?
These are my classes:
seeds.txt
http://www.stanford.edu
http://www.archive.org
WebCrawler.java
package pkg.crawler;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;
public class WebCrawler {
public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5; // amount of threads
private static Set<String> processed = new LinkedHashSet <> (); // set of processed urls
private PrintWriter out; // output file
private PrintWriter err; // error file
private static Integer cntIntra = new Integer (0); // counters for intra- links in the queue
private static Integer cntInter = new Integer (0); // counters for inter- links in the queue
private static Integer dub = new Integer (0); // amount of skipped urls
public static void main(String[] args) throws Exception {
System.out.println("Running web crawler: " + new Date());
WebCrawler webCrawler = new WebCrawler();
webCrawler.createFiles();
try (Scanner in = new Scanner(new File ("seeds.txt"))) {
while (in.hasNext()) {
webCrawler.enque(new LinkNode (in.nextLine().trim()));
}
} catch (IOException e) {
e.printStackTrace();
return;
}
webCrawler.processQueue();
webCrawler.out.close();
webCrawler.err.close();
}
public void processQueue(){
/* run in threads */
Runnable r = new Runnable() {
@Override
public void run() {
/* queue may be empty but process is not finished, that's why we need to check if any links are being processed */
while (true) {
LinkNode link = deque();
if (link == null)
continue;
link.setStartTime(new DateTime());
boolean process = processLink(link);
link.setEndTime(new DateTime());
if (!process)
continue;
/* print the data to the csv file */
if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
synchronized(out) {
out.println(getOutputLine(link));
out.flush();
}
} else {
synchronized(err) {
err.println(getOutputLine(link));
err.flush();
}
}
}
}
};
/* run n_threads threads which perform dequeue and process */
LinkedList <Thread> threads = new LinkedList <> ();
for (int i = 0; i < n_threads; i++) {
threads.add(new Thread(r));
threads.getLast().start();
}
for (Thread thread : threads) {
try {
thread.join();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
String url = getUrlGeneralForm(inputLink);
boolean process = true;
synchronized (processed) {
if (processed.contains(url)) {
process = false;
synchronized (dub) {dub++;}
} else
processed.add(url);
}
/* start processing only if the url has not been processed yet and is not currently being processed */
if (process) {
System.out.println("Processing url " + url);
List<LinkNodeLight> outputLinks = parseAndWeightResults(inputLink);
for (LinkNodeLight outputLink : outputLinks) {
String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
/* add the new link to the queue only if it has not been processed yet */
process = true;
synchronized (processed) {
if (processed.contains(getUrlGeneralForumOutput)) {
process = false;
synchronized (dub) {dub++;}
}
}
if (process) {
enque(outputLink);
}
}
return true;
}
return false;
}
void enque(LinkNodeLight link){
link.setEnqueTime(new DateTime());
/* the add method requires implicit priority */
synchronized (queue) {
if (link.interLinks)
synchronized (cntInter) {cntInter++;}
else
synchronized (cntIntra) {cntIntra++;}
//queue.add(link, 100 - (int)(link.getWeight() * 100.f));
queue.add(link);
}
}
/**
* Picks an element from the queue
* @return top element from the queue or null if the queue is empty
*/
LinkNode deque(){
/* link must be checked */
LinkNode link = null;
synchronized (queue) {
link = (LinkNode) queue.poll();
if (link != null) {
link.setDequeTime(new DateTime());
if (link.isInterLinks())
synchronized (cntInter) {cntInter--;}
else
synchronized (cntIntra) {cntIntra--;}
}
}
return link;
}
private void createFiles() {
/* create output file */
try {
out = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLS.csv", false)));
out.println(generateHeaderFile());
} catch (IOException e) {
System.err.println(e);
}
/* create error file */
try {
err = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLSERROR.csv", false)));
err.println(generateHeaderFile());
} catch (IOException e) {
System.err.println(e);
}
}
/**
* formats the string so it can be valid entry in csv file
* @param s
* @return
*/
private static String format(String s) {
// replace " by ""
String ret = s.replaceAll("\"", "\"\"");
// put string into quotes
return "\"" + ret + "\"";
}
/**
* Creates the line that needs to be written in the outputfile
* @param link
* @return
*/
public static String getOutputLine(LinkNode link){
StringBuilder builder = new StringBuilder();
builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
builder.append(",");
builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
builder.append(",");
builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
builder.append(",");
builder.append(format(link.getUrl()));
builder.append(",");
builder.append(link.getDomain());
builder.append(",");
builder.append(link.isInterLinks());
builder.append(",");
builder.append(Util.formatDate(link.getEnqueTime()));
builder.append(",");
builder.append(Util.formatDate(link.getDequeTime()));
builder.append(",");
builder.append(link.waitingInQueue());
builder.append(",");
builder.append(queue.size());
/* Inter and intra links in queue */
builder.append(",");
builder.append(cntIntra.toString());
builder.append(",");
builder.append(cntInter.toString());
builder.append(",");
builder.append(dub);
builder.append(",");
builder.append(new Date ());
/* URL size*/
builder.append(",");
builder.append(link.getSize());
/* HTML file
builder.append(",");
builder.append(link.getFileName());*/
/* add HTTP error */
builder.append(",");
if (link.getParseException() != null) {
if (link.getParseException() instanceof HttpStatusException)
builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
if (link.getParseException() instanceof SocketTimeoutException)
builder.append("Time out");
if (link.getParseException() instanceof MalformedURLException)
builder.append("URL is not valid");
if (link.getParseException() instanceof UnsupportedMimeTypeException)
builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
}
return builder.toString();
}
/**
* generates the Header for the file
* @param link
* @return
*/
private String generateHeaderFile(){
StringBuilder builder = new StringBuilder();
builder.append("Seed URL");
builder.append(",");
builder.append("Seed IP");
builder.append(",");
builder.append("Process Duration");
builder.append(",");
builder.append("Link URL");
builder.append(",");
builder.append("Link domain");
builder.append(",");
builder.append("Link IP");
builder.append(",");
builder.append("Enque Time");
builder.append(",");
builder.append("Deque Time");
builder.append(",");
builder.append("Waiting in the Queue");
builder.append(",");
builder.append("QueueSize");
builder.append(",");
builder.append("Intra in queue");
builder.append(",");
builder.append("Inter in queue");
builder.append(",");
builder.append("Dublications skipped");
/* time was printed, but no header was */
builder.append(",");
builder.append("Time");
/* URL size*/
builder.append(",");
builder.append("Size bytes");
/* HTTP errors */
builder.append(",");
builder.append("HTTP error");
return builder.toString();
}
String getUrlGeneralForm(LinkNodeLight link){
String url = link.getUrl();
if (url.endsWith("/")){
url = url.substring(0, url.length() - 1);
}
return url;
}
private List<LinkNodeLight> parseAndWeightResults(LinkNode inputLink) {
List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
if (inputLink.hasParseException()) {
return outputLinks;
} else {
return URLWeight.weight(inputLink, outputLinks);
}
}
}
HTMLParser.java
package pkg.crawler;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;
public class HTMLParser {
private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
private static HashMap <String, Integer> filecounter = new HashMap<> ();
public static List<LinkNodeLight> parse(LinkNode inputLink){
List<LinkNodeLight> outputLinks = new LinkedList<>();
try {
inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
String url = inputLink.getUrl();
if (inputLink.getIpAdress() != null) {
url = url.replace(URLWeight.getHostName(url), inputLink.getIpAdress()); // replace() returns a new string; the result must be assigned
}
Document parsedResults = Jsoup
.connect(url)
.timeout(READ_TIMEOUT_IN_MILLISSECS)
.get();
inputLink.setSize(parsedResults.html().length());
/* IP address moved here in order to speed up the process */
inputLink.setStatus(LinkNodeStatus.OK);
inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));
if (true) {
/* save the file to the html */
String filename = parsedResults.title();//digestBig.toString(16) + ".html";
if (filename.length() > 24) {
filename = filename.substring(0, 24);
}
filename = filename.replaceAll("[^\\w\\d\\s]", "").trim();
filename = filename.replaceAll("\\s+", " ");
if (!filecounter.containsKey(filename)) {
filecounter.put(filename, 1);
} else {
Integer tmp = filecounter.remove(filename);
filecounter.put(filename, tmp + 1);
}
filename = filename + "-" + (filecounter.get(filename)).toString() + ".html";
filename = Paths.get("downloads", filename).toString();
inputLink.setFileName(filename);
/* use md5 of url as file name */
try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
out.println("<!--" + inputLink.getUrl() + "-->");
out.print(parsedResults.html());
out.flush();
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
String tag;
Elements tagElements;
List<LinkNode> result;
tag = "a[href";
tagElements = parsedResults.select(tag);
result = toLinkNodeObject(inputLink, tagElements, tag);
outputLinks.addAll(result);
tag = "area[href";
tagElements = parsedResults.select(tag);
result = toLinkNodeObject(inputLink, tagElements, tag);
outputLinks.addAll(result);
} catch (IOException e) {
inputLink.setParseException(e);
inputLink.setStatus(LinkNodeStatus.ERROR);
}
return outputLinks;
}
static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
List<LinkNode> links = new LinkedList<>();
for (Element element : tagElements) {
if(isFragmentRef(element)){
continue;
}
String absoluteRef = String.format("abs:%s", tag.contains("[") ? tag.substring(tag.indexOf("[") + 1, tag.length()) : "href");
String url = element.attr(absoluteRef);
if(url!=null && url.trim().length()>0) {
LinkNode link = new LinkNode(url);
link.setTag(element.tagName());
link.setParentLink(parentLink);
links.add(link);
}
}
return links;
}
static boolean isFragmentRef(Element element){
String href = element.attr("href");
return href!=null && (href.trim().startsWith("#") || href.startsWith("mailto:"));
}
}
Util.java
package pkg.crawler;
import java.util.Date;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
public class Util {
private static DateTimeFormatter formatter;
static {
formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");
}
public static String linkToString(LinkNode inputLink){
return String.format("%s\t%s\t%s\t%s\t%s\t%s",
inputLink.getUrl(),
inputLink.getWeight(),
formatDate(inputLink.getEnqueTime()),
formatDate(inputLink.getDequeTime()),
differenceInMilliSeconds(inputLink.getEnqueTime(), inputLink.getDequeTime()),
inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl()
);
}
public static String linkToErrorString(LinkNode inputLink){
return String.format("%s\t%s\t%s\t%s\t%s\t%s",
inputLink.getUrl(),
inputLink.getWeight(),
formatDate(inputLink.getEnqueTime()),
formatDate(inputLink.getDequeTime()),
inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl(),
inputLink.getParseException().getMessage()
);
}
public static String formatDate(DateTime date){
return formatter.print(date);
}
public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
return (dequeTime.getMillis()- enqueTime.getMillis());
}
public static int differenceInSeconds(Date enqueTime, Date dequeTime){
return (int)((dequeTime.getTime()/1000) - (enqueTime.getTime()/1000));
}
public static int differenceInMinutes(Date enqueTime, Date dequeTime){
return (int)((dequeTime.getTime()/60000) - (enqueTime.getTime()/60000));
}
}
URLWeight.java
package pkg.crawler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
public class URLWeight {
public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {
List<LinkNodeLight> interLinks = new LinkedList<>();
List<LinkNodeLight> intraLinks = new LinkedList<>();
for (LinkNodeLight link : links) {
if (isIntraLink(sourceLink, link)) {
intraLinks.add(link);
link.setInterLinks(false);
} else {
interLinks.add(link);
link.setInterLinks(true);
}
}
/* the original snippet was missing the method's return and closing brace; returning intra-links first, then inter-links, is an assumption */
List<LinkNodeLight> result = new LinkedList<>(intraLinks);
result.addAll(interLinks);
return result;
}
static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){
String parentDomainName = getHostName(sourceLink.getUrl());
String childDomainName = getHostName(link.getUrl());
return parentDomainName.equalsIgnoreCase(childDomainName);
}
public static String getHostName(String url) {
if(url == null){
return "";
}
String domainName = new String(url);
int index = domainName.indexOf("://");
if (index != -1) {
domainName = domainName.substring(index + 3);
}
for (int i = 0; i < domainName.length(); i++)
if (domainName.charAt(i) == '?' || domainName.charAt(i) == '/') {
domainName = domainName.substring(0, i);
break;
}
/*if (index != -1) {
domainName = domainName.substring(0, index);
}*/
/* have to keep www in order to do replacements with IP */
//domainName = domainName.replaceFirst("^www.*?\\.", "");
return domainName;
}
public static String getDomainName(String url) {
String [] tmp= getHostName(url).split("\\.");
if (tmp.length == 0)
return "";
return tmp[tmp.length - 1];
}
}
PingTaskManager.java
package pkg.crawler;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class PingTaskManager {
private static ExecutorService executor = Executors.newFixedThreadPool(100);
public static void ping (LinkNode e) {
executor.submit(new PingTask(e));
}
}
class PingTask implements Runnable {
private LinkNode link;
public PingTask( LinkNode link ) {
this.link = link;
}
@Override
public void run() {
/* link.ping(); */
}
}
LinkNodeStatus.java
package pkg.crawler;
public enum LinkNodeStatus {
OK,
ERROR
}
LinkNodeLight.java
package pkg.crawler;
import org.joda.time.DateTime;
public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
protected boolean interLinks;
public String getUrl() {
return url;
}
public float getWeight() {
return weight;
}
public void setWeight(float weight) {
this.weight = weight;
}
public DateTime getEnqueTime() {
return enqueTime;
}
public LinkNodeLight(String url) {
this.url = url;
}
public void setEnqueTime(DateTime enqueTime) {
this.enqueTime = enqueTime;
}
@Override
public int compareTo(LinkNodeLight link) {
if (this.weight < link.weight) return 1;
else if (this.weight > link.weight) return -1;
return 0;
}
}
LinkNode.java
package pkg.crawler;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;
import org.joda.time.DateTime;
public class LinkNode extends LinkNodeLight{
public LinkNode(String url) {
super(url);
}
private String tag;
private LinkNode parentLink;
private IOException parseException = null; // initialize parse Exception with null
private float weight;
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;
public DateTime getStartTime() {
return startTime;
}
public void setStartTime(DateTime startTime) {
this.startTime = startTime;
}
public DateTime getEndTime() {
return endTime;
}
public void setEndTime(DateTime endTime) {
this.endTime = endTime;
}
public DateTime getDequeTime() {
return dequeTime;
}
public String getTag() {
return tag;
}
public LinkNode getParentLink() {
return parentLink;
}
public Exception getParseException() {
return parseException;
}
public boolean hasParseException(){
return parseException!=null;
}
public void setDequeTime(DateTime dequeTime) {
this.dequeTime = dequeTime;
}
public void setTag(String tag) {
this.tag = tag;
}
public void setParentLink(LinkNode parentLink) {
this.parentLink = parentLink;
}
public void setParseException(IOException parseException) {
this.parseException = parseException;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
LinkNode link = (LinkNode) o;
if (url != null ? !url.equals(link.url) : link.url != null) {
return false;
}
return true;
}
@Override
public int hashCode() {
return url != null ? url.hashCode() : 0;
}
public long waitingInQueue(){
return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}
public long linkProcessingDuration(){
return Util.differenceInMilliSeconds( endTime,startTime );
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("LinkNode{");
sb.append("url='").append(url).append('\'');
sb.append(", score=").append(weight);
sb.append(", enqueTime=").append(enqueTime);
sb.append(", dequeTime=").append(dequeTime);
sb.append(", tag=").append(tag);
if(parentLink!=null) {
sb.append(", parentLink=").append(parentLink.getUrl());
}
sb.append('}');
return sb.toString();
}
public void setStatus(LinkNodeStatus status) {
this.status = status;
}
public LinkNodeStatus getStatus(){
if (status == null) {
status = LinkNodeStatus.ERROR;
}
return status;
}
// check server link is it exist or not
/* this method gives fake errors
public LinkNodeStatus ping () {
boolean reachable = false;
String sanitizeUrl = url.replaceFirst("^https", "http");
try {
HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
connection.setConnectTimeout(1000);
connection.setRequestMethod("HEAD");
int responseCode = connection.getResponseCode();
System.err.println(url + " " + responseCode);
reachable = (200 <= responseCode && responseCode <= 399);
} catch (IOException exception) {
}
return reachable?LinkNodeStatus.OK: LinkNodeStatus.ERROR;
}*/
public String getIpAdress() {
return ipAdress;
}
public void setIpAdress(String ipAdress) {
this.ipAdress = ipAdress;
}
/* methods for controlling url size */
public void setSize(int size) {
this.size = size;
}
public int getSize() {
return this.size;
}
public void setFileName(String filename) {
this.filename = filename;
}
public String getFileName() {
return this.filename;
}
public String getDomain() {
return domain;
}
public void setDomain(String domain) {
this.domain = domain;
}
}
I tried to allocate memory by changing eclipse.ini setting to 2048 MB of ram as it was answered in this topic but still get the same errors after 3 hours or less.
I hate to repeat myself(*), but in eclipse.ini you set up the memory for Eclipse, which has nothing to do with the memory for your crawler.
When using command line, you need to start it via java -Xmx2G pkg.crawler.WebCrawler.
When starting from Eclipse, you need to add -Xmx2G to the run configuration ("VM arguments" rather than "Program arguments").
(*) Link to a deleted question; requires some reputation to view.
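Separately, note that a bigger heap only postpones the problem on a multi-day crawl: the processed set, the queue, and the filecounter map all grow without bound. A sketch of one mitigation, capping the frontier in enque (the 100,000 figure is arbitrary, and the counter updates from the original method are omitted for brevity):
private static final int MAX_QUEUE_SIZE = 100_000; // arbitrary cap

void enque(LinkNodeLight link) {
    link.setEnqueTime(new DateTime());
    synchronized (queue) {
        if (queue.size() >= MAX_QUEUE_SIZE) {
            return; // frontier full: drop this link instead of queueing it
        }
        queue.add(link);
    }
}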
Basically I've got a little threading class used by an ExecutorService with a fixed thread pool. Each thread instantiates my threading class and the call method is fired - works great!
However, I really need to call another class (via instantiation or static means) to process and return some data within the call method, and when I try this I understandably get a concurrent.ExecutionException.
I think it will be easier to paste all my code here; note it's very rough.
MainController
package com.multithreading.excutorservice;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;
public class MainController {
private static List<String> urls;
public static void main(String[] args) {
populateList();
// futures to retrieve task results
List<Future<ArrayList>> futures = new ArrayList<Future<ArrayList>>();
// results
List<ArrayList> results = new ArrayList<ArrayList>();
// pool with 5 threads
ExecutorService exec = Executors.newFixedThreadPool(5);
// enqueue tasks
for(String url: urls) {
futures.add(exec.submit(new ThreadTask(url)));
}
// attempt to move ArrayLists within Future<ArrayList> into a normal ArrayList
for(Future<ArrayList> future: futures) {
try {
results.add(future.get());
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ExecutionException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
// for(ArrayList<String> s: results) {
// System.out.println(s);
// }
}
private static void populateList() {
urls = new ArrayList<String>();
urls.add("http://www.google.com");
urls.add("http://www.msn.co.uk");
urls.add("http://www.yahoo.co.uk");
urls.add("http://www.google.com");
urls.add("http://www.msn.co.uk");
urls.add("http://www.yahoo.co.uk");
}
}
ThreadTask
package com.multithreading.excutorservice;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
public class ThreadTask implements Callable<ArrayList> {
private String url;
HtmlParser parseHtml;
public ThreadTask(String url) {
this.url = url;
}
public ArrayList call() {
int counter = 0;
String html = null;
try {
URL myUrl = new URL(url);
BufferedReader reader = new BufferedReader(new InputStreamReader(myUrl.openStream()));
while ((html = reader.readLine()) != null) {
//counter += inputLine.length();
html += html;
}
}
catch (Exception ex) {
System.out.println(ex.toString());
}
ArrayList<String> storeLinks = new ArrayList<String>();
HtmlParser par = new HtmlParser();
storeLinks = par.returnNewUrls(html);
// for(String s: parseHtml) {
// System.out.println(s);
// }
//returns an ArrayList of URLS which is stored in a List<Future<ArrayList>> temporarily
return storeLinks;
}
}
HtmlParser
package com.multithreading.excutorservice;
import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HtmlParser {
private final String regex_links = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
private ArrayList<String> extractedUrls;
public ArrayList<String> returnNewUrls (String data) {
extractedUrls = new ArrayList<String>();
Pattern p = Pattern.compile(regex_links);
Matcher m = p.matcher(data);
System.out.println("Test");
while (m.find()) {
System.out.println("Test");
extractedUrls.add(m.group(1));
}
return getLinks();
}
//returns the links
public ArrayList getLinks() {
return extractedUrls;
}
}
You're doing some pretty weird stuff here. Multiple threads are accessing the same extractedUrls field, and each call to returnNewUrls reassigns it, so the threads overwrite each other's results. In your returnNewUrls method, create a new ArrayList which is local to the method scope. Something along the lines of:
public static ArrayList<String> returnNewUrls(String data) {
ArrayList<String> urls = new ArrayList<String>(); // local to this call, so thread-safe
Matcher m = Pattern.compile(regex_links).matcher(data); // assumes regex_links is made a static constant
while (m.find()) {
urls.add(m.group(1));
}
return urls;
}
Another thing - not a bug, but you're doing unnecessary stuff - in the call method you don't need to create a new list if you're just assigning to a variable:
ArrayList<String> parseHtml = new ArrayList<String>();
parseHtml = HtmlParser.returnNewUrls(html);
This is better:
ArrayList<String> parseHtml = HtmlParser.returnNewUrls(html);
You have several concurrent tasks, but they use the same variable HtmlParser.extractedUrls without any synchronization. Move this variable inside the returnNewUrls method.
BTW even without concurrency, using static variables is not encouraged, especially in a case like that, where it can easily be replaced with a local variable.
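Incidentally, there is a second problem hiding in ThreadTask.call: the read loop assigns each line to html and then appends html to itself, so the page is never accumulated and html is null once the loop ends, which is exactly the kind of failure that surfaces as an ExecutionException from Future.get. A sketch of the loop rewritten with a StringBuilder (same reader as in the original method):
// Sketch: accumulate the page instead of clobbering the loop variable.
StringBuilder page = new StringBuilder();
String line;
while ((line = reader.readLine()) != null) {
    page.append(line).append('\n');
}
String html = page.toString(); // pass this to the parser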