I have this code using Jsoup:
import java.io.IOException;
import java.util.logging.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import java.util.*;

public class JavaApplication17 {
    public static void main(String[] args) {
        try {
            String url = /* url */;
            Document doc = Jsoup.connect(url).get();
            Elements paragraphs = doc.select("td");
            List<String> text = new ArrayList<>();
            for (Element p : paragraphs) {
                text.add(p.text());
            }
            String[] name = new String[1000];
            for (int i = 0; i < 100; i++) {
                name[i] = text.get(i);
                System.out.println(name[i]);
            }
        } catch (IOException ex) {
            Logger.getLogger(JavaApplication17.class.getName())
                  .log(Level.SEVERE, null, ex);
        }
    }
}
What is the equivalent in NSoup, i.e. the similar .NET code?
If NSoup is not the best option for this kind of use in .NET, then what should be used instead?
I have been trying to integrate BrowserStack with my Selenium scripts. As part of this I have added desired capabilities in my getBrowser method, with a data provider in the Base class, because I want to run my scripts in multiple browsers. The browsers list is in the getData method in the Base class.
How can I call the getBrowser method in my test cases and have getData (the browsers list) defined only in the Base class? This is what I have done and it is not right. I have added my Base class and the test script.
Base.getBrowser(Platform platform, String browserName, String browserVersion); is the line where I am stuck.
Any help would be appreciated. Thanks.
package com.gale.precision.FundVisualizer.core;
import java.net.MalformedURLException;
import java.net.URL;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.testng.annotations.DataProvider;
import org.openqa.selenium.Platform;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
public class Base {
//public static WebDriver driver = null;
public static WebDriver driver;
public static String DriverPath = System.getProperty("user.dir") + "//" + "Drivers";
public static String DirectoryPath = System.getProperty("user.dir");
public static Properties prop = new Properties();
public static InputStream input = null;
public static final String USERNAME = "antonyprabhu1";
public static final String AUTOMATE_KEY = "xHRMpqxgD8sn3e3sr75s";
public static final String URL = "https://" + USERNAME + ":" + AUTOMATE_KEY + "@hub-cloud.browserstack.com/wd/hub";
@DataProvider(name = "EnvironmentDetails")
public static void getBrowser(Platform platform, String browserName, String browserVersion) throws MalformedURLException
{
DesiredCapabilities capability = new DesiredCapabilities();
capability.setPlatform(platform);
capability.setBrowserName(browserName);
capability.setVersion(browserVersion);
capability.setCapability("browserstack.debug", "true");
driver = new RemoteWebDriver(new URL(URL), capability);
try {
input = new FileInputStream(DirectoryPath + "//" + "config" + "//" + "app.properties");
prop.load(input);
} catch (IOException e) {
e.printStackTrace();
}
}
@DataProvider(name = "EnvironmentDetails", parallel = true)
public Object[][] getData() {
Object[][] testData = new Object[][] {
{
Platform.MAC, "chrome", "62.0"
}, {
Platform.WIN8,
"chrome",
"62.0"
}, {
Platform.WINDOWS,
"firefox",
"57"
}
};
return testData;
}
public static void closeBrowser() {
driver.quit();
}
}
package comparison;
import java.net.MalformedURLException;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.Platform;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebElement;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.AfterTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import com.gale.precision.FundVisualizer.core.Base;
import com.gale.precision.FundVisualizer.core.ExtentReport;
import com.gale.precision.FundVisualizer.pageObject.Comparison;
import com.gale.precision.FundVisualizer.pageObject.InvestmentsSearch;
@SuppressWarnings("unused")
public class AddedInvestmentDisplay extends ExtentReport {
/*============================================================================================================================
Test case : Verify that already selected Investments for comparison does not show up
======================================================================================*/
#Test(testName = "Comparison: Verify an already selected Investment for comparison does not show up in search")
public void verifyAddedInvestmentDisplay() throws InterruptedException, MalformedURLException {
//test = extent.createTest(Thread.currentThread().getStackTrace()[1].getMethodName());
Base.getBrowser(Platform platform, String browserName, String browserVersion);
InvestmentsSearch.login(Base.driver);
InvestmentsSearch.InvestmentsLink(Base.driver).click();
JavascriptExecutor jse = (JavascriptExecutor) Base.driver;
jse.executeScript("window.scrollBy(0,750)", "");
InvestmentsSearch.ViewResults(Base.driver).click();
for (int i = 0; i <= 2; i++)
{
try {
Comparison.firstCheckBox(Base.driver).click();
break;
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
//Base.clickingStaleElements(Comparison.firstCheckBox(Base.driver));
//Comparison.firstCheckBox(Base.driver).click();
for (int i = 0; i <= 2; i++)
{
try {
Comparison.analyzeOrCompare(Base.driver).click();
break;
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
Comparison.addInvestmentField(Base.driver).sendKeys("MFC027");
Comparison.firstSearchResult(Base.driver).click();
Comparison.addInvestmentField(Base.driver).sendKeys("MFC027");
Assert.assertEquals(Comparison.emptySearchResult(Base.driver).isDisplayed(), true);
}
@DataProvider(name = "EnvironmentDetails", parallel = true)
public Object[][] getData() {
Object[][] testData = new Object[][] {
{
Platform.MAC, "chrome", "62.0"
}, {
Platform.WIN8,
"chrome",
"62.0"
}, {
Platform.WINDOWS,
"firefox",
"57"
}
};
return testData;
}
@AfterClass
public void tearDown()
{
Base.closeBrowser();
}
}
I could resolve this issue by passing the parameters in the @Test method, like this:
#Test(testName="Comparison: Verify adding an index in the comparison", dataProvider = "EnvironmentDetails",dataProviderClass = BrowsersDataProvider.class)
public void verifyAddingIndex(Platform platform, String browserName, String browserVersion) throws InterruptedException, MalformedURLException {
//test = extent.createTest(Thread.currentThread().getStackTrace()[1].getMethodName());
Base.getBrowser(platform,browserName,browserVersion);
InvestmentsSearch.login(Base.driver);
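For reference, here is a minimal sketch of what the separate data-provider class referenced above (BrowsersDataProvider) could look like; the class name comes from the snippet, while the package and the browser matrix are assumptions copied from the Base class. The provider method is typically declared static so TestNG can call it from a different class via dataProviderClass.

package com.gale.precision.FundVisualizer.core;

import org.openqa.selenium.Platform;
import org.testng.annotations.DataProvider;

// Hypothetical data-provider class; only the provider lives here so every
// test class can reuse the same browser matrix.
public class BrowsersDataProvider {

    // Static so it can be referenced via dataProviderClass from other classes.
    @DataProvider(name = "EnvironmentDetails", parallel = true)
    public static Object[][] getData() {
        return new Object[][] {
            { Platform.MAC, "chrome", "62.0" },
            { Platform.WIN8, "chrome", "62.0" },
            { Platform.WINDOWS, "firefox", "57" }
        };
    }
}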
I want to extract all article content from a website using any web crawling/scraping method.
The problem is that I can get content from a single page but not from the pages it links to.
Could anyone please suggest a proper solution?
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Main3 {
public static void main(String[] argv) throws Exception {
HTMLDocument doc = new HTMLDocument() {
public HTMLEditorKit.ParserCallback getReader(int pos) {
return new HTMLEditorKit.ParserCallback() {
public void handleText(char[] data, int pos) {
System.out.println(data);
}
};
}
};
URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream());
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream("ram.txt"), "UTF-8");
EditorKit kit = new HTMLEditorKit();
kit.read(rd, doc, 0);
try {
Document docs = Jsoup.connect("http://tamilblog.ishafoundation.org/").get();
Elements links = docs.select("a[href]");
Elements elements = docs.select("*");
System.out.println("Total Links :"+links.size());
for (Element element : elements) {
System.out.println(element.ownText());
}
for (Element link : links) {
System.out.println(" * a: link :"+ link.attr("a:href"));
System.out.println(" * a: text :"+ link.text());
System.out.println(" * a: text :"+ link.text());
System.out.println(" * a: Alt :"+ link.attr("alt"));
System.out.println(link.attr("p"));
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
This is the solution:
package com.github.davidepastore.stackoverflow34014436;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* Stackoverflow 34014436 question.
*
*/
public class App {
public static void main(String[] args) throws URISyntaxException,
IOException, BadLocationException {
HTMLDocument doc = new HTMLDocument() {
public HTMLEditorKit.ParserCallback getReader(int pos) {
return new HTMLEditorKit.ParserCallback() {
public void handleText(char[] data, int pos) {
System.out.println(data);
}
};
}
};
URL url = new URI("http://tamilblog.ishafoundation.org/").toURL();
URLConnection conn = url.openConnection();
Reader rd = new InputStreamReader(conn.getInputStream());
OutputStreamWriter writer = new OutputStreamWriter(
new FileOutputStream("ram.txt"), "UTF-8");
EditorKit kit = new HTMLEditorKit();
kit.read(rd, doc, 0);
try {
Document docs = Jsoup.connect(
"http://tamilblog.ishafoundation.org/").get();
Elements links = docs.select("a[href]");
Elements elements = docs.select("*");
System.out.println("Total Links :" + links.size());
for (Element element : elements) {
System.out.println(element.ownText());
}
for (Element link : links) {
String hrefUrl = link.attr("href");
if (!"#".equals(hrefUrl) && !hrefUrl.isEmpty()) {
System.out.println(" * a: link :" + hrefUrl);
System.out.println(" * a: text :" + link.text());
writer.write(link.text() + " => " + hrefUrl + "\n");
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
writer.close();
}
}
}
Here we use the writer to write the text and URL of every link to the ram.txt file.
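Since the question also asks for the article content behind those links (not just the link list), a minimal recursive sketch with plain Jsoup could look like the following. The depth limit of 2 and the reuse of the same seed URL are assumptions; absolute URLs are resolved with abs:href.

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

// Minimal recursive Jsoup crawler sketch: visits each absolute link once,
// up to an arbitrary depth limit, and prints the page text.
public class SimpleCrawler {

    private static final int MAX_DEPTH = 2; // assumption: crawl two levels deep
    private final Set<String> visited = new HashSet<>();

    public void crawl(String url, int depth) {
        if (depth > MAX_DEPTH || !visited.add(url)) {
            return;
        }
        try {
            Document doc = Jsoup.connect(url).get();
            System.out.println("Fetched: " + url);
            System.out.println(doc.body().text());
            for (Element link : doc.select("a[href]")) {
                String next = link.attr("abs:href");
                if (next.startsWith("http")) { // skip mailto:, javascript:, etc.
                    crawl(next, depth + 1);
                }
            }
        } catch (IOException e) {
            System.out.println("Failed to fetch " + url + ": " + e.getMessage());
        }
    }

    public static void main(String[] args) {
        new SimpleCrawler().crawl("http://tamilblog.ishafoundation.org/", 0);
    }
}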
You should use an existing crawler such as Apache Nutch or StormCrawler.
I want to create a thread in order to crawl all links of a website and store them in a LinkedHashSet, but when I print the size of this LinkedHashSet, it prints nothing. I've started learning crawling! I've referenced The Art of Java. Here is my code:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.logging.Level;
import java.util.logging.Logger;
public class TestThread {
public void crawl(URL url) {
try {
BufferedReader reader = new BufferedReader(
new InputStreamReader(url.openConnection().getInputStream()));
String line = reader.readLine();
LinkedHashSet toCrawlList = new LinkedHashSet();
while (line != null) {
toCrawlList.add(line);
System.out.println(toCrawlList.size());
}
} catch (IOException ex) {
Logger.getLogger(TestThread.class.getName()).log(Level.SEVERE, null, ex);
}
}
public static void main(String[] args) {
final TestThread test1 = new TestThread();
Thread thread = new Thread(new Runnable() {
public void run(){
try {
test1.crawl(new URL("http://stackoverflow.com/"));
} catch (MalformedURLException ex) {
Logger.getLogger(TestThread.class.getName()).log(Level.SEVERE, null, ex);
}
}
});
}
}
You should fill your list like this:
while ((line = reader.readLine()) != null) {
toCrawlList.add(line);
}
System.out.println(toCrawlList.size());
If that doesn't work, try setting a breakpoint in your code and find out whether your reader even contains anything.
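Putting that fix together with the rest of the posted code, the crawl method could look like the sketch below. Note also that in the posted main the thread is created but never started, so a thread.start() call is needed before anything can print at all.

public void crawl(URL url) {
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(url.openConnection().getInputStream()))) {
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<>();
        String line;
        // Advance the reader on every iteration so the loop can terminate.
        while ((line = reader.readLine()) != null) {
            toCrawlList.add(line);
        }
        System.out.println(toCrawlList.size());
    } catch (IOException ex) {
        Logger.getLogger(TestThread.class.getName()).log(Level.SEVERE, null, ex);
    }
}

// In main, after creating the thread:
// thread.start();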
I am trying to parse a JSON file with JSONParser and am getting this error, which occurs at the beginning of the following JSON:
Unexpected token COLON(:) at position 11.
{"276716878": {
"followers": [
2435018580,
1664252310,
372262434
],
"following": [
16211434,
945440959,
130682467,
264257750,
900526363,
318231688,
40335029,
64044676
]
}}
I also wrote the json file with the following:
FileWriter out = new FileWriter(JSON_FILE);
out.write(json.toString(1));
out.flush();
out.close();
There could be a format error in the JSON string you passed. Validate your JSON string. The sample code below works fine.
import java.io.FileWriter;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.json.JSONException;
import org.json.JSONObject;
public class Test {
private static final Logger logger = Logger.getLogger(Test.class.getName());
private static final String JSON_FILE = "/home/visruth/Desktop/Visruth.txt";
public static void main(String[] args) {
String jsonText = "{\"215876567\": { \"followers\": [ 2464054938, 772677937]}}";
try (
FileWriter out = new FileWriter(JSON_FILE);
) {
JSONObject json = new JSONObject(jsonText);
int indentFactor = 1;
String prettyprintedJSON = json.toString(indentFactor);
System.out.println(prettyprintedJSON);
out.write(prettyprintedJSON);
} catch (JSONException e) {
logger.log(Level.SEVERE, e.getMessage());
} catch (IOException e) {
logger.severe(e.getMessage());
}
}
}
Assign your JSON text to the jsonText variable and try it.
I think you are mixing classes from different APIs for this purpose. Just use one API.
Here is demo code which works fine with the JSON string given in the question.
import java.io.FileWriter;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.junit.Assert;
import org.junit.Test;
public class TestCase {
private static final String JSON_FILE = "/home/visruth/Desktop/Visruth.txt";
@Test
public void testJSONParser() throws Exception {
JSONParser parser = new JSONParser();
try (
FileWriter out = new FileWriter(JSON_FILE);
) {
String jsonText = "{\"276716878\": { \"followers\": [ 2435018580, 1664252310, 372262434 ], \"following\": [ 16211434, 945440959, 130682467, 264257750, 900526363, 318231688, 40335029, 64044676 ] }}";
Object obj = parser.parse(jsonText);
JSONObject jsonObject = (JSONObject) obj;
JSONObject jsonObject276716878 = (JSONObject) jsonObject.get("276716878");
JSONArray followers = (JSONArray) jsonObject276716878.get("followers");
Assert.assertEquals("[2435018580,1664252310,372262434]", followers.toString());
String jsonStringFromJsonObject = jsonObject.toString(); // json.simple's toString() takes no indent-factor argument
out.write(jsonStringFromJsonObject);
} catch (ParseException e) {
Assert.fail(e.getMessage());
}
}
}
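Since the original question parses a JSON file rather than a string, the same json-simple parser can also read straight from a Reader; a small sketch, assuming the same file path used above:

import java.io.FileReader;

import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class ParseFromFile {
    public static void main(String[] args) throws Exception {
        JSONParser parser = new JSONParser();
        // JSONParser.parse(Reader) returns Object; cast to the expected type.
        try (FileReader reader = new FileReader("/home/visruth/Desktop/Visruth.txt")) {
            JSONObject root = (JSONObject) parser.parse(reader);
            JSONObject entry = (JSONObject) root.get("276716878");
            System.out.println(entry.get("followers"));
        }
    }
}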
The indexOf call always returns negative 7 no matter what I put. I will be using the website http://www.columbusstate.edu.
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;
public class WebCrawler
{
private static int linkCount = 0;
public static void main(String[] args) throws IOException
{
// instance variables
ArrayList<String> links = new ArrayList<String>();
System.out.println("Enter the website you would like to web crawl");
Scanner input = new Scanner(System.in);
String address=input.next();
// goes to website
URL locator = new URL(address);
Scanner in=new Scanner(locator.openStream());
String str="";
PrintWriter out=new PrintWriter("links.txt");
// searches the webpage and pulls the links, or it should anyway
while(in.hasNextLine())
{
str=in.next();
if(str.contains("href=\"http://"))
{
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("/\"");
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
else if(str.contains("href=\"https://")){
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("://")+15;
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
}
int num = links.size();
System.out.println(num);
out.println("Number of links on this webpage is "+linkCount);
out.println("Links are:");
for(int i = links.size()-1; i>0; i--){
out.println(links.get(i));
}
out.close();
}
}
If you are really looking for a way to extract links from a web page, then it's better to use a proper HTML parser than to try to do it manually. Here is an example with Jsoup:
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HTMLUtils {
private HTMLUtils() {}
public static List<String> extractLinks(String url) throws IOException {
final ArrayList<String> result = new ArrayList<String>();
Document doc = Jsoup.connect(url).get();
Elements links = doc.select("a[href]");
// href ...
for (Element link : links) {
result.add(link.attr("abs:href"));
// result.add(link.text());
}
return result;
}
public final static void main(String[] args) throws Exception{
String site = "http://www.columbusstate.edu";
List<String> links = HTMLUtils.extractLinks(site);
for (String link : links) {
System.out.println(link);
}
}
}
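If the goal is still to write the results to links.txt as in the original code, the returned list can simply be handed to a PrintWriter; a small usage sketch reusing the extractLinks method above:

import java.io.PrintWriter;
import java.util.List;

public class WriteLinks {
    public static void main(String[] args) throws Exception {
        List<String> links = HTMLUtils.extractLinks("http://www.columbusstate.edu");
        try (PrintWriter out = new PrintWriter("links.txt")) {
            out.println("Number of links on this webpage is " + links.size());
            out.println("Links are:");
            for (String link : links) {
                out.println(link);
            }
        }
    }
}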