I am scraping data from a web table and I have some code to read the text from each of the web elements(rows) which are in a list and then add this text to another list(columns) to be sent to a method to write to excel. The process of reading the web elements (approx 200 rows) and writing the data to the new list is very slow. Is there a quicker way? Or is this to be expected?
Here is my code:
package mypackage;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import com.seleniumpractice.utilities.XLUtils;
import io.github.bonigarcia.wdm.WebDriverManager;
public class CovidWebTable {
static WebDriver driver;
static XLUtils xl;
static List<WebElement> header;
static List<WebElement> rows;
static List<ArrayList<String>>rowsXL;
public static void main(String[] args) throws IOException {
WebDriverManager.chromedriver().setup();
driver = new ChromeDriver();
driver.get("https://www.worldometers.info/coronavirus");
driver.manage().window().maximize();
driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(10));
WebElement table = driver.findElement(By.xpath("//table[#id='main_table_countries_today']"));
rows = table.findElements(By.xpath(".//tr[#role='row']"));
System.out.println("Total rows: "+rows.size());
xl = new XLUtils(".\\datafiles\\covid.xls");
//xl.setCellData(null, rows, rows, null);
rowsXL = new ArrayList<ArrayList<String>>();
//Add header
header = table.findElements(By.xpath(".//thead//th"));
System.out.println("Header cols: "+ header.size());
ArrayList<String> headerXL = new ArrayList<String>();
for(int col=1; col<header.size()-1; col++) {
//xl.setCellData("Covid Data", 0, col-1, header.get(col).getText());
headerXL.add(header.get(col).getText());
}
rowsXL.add(headerXL);
int xlRow = 1;
int skippedRows = 0;
for(int r=1; r<rows.size(); r++) {
String a = rows.get(r).getText();
//skip empty rows
if(rows.get(r).getText().equals("")) {
skippedRows++;
continue;
}
System.out.println("Reading row "+r);
ArrayList<String> cols = new ArrayList<String>();
for(int c=1; c<header.size(); c++) {
String data = rows.get(r).findElement(By.xpath(".//td["+(c+1)+"]")).getText();
//xl.setCellData("Covid Data", xlRow, c-1, rows.get(r).findElement(By.xpath(".//td["+(c+1)+"]")).getText());
cols.add(data);
}
rowsXL.add(cols);
xlRow++;
}
xl.setCellDataFromList(rowsXL, "Orders");
System.out.println("Scraped Rows: "+ rowsXL.size());
System.out.println("Skipped Rows: "+skippedRows);
System.out.println("Complete.");
driver.close();
}
}
you can get the text and parse them all in pattern
Related
I am scraping data from a table using selenium with Java but is slow and I am not sure why. Is there a reason why and how can I speed it up? The other thing I noticed is that it seems to slow down more as it progresses. I noticed this by observing the print statements to the console.
Here is my code:
package mypackage;
import java.io.IOException;
import java.time.Duration;
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import com.seleniumpractice.utilities.XLUtils;
import io.github.bonigarcia.wdm.WebDriverManager;
public class CovidWebTable {
static WebDriver driver;
static XLUtils xl;
static List<WebElement> header;
static List<WebElement> rows;
public static void main(String[] args) throws IOException {
WebDriverManager.chromedriver().setup();
driver = new ChromeDriver();
driver.get("https://www.worldometers.info/coronavirus");
driver.manage().window().maximize();
driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(10));
WebElement table = driver.findElement(By.xpath("//table[#id='main_table_countries_today']"));
rows = table.findElements(By.xpath(".//tr[#role='row']"));
System.out.println("Total rows: "+rows.size());
xl = new XLUtils(".\\datafiles\\covid.xls");
//xl.setCellData(null, rows, rows, null);
//Add header
header = table.findElements(By.xpath(".//thead//th"));
System.out.println("Header cols: "+ header.size());
for(int col=1; col<header.size()-1; col++) {
xl.setCellData("Covid Data", 0, col-1, header.get(col).getText());
}
int xlRow = 1;
for(int r=1; r<rows.size(); r++) {
String a = rows.get(r).getText();
if(rows.get(r).getText().equals("")) {
System.out.println("Skipped row: "+r);
continue;
}
System.out.println("Writing row "+r);
for(int c=1; c<header.size(); c++) {
//String data = rows.get(r).findElement(By.xpath(".//td["+(c+1)+"]")).getText();
xl.setCellData("Covid Data", xlRow, c-1, rows.get(r).findElement(By.xpath(".//td["+(c+1)+"]")).getText());
}
xlRow++;
}
System.out.println("Complete.");
driver.close();
}
}
The code that contains the code for writing to excel:
package com.internetBanking.utilities;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
public class XLUtils {
public static FileInputStream fi;
public static FileOutputStream fo;
public static HSSFWorkbook wb;
public static HSSFSheet ws;
public static HSSFRow row;
public static HSSFCell cell;
public static int getRowCount(String xlfile, String xlsheet) throws IOException {
fi = new FileInputStream(xlfile);
wb = new HSSFWorkbook(fi);
ws = wb.getSheet(xlsheet);
int rowcount = ws.getLastRowNum();
wb.close();
fi.close();
return rowcount;
}
public static int getCellCount(String xlFile, String xlSheet, int rowNum) throws IOException {
fi = new FileInputStream(xlFile);
wb = new HSSFWorkbook(fi);
ws = wb.getSheet(xlSheet);
row = ws.getRow(rowNum);
int cellCount = row.getLastCellNum();
wb.close();
fi.close();
return cellCount;
}
public static String getCellData(String xlFile, String xlSheet, int rowNum, int colNum) throws IOException {
fi = new FileInputStream(xlFile);
wb = new HSSFWorkbook(fi);
ws = wb.getSheet(xlSheet);
row = ws.getRow(rowNum);
cell = row.getCell(colNum);
String data;
try {
String cellData = new DataFormatter().formatCellValue(cell);
return cellData;
}
catch(Exception e) {
data = "";
}
wb.close();
fi.close();
return data;
}
public static void setCellData(String xlFile, String xlSheet, int rowNum, int colNum, String data) throws IOException{
fi = new FileInputStream(xlFile);
wb = new HSSFWorkbook(fi);
ws = wb.getSheet(xlSheet);
row = ws.getRow(rowNum);
Cell cell = row.createCell(colNum);
cell.setCellValue(data);
//cell = row.getCell(colNum);
//cell.setCellValue(data);
fo = new FileOutputStream(xlFile);
wb.write(fo);
wb.close();
fi.close();
fo.close();
}
}
Ok getting to understand the code will give you a hint of what you should do, the code is iterating 231 read data from the table and write it to an excel file (daaa!) Ok but when writing into the excel file you write row by row (so!) You then iterate cell by cell on each row on each cell you call setCellData(...) in that XLUtils and here is when it starts to slow down!
In that setCellData each time you call it, it read a file from the disk, opens it, appends data then close the file and since you call it by cell, you end up calling it ~231 (rows) x 15 (col) = 3465
Imagine the time consumed (opening file/writing data/ closing file) 3465 times
So what to do,
You need to create a list of list
List<ArrayList> rows = new ArrayList()
This is a list of list ^ where each record in the list is another list
and on each row you read from the table (even headers), you create a list of cells and then you add this list to the rows list!
and eventually, you add some utility function in that XLUtils that accept List<ArrayList> that will
Open the file once, Iterate on that rows list write their values, and close the file.
Imagine it as if you are trying to move from house to house, and you were using your personal small pickup truck car to move stuff. It won't take all the house stuff so you would go back and forth for a while. On the other hand, you could have used a furniture truck that picks up all furniture and loads it at once.
I need to read a spreadsheet value and select the same value on the page .
With the code below , I can read the first and second column of the spreadsheet and fill in the fields on the page , but I can not read the third column and select the value of the Select page .
How do this using Java and Selenium?
package testeplanilha;
import java.io.File;
import java.io.IOException;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.support.ui.Select;
public class testePlanilha {
public static void main(String[] args) throws BiffException, IOException {
String nome = "";
String sobrenome= "";
String tipo = "";
WebDriver driver = new FirefoxDriver();
driver.get("file:///c:/index.html");
driver.manage().window().maximize();
Workbook workbook = Workbook.getWorkbook(new File("C:/plan.xls"));
Sheet sheet = workbook.getSheet(0);
int rowCount = sheet.getRows();
for(int i = 0; i < rowCount; i++){
String nome2 = sheet.getCell(0, i).getContents();
String Sobrenome2 = sheet.getCell(1, i).getContents();
String tipo2 = sheet.getCell(2, i).getContents();
driver.findElement(By.id("nome")).sendKeys(nome2);
driver.findElement(By.id("sobrenome")).sendKeys(Sobrenome2);
}
workbook.close();
Select verificaOpt = new Select(driver.findElement(By.id("select")));
System.out.println("Size: " + verificaOpt.getOptions().size());
}
}
Sheet:
target site:
Presumably your problem is that you need to select by text (not by index), and also deal with the fact that your input document has "volvo" whereas the <select> has "Volvo":
Select verificaOpt = new Select(driver.findElement(By.id("select"))); // as before
String titleCaseType = tipo2.substring(0,1).toUpperCase() + tipo2.substring(1);
verificaOpt.selectByVisibleText(titleCaseType);
I have a code to read an excel file and place it in a list Hash tables. The code that I have is:
package tests;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
public class ReadExcelTest {
public static Hashtable htable=new Hashtable();
public static List<Hashtable<String,String>> exceldata = new ArrayList<Hashtable<String,String>>();
public static void main() throws IOException{
readexcel(System.getProperty("user.dir")+"//src//config//TestData.xls");
}
#Test
public static void readexcel() throws IOException
{
String inputFile=System.getProperty("user.dir")+"//src//config//TestData.xls";
File inputWorkbook = new File(inputFile);
Workbook w;
Cell firstrowelement = null;
try
{
w = Workbook.getWorkbook(inputWorkbook);
Sheet sheet = w.getSheet(0);
for (int j = 0; j <sheet.getRows(); j++)
{
for (int i = 0; i < sheet.getColumns(); i++)
{
firstrowelement = sheet.getCell(i, 0);
Cell cell = sheet.getCell(i, j);
htable.put(firstrowelement.getContents(),cell.getContents());
System.out.print(firstrowelement.getContents()+"->"+cell.getContents());
}
System.out.println(firstrowelement.getContents());
exceldata.add(j,htable);
}
//printing the list
for(Hashtable hash :exceldata)
{
Enumeration e = hash.keys();
while (e.hasMoreElements()) {
String key = (String) e.nextElement();
System.out.println(key + " : " + hash.get(key));
}
}
}
catch (BiffException e)
{
e.printStackTrace();
}
}
}
The output I am getting is all the hash tables in the list are displaying the same data instead of displaying each rows data.
Not exactly sure how to over come this issue. Any help is much appreciated.
You need to make a new htable for every row.
As it is, you keep re-using and updating a single HashTable.
(btw. prefer HashMap to HashTable).
the indexOf always returns negative 7 no matter what i put, i will be using the website http://www.columbusstate.edu
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
public class WebCrawler
{
private static int linkCount = 0;
public static void main(String[] args) throws IOException
{
instance variables
ArrayList<String> links = new ArrayList<String>();
System.out.println("Enter the website you would like to web crawl");
Scanner input = new Scanner(System.in);
String address=input.next();
goes to website
URL locator = new URL(address);
Scanner in=new Scanner(locator.openStream());
String str="";
PrintWriter out=new PrintWriter("links.txt");
searches the webpage and pulls the links, or it should anyways.
while(in.hasNextLine())
{
str=in.next();
if(str.contains("href=\"http://"))
{
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("/\"");
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
else if(str.contains("href=\"https://")){
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("://")+15;
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
}
int num = links.size();
System.out.println(num);
out.println("Number of links on this webpage is "+linkCount);
out.println("Links are:");
for(int i = links.size()-1; i>0; i--){
out.println(links.get(i));
}
out.close();
}
}
If you really looking for a way to extract links from a web page then it's better to use a proper HTML parser than trying to do it manually. Here an example with JSOUP
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HTMLUtils {
private HTMLUtils() {}
public static List<String>extractLinks(String url) throws IOException {
final ArrayList<String> result = new ArrayList<String>();
Document doc = Jsoup.connect(url).get();
Elements links = doc.select("a[href]");
// href ...
for (Element link : links) {
result.add(link.attr("abs:href"));
// result.add(link.text());
}
return result;
}
public final static void main(String[] args) throws Exception{
String site = "http://www.columbusstate.edu";
List<String> links = HTMLUtils.extractLinks(site);
for (String link : links) {
System.out.println(link);
}
}
}
Below is my code.Whne i input url from excel most of the time it shows org.openqa.selenium.ElementNotVisibleException: Element is not currently visible error.
For site like www.travelocity.com it shows after clicking 7 8 links but for www.google.com it shows error from starting.
package test;
import java.awt.HeadlessException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
public class Linktestparam {
public static void main( String[] args ) throws Exception{
Properties prop = new Properties();
FileInputStream f = new FileInputStream("C:\\Documents and Settings\\bibekananda.sarangi\\workspace\\test\\src\\excelPath");
prop.load(f);
String[][] steps ;
steps = excelRead(prop.getProperty("Linkpath"));
int totallink;
for(int j = 1; j <= steps.length ; j++){
//System.out.println("no of links in " + steps[j][0] + "is" + totallink);
totallink = linktest(steps[j][0]);
}
}
public static int linktest(String url) throws Exception{
WebDriver driver = new FirefoxDriver();
driver.navigate().to(url);
Thread.sleep(12000);
//WebDriverWait wait = new WebDriverWait(driver,60);
//wait.until(ExpectedConditions.visibilityOfElementLocated(By.id("gsr")));
List<WebElement> alllinkspresent=driver.findElements(By.tagName("a"));
int totallink = driver.findElements(By.tagName("a")).size();
System.out.println("no of links in " + totallink);
for (int i = 0; i < totallink; i++)
{
int LastRow = i;
driver.findElements(By.tagName("a")).get(i).getText();
driver.findElements(By.tagName("a")).get(i).click();
System.out.println("LastRow value is" + LastRow);
Thread.sleep(18000);
String pagetitle = driver.getTitle();
System.out.println(pagetitle);
String urltext=driver.getCurrentUrl();
System.out.println(urltext);
if(pagetitle.contains("404")) {
System.out.println("404 Found");
System.out.println("FAIL");
String status="FAIL";
excelwrite(status,urltext,LastRow);
}
else{
System.out.println("PASS");
String status = "PASS";
excelwrite(status,urltext,LastRow);
driver.navigate().back();
Thread.sleep(4000);
}
}
return totallink;
//driver.close();
}
public static String[][] excelRead(String fileName) throws Exception {
File excel = new File(fileName);
FileInputStream fis = new FileInputStream(excel);
HSSFWorkbook wb = new HSSFWorkbook(fis);
HSSFSheet ws = wb.getSheet("Sheet1");
int rowNum = ws.getLastRowNum() + 1;
int colNum = ws.getRow(0).getLastCellNum();
String[][] data = new String[rowNum][colNum];
for (int i = 0 ; i < rowNum ; i++) {
HSSFRow row = ws.getRow(i);
for (int j=0 ; j < colNum ; j++){
HSSFCell cell = row.getCell(j);
String value = cellToString(cell);
data[i][j] = value;
System.out.println("The value is" + value);
}
}
return data;
}
public static String[][] excelwrite(String status,String urltext,int LastRow) throws Exception {
try{
FileInputStream file = new FileInputStream(new File("D:\\Work\\link.xls"));
HSSFWorkbook workbook = new HSSFWorkbook(file);
HSSFSheet sheet = workbook.getSheetAt(1);
Row row = sheet.createRow(LastRow);
System.out.println("LastRow value in excelwrite is " + LastRow);
Cell cell2 = row.createCell(0);
Cell cell3 = row.createCell(1);
cell3.setCellValue(status);
cell2.setCellValue(urltext);
System.out.println(status);
file.close();
FileOutputStream outFile =new FileOutputStream(new File("D:\\Work\\link.xls"));
workbook.write(outFile);
}
catch (FileNotFoundException e) {
e.printStackTrace();
}
catch (IOException e) {
e.printStackTrace();
}
catch (HeadlessException e)
{
e.printStackTrace();
}
return null;
}
public static String cellToString(HSSFCell cell) {
int type;
Object result ;
type = cell.getCellType();
switch (type) {
case 0 :
result = cell.getNumericCellValue();
break;
case 1 :
result = cell.getStringCellValue();
break;
default :
throw new RuntimeException("There are no support for this type of cell");
}
return result.toString();
}
}
============================
OUTPUT:::::::::::
The value isurl
The value ishttps://www.google.co.in/
The value ishttp://www.espire.com/contact-us
The value ishttp://www.travelocity.com/
no of links in 44
Exception in thread "main" org.openqa.selenium.ElementNotVisibleException: Element is not currently visible and so may not be interacted with
Command duration or timeout: 0 milliseconds
You need to use WebdriverWait for visibility of elements Code:
new WebDriverWait(driver, 30).until(ExpectedConditions.elementToBeClickable(By.xpath("Xpath"));