Pulling all urls from any webpage, having trouble with indexOf [homework]

Pulling all urls from any webpage, having trouble with indexOf [homework] - java

the indexOf always returns negative 7 no matter what i put, i will be using the website http://www.columbusstate.edu
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
public class WebCrawler
{
private static int linkCount = 0;
public static void main(String[] args) throws IOException
{
instance variables
ArrayList<String> links = new ArrayList<String>();
System.out.println("Enter the website you would like to web crawl");
Scanner input = new Scanner(System.in);
String address=input.next();
goes to website
URL locator = new URL(address);
Scanner in=new Scanner(locator.openStream());
String str="";
PrintWriter out=new PrintWriter("links.txt");
searches the webpage and pulls the links, or it should anyways.
while(in.hasNextLine())
{
str=in.next();
if(str.contains("href=\"http://"))
{
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("/\"");
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
else if(str.contains("href=\"https://")){
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("://")+15;
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
}
int num = links.size();
System.out.println(num);
out.println("Number of links on this webpage is "+linkCount);
out.println("Links are:");
for(int i = links.size()-1; i>0; i--){
out.println(links.get(i));
}
out.close();
}
}

If you really looking for a way to extract links from a web page then it's better to use a proper HTML parser than trying to do it manually. Here an example with JSOUP
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HTMLUtils {
private HTMLUtils() {}
public static List<String>extractLinks(String url) throws IOException {
final ArrayList<String> result = new ArrayList<String>();
Document doc = Jsoup.connect(url).get();
Elements links = doc.select("a[href]");
// href ...
for (Element link : links) {
result.add(link.attr("abs:href"));
// result.add(link.text());
}
return result;
}
public final static void main(String[] args) throws Exception{
String site = "http://www.columbusstate.edu";
List<String> links = HTMLUtils.extractLinks(site);
for (String link : links) {
System.out.println(link);
}
}
}

Related

Spell Checker with Java, have to print out specific line

All the above and then print out the line number of the misspelled word and the line. Ex 1: I am a fox that lives in a huse
All of this should be printed on the command line

I have used rapid jSpell checker for spell check
note:- generate your own API key
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SOTest
{
public static void main(String[] args) {
try {
//Load text file into List
List<String> list = Files.readAllLines(Paths.get("Words.txt"), StandardCharsets.UTF_8);
//Iterate List to scan the spell error
for (String line : list) {
for(String err:spellCheck(line)){
if(line.contains(err)) {
line = line.replace(err, "^^"+err+"^^");
}
}
System.out.println(line);
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static List<String> spellCheck(String input) throws IOException {
List<String> results = new ArrayList<String>();
byte[] postData = new String("{\t\"language\": \"enUS\",\t\"fieldvalues\": \""+input+"\",\t\"config\": {\t\t\"forceUpperCase\": false,\t\t\"ignoreIrregularCaps\": false,\t\t\"ignoreFirstCaps\": true,\t\t\"ignoreNumbers\": true,\t\t\"ignoreUpper\": false,\t\t\"ignoreDouble\": false,\t\t\"ignoreWordsWithNumbers\": true\t}}").getBytes();
String request = "https://jspell-checker.p.rapidapi.com/check";
URL url = new URL(request);
HttpURLConnection conn= (HttpURLConnection) url.openConnection();
conn.setDoOutput( true );
conn.setRequestMethod( "POST" );
conn.setRequestProperty( "x-rapidapi-host", "jspell-checker.p.rapidapi.com");
conn.setRequestProperty( "x-rapidapi-key", "32efb09328msh3e3b62d34ac8cfcp1467a4jsnb3ef821b4b23");
conn.setRequestProperty( "content-type", "application/json");
conn.setRequestProperty( "accept", "application/json");
conn.setRequestProperty( "useQueryString", "true");
conn.setUseCaches( false );
try( DataOutputStream wr = new DataOutputStream( conn.getOutputStream())) {
wr.write( postData );
}
BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream())));
String strCurrentLine;
while ((strCurrentLine = br.readLine()) != null) {
final String regex = "\"word\":\"[a-zA-Z]*\"";
final Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
final Matcher matcher = pattern.matcher(strCurrentLine);
while (matcher.find()) {
results.add(matcher.group(0).split(":")[1].replace("\"", ""));
}
}
return results;
}
}
input inside Words.txt
this is example
thiss is example
my Dog is hudden under cor
output
this is example
^^thiss^^ is example
my Dog is ^^hudden^^ under ^^cor^^

This types of problem's solution is best to solve with hashTable. However, you need to do it this way, here is the solution without jSpell.
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
public class FinalProject {
private static List<String> word;
public static void main(String[] args) {
if(args.length == 0)
{
System.out.println("File name not specified");
System.exit(1);
}
SpellCheck(args[0]);
}
public static List<String> ScanFile(String file){
Path path = Paths.get(file);
try{
word = Files.readAllLines(path, StandardCharsets.UTF_8);
} catch(IOException e) {
}
return word;
}
public static void SpellCheck(String file)
{
int lineNumber;
List<String> word_list = ScanFile("..\\test\\dict.txt");
List<String> check_file = ScanFile(file);
String[] lineWords;
Boolean correctSpelled;
String currWord, dictWord ;
for(int i = 0; i < check_file.size(); i++)
{
lineWords = check_file.get(i).trim().replaceAll("[^a-zA-Z ]","").split(" ");
lineNumber = i+1;
for (int j = 0; j < lineWords.length; j++)
{
currWord = lineWords[j].toLowerCase();
correctSpelled = false;
for(int k = 0; k < word_list.size() && !correctSpelled; k++)
{
dictWord = word_list.get(k).trim().toLowerCase();
if(currWord.equals(dictWord))
{
correctSpelled = true;
}
}
if(!correctSpelled)
{
System.out.println("Error found on line number " + lineNumber);
System.out.println(check_file.toString() + " -> " + "\"" + currWord + "\"");
}
}// every word for loop
}// every line for loop
}//end spellChecker
}// end class
inputFile.txt contains
I am a fox that lives in a huse
args[0] is your dictionary.txt file
output:
Error found on line number 1
[I am a fox that lives in a huse] -> "huse"
I would try for that underline ^^^ thing. However, I have to go out. The main problem is solved. Try it. If it works, hit me a thumbs up. :)

I'm trying to read a text file and store it in an arraylist of objects

I'm trying to read a text file and store it in an arraylist of objects, but I keep getting an error saying I cannot convert a String to an Item, which is type of arraylist I am using. I have tried various solutions, but am not quite sure how its is suppossed to be done. I am new to coding and have this assignment due soon. Anything helps!
private void loadFile(String FileName)
{
Scanner in;
Item line;
try
{
in = new Scanner(new File(FileName));
while (in.hasNext())
{
line = in.nextLine();
MyStore.add(line);
}
in.close();
}
catch (IOException e)
{
System.out.println("FILE NOT FOUND.");
}
}
my apologies for not adding the Item class
public class Item
{
private int myId;
private int myInv;
//default constructor
public Item()
{
myId = 0;
myInv = 0;
}
//"normal" constructor
public Item(int id, int inv)
{
myId = id;
myInv = inv;
}
//copy constructor
public Item(Item OtherItem)
{
myId = OtherItem.getId();
myInv = OtherItem.getInv();
}
public int getId()
{
return myId;
}
public int getInv()
{
return myInv;
}
public int compareTo(Item Other)
{
int compare = 0;
if (myId > Other.getId())
{
compare = 1;
}
else if (myId < Other.getId())
{
compare = -1;
}
return compare;
}
public boolean equals(Item Other)
{
boolean equal = false;
if (myId == Other.getId())
{
equal = true;;
}
return equal;
}
public String toString()
{
String Result;
Result = String.format("%8d%8d", myId, myInv);
return Result;
}
}
This is the creation of my arraylist.
private ArrayList MyStore = new ArrayList ();
Here is a sample of my text file.
3679 87
196 60
12490 12
18618 14
2370 65

/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.mycompany.rosmery;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
*
* #author Sem-6-INGENIERIAINDU
*/
public class aaa {
public static void main(String arg[]) throws FileNotFoundException, IOException{
BufferedReader files=new BufferedReader(new FileReader(new File("")));
List<String> dto=new ArrayList<>();
String line;
while((line= files.readLine())!= null){
line= files.readLine();
dto.add(line);
//Hacer la logica para esos datos
}
}
}

in.nextLine() returns a String.
So, you cannot assign in.nextLine() to an instance of Item.
Your code may need to correct it as:
List<String> myStore = new ArrayList<String>();
private void loadFile(String FileName)
{
Scanner in;
try
{
in = new Scanner(new File(FileName));
while (in.hasNext())
{
myStore.add(in.nextLine());
}
in.close();
}
catch (IOException e)
{
System.out.println("FILE NOT FOUND.");
}
}
If you want to have a list of Item after reading a file, then you need provide the logic that convert given line of information into an instance of Item.
let's say your file content is in the following format.
id1,inv1
id2,inv2
.
.
Then, you can use the type Item as the following.
List<Item> myStore = new ArrayList<Item>();
private void loadFile(String FileName)
{
Scanner in;
String[] line;
try
{
in = new Scanner(new File(FileName));
while (in.hasNext())
{
line = in.nextLine().split(",");
myStore.add(new Item(line[0], line[1]));
}
in.close();
}
catch (IOException e)
{
System.out.println("FILE NOT FOUND.");
}
}

One of the possible solutions (assuming that the data in file lines is separated by a comma), with using streams:
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Main {
public static void main(String[] args) throws IOException {
List<Item> items = loadFile("myfile.txt");
System.out.println(items);
}
private static List<Item> loadFile(String fileName) throws IOException {
try (Stream<String> stream = Files.lines(Paths.get(fileName))) {
return stream
.map(s -> Stream.of(s.split(",")).mapToInt(Integer::parseInt).toArray())
.map(i -> new Item(i[0], i[1]))
.collect(Collectors.toList());
}
}
}
or with foreach:
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Main {
public static void main(String[] args) throws IOException {
List<Item> items = new ArrayList<>();
for (String line : loadFile("myfile.txt")) {
String[] data = line.split(",");
int id = Integer.parseInt(data[0]);
int inv = Integer.parseInt(data[1]);
items.add(new Item(id, inv));
}
System.out.println(items);
}
private static List<String> loadFile(String fileName) throws IOException {
try (Stream<String> stream = Files.lines(Paths.get(fileName))) {
return stream.collect(Collectors.toList());
}
}
}

Search for a string in html file using Jsoup

Can anyone help me with searching for a particular string in HTML file using Jsoup or any other method. There are inbuilt methods but they help in extracting title or script texts inside a specific tags and not string in general.
In this code I have used one such inbuilt method to extract title from the html page.
But I want to search a string instead.
package dynamic_tester;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class tester {
public static void main(String args[])
{
Document htmlFile = null;
{
try {
htmlFile = Jsoup.parse(new File("x.html"), "ISO-8859-1");
}
catch (IOException e)
{
e.printStackTrace();
}
String title = htmlFile.title();
System.out.println("Title = "+title);
}
}
}

Here's a sample. It reads the HTML file as text String and then performs search on that String.
package com.example;
import java.io.FileInputStream;
import java.nio.charset.Charset;
public class SearchTest {
public static void main(String[] args) throws Exception {
StringBuffer htmlStr = getStringFromFile("test.html", "ISO-8859-1");
boolean isPresent = htmlStr.indexOf("hello") != -1;
System.out.println("is Present ? : " + isPresent);
}
private static StringBuffer getStringFromFile(String fileName, String charSetOfFile) {
StringBuffer strBuffer = new StringBuffer();
try(FileInputStream fis = new FileInputStream(fileName)) {
byte[] buffer = new byte[10240]; //10K buffer;
int readLen = -1;
while( (readLen = fis.read(buffer)) != -1) {
strBuffer.append( new String(buffer, 0, readLen, Charset.forName(charSetOfFile)));
}
} catch(Exception ex) {
ex.printStackTrace();
strBuffer = new StringBuffer();
}
return strBuffer;
}
}

How to get href using jsoup

I have some url. I want to get all href's from the html url is pointing to and all href from all gotten hrefs(recursively). The point is I want to set depth of that "recursion"
For example, if depth = 1, I need only href's from the HTML. If depth = 2, I need hrefs from HTML(that make suppose list1) and hrefs from each of href from list1 and so on
Here is what I have using jsoup:
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
public class Parser {
private final static String FILE_PATH = "src/main/resources/href.txt";
private List<String> result;
private int currentDepth;
private int maxDepth;
public Parser(int maxDepth) {
result = new ArrayList<String>();
this.maxDepth = maxDepth;
}
public void parseURL(String url) throws IOException {
url = url.toLowerCase();
if (!result.contains(url)) {
Connection connection = Jsoup.connect(url);
Document document = connection.get();
Elements links = document.select("a[href]");
for (Element link : links) {
String href = link.attr("href");
result.add(href);
parseURL(link.absUrl("href"));
currentDepth++;
if (currentDepth == maxDepth)
return;
}
}
}
}
How should I fix recursion condition to make it right?

I think you should check the depth first before calling the recursive function.
if (currentDepth >= maxDepth){
// do nothing
}else{
parseURL(...)
}

public void parseURL(String url) throws IOException {
url = url.toLowerCase();
if (!result.contains(url)) {
Connection connection = Jsoup.connect(url);
Document document = connection.get();
Elements links = document.getElementsByAttribute("href");
// Elements links = document.select("a[href]");
for (Element link : links) {
String href = link.attr("href");
result.add(href);
parseURL(link.absUrl("href"));
currentDepth++;
if (currentDepth == maxDepth)
return;
}
}
}
You can try this in your code, you can get all Elements from method getElementsByAttribute(String attribute) which have specified attribute

Similar code in NSOUP

I have a code in Jsoup
import java.io.IOException;
import java.util.logging.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import java.util.*;
public class JavaApplication17 {
public static void main(String[] args) {
try {
String str=/* url */;
Document doc = Jsoup.connect(url).get();
Elements paragraphs = doc.select("td");
List<String> text = new ArrayList<>();
for(Element p : paragraphs)
{
text.add(p.text());
}
String name[] = new String[1000];
for(int i=0,j=0;i<100;i++)
{
name[i]=text.get(i);
System.out.println(name[i]);
}
}
catch (IOException ex) {
Logger.getLogger(JavaApplication17.class.getName())
.log(Level.SEVERE, null, ex);
}
}
}
What is the equilivalent NSOUP ie. the similar .NET code...
If NSOUP is not the best option for similar use in .net then what should be used... ??

Develop Reference

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Pulling all urls from any webpage, having trouble with indexOf [homework] - java

Related

Spell Checker with Java, have to print out specific line

I'm trying to read a text file and store it in an arraylist of objects

Search for a string in html file using Jsoup

How to get href using jsoup

Similar code in NSOUP

Categories

Resources