the indexOf always returns negative 7 no matter what i put, i will be using the website http://www.columbusstate.edu
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
public class WebCrawler
{
private static int linkCount = 0;
public static void main(String[] args) throws IOException
{
instance variables
ArrayList<String> links = new ArrayList<String>();
System.out.println("Enter the website you would like to web crawl");
Scanner input = new Scanner(System.in);
String address=input.next();
goes to website
URL locator = new URL(address);
Scanner in=new Scanner(locator.openStream());
String str="";
PrintWriter out=new PrintWriter("links.txt");
searches the webpage and pulls the links, or it should anyways.
while(in.hasNextLine())
{
str=in.next();
if(str.contains("href=\"http://"))
{
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("/\"");
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
else if(str.contains("href=\"https://")){
linkCount++;
int start = str.indexOf("ht");
int end = str.indexOf("://")+15;
if(links.contains(str.substring(start, end))){
}
else{
links.add("Line Number "+linkCount+""+str.substring(start, end));
}
}
}
int num = links.size();
System.out.println(num);
out.println("Number of links on this webpage is "+linkCount);
out.println("Links are:");
for(int i = links.size()-1; i>0; i--){
out.println(links.get(i));
}
out.close();
}
}
If you really looking for a way to extract links from a web page then it's better to use a proper HTML parser than trying to do it manually. Here an example with JSOUP
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HTMLUtils {
private HTMLUtils() {}
public static List<String>extractLinks(String url) throws IOException {
final ArrayList<String> result = new ArrayList<String>();
Document doc = Jsoup.connect(url).get();
Elements links = doc.select("a[href]");
// href ...
for (Element link : links) {
result.add(link.attr("abs:href"));
// result.add(link.text());
}
return result;
}
public final static void main(String[] args) throws Exception{
String site = "http://www.columbusstate.edu";
List<String> links = HTMLUtils.extractLinks(site);
for (String link : links) {
System.out.println(link);
}
}
}
Related
All the above and then print out the line number of the misspelled word and the line. Ex 1: I am a fox that lives in a huse
All of this should be printed on the command line
I have used rapid jSpell checker for spell check
note:- generate your own API key
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SOTest
{
public static void main(String[] args) {
try {
//Load text file into List
List<String> list = Files.readAllLines(Paths.get("Words.txt"), StandardCharsets.UTF_8);
//Iterate List to scan the spell error
for (String line : list) {
for(String err:spellCheck(line)){
if(line.contains(err)) {
line = line.replace(err, "^^"+err+"^^");
}
}
System.out.println(line);
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static List<String> spellCheck(String input) throws IOException {
List<String> results = new ArrayList<String>();
byte[] postData = new String("{\t\"language\": \"enUS\",\t\"fieldvalues\": \""+input+"\",\t\"config\": {\t\t\"forceUpperCase\": false,\t\t\"ignoreIrregularCaps\": false,\t\t\"ignoreFirstCaps\": true,\t\t\"ignoreNumbers\": true,\t\t\"ignoreUpper\": false,\t\t\"ignoreDouble\": false,\t\t\"ignoreWordsWithNumbers\": true\t}}").getBytes();
String request = "https://jspell-checker.p.rapidapi.com/check";
URL url = new URL(request);
HttpURLConnection conn= (HttpURLConnection) url.openConnection();
conn.setDoOutput( true );
conn.setRequestMethod( "POST" );
conn.setRequestProperty( "x-rapidapi-host", "jspell-checker.p.rapidapi.com");
conn.setRequestProperty( "x-rapidapi-key", "32efb09328msh3e3b62d34ac8cfcp1467a4jsnb3ef821b4b23");
conn.setRequestProperty( "content-type", "application/json");
conn.setRequestProperty( "accept", "application/json");
conn.setRequestProperty( "useQueryString", "true");
conn.setUseCaches( false );
try( DataOutputStream wr = new DataOutputStream( conn.getOutputStream())) {
wr.write( postData );
}
BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream())));
String strCurrentLine;
while ((strCurrentLine = br.readLine()) != null) {
final String regex = "\"word\":\"[a-zA-Z]*\"";
final Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
final Matcher matcher = pattern.matcher(strCurrentLine);
while (matcher.find()) {
results.add(matcher.group(0).split(":")[1].replace("\"", ""));
}
}
return results;
}
}
input inside Words.txt
this is example
thiss is example
my Dog is hudden under cor
output
this is example
^^thiss^^ is example
my Dog is ^^hudden^^ under ^^cor^^
This types of problem's solution is best to solve with hashTable. However, you need to do it this way, here is the solution without jSpell.
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
public class FinalProject {
private static List<String> word;
public static void main(String[] args) {
if(args.length == 0)
{
System.out.println("File name not specified");
System.exit(1);
}
SpellCheck(args[0]);
}
public static List<String> ScanFile(String file){
Path path = Paths.get(file);
try{
word = Files.readAllLines(path, StandardCharsets.UTF_8);
} catch(IOException e) {
}
return word;
}
public static void SpellCheck(String file)
{
int lineNumber;
List<String> word_list = ScanFile("..\\test\\dict.txt");
List<String> check_file = ScanFile(file);
String[] lineWords;
Boolean correctSpelled;
String currWord, dictWord ;
for(int i = 0; i < check_file.size(); i++)
{
lineWords = check_file.get(i).trim().replaceAll("[^a-zA-Z ]","").split(" ");
lineNumber = i+1;
for (int j = 0; j < lineWords.length; j++)
{
currWord = lineWords[j].toLowerCase();
correctSpelled = false;
for(int k = 0; k < word_list.size() && !correctSpelled; k++)
{
dictWord = word_list.get(k).trim().toLowerCase();
if(currWord.equals(dictWord))
{
correctSpelled = true;
}
}
if(!correctSpelled)
{
System.out.println("Error found on line number " + lineNumber);
System.out.println(check_file.toString() + " -> " + "\"" + currWord + "\"");
}
}// every word for loop
}// every line for loop
}//end spellChecker
}// end class
inputFile.txt contains
I am a fox that lives in a huse
args[0] is your dictionary.txt file
output:
Error found on line number 1
[I am a fox that lives in a huse] -> "huse"
I would try for that underline ^^^ thing. However, I have to go out. The main problem is solved. Try it. If it works, hit me a thumbs up. :)
I'm trying to read a text file and store it in an arraylist of objects, but I keep getting an error saying I cannot convert a String to an Item, which is type of arraylist I am using. I have tried various solutions, but am not quite sure how its is suppossed to be done. I am new to coding and have this assignment due soon. Anything helps!
private void loadFile(String FileName)
{
Scanner in;
Item line;
try
{
in = new Scanner(new File(FileName));
while (in.hasNext())
{
line = in.nextLine();
MyStore.add(line);
}
in.close();
}
catch (IOException e)
{
System.out.println("FILE NOT FOUND.");
}
}
my apologies for not adding the Item class
public class Item
{
private int myId;
private int myInv;
//default constructor
public Item()
{
myId = 0;
myInv = 0;
}
//"normal" constructor
public Item(int id, int inv)
{
myId = id;
myInv = inv;
}
//copy constructor
public Item(Item OtherItem)
{
myId = OtherItem.getId();
myInv = OtherItem.getInv();
}
public int getId()
{
return myId;
}
public int getInv()
{
return myInv;
}
public int compareTo(Item Other)
{
int compare = 0;
if (myId > Other.getId())
{
compare = 1;
}
else if (myId < Other.getId())
{
compare = -1;
}
return compare;
}
public boolean equals(Item Other)
{
boolean equal = false;
if (myId == Other.getId())
{
equal = true;;
}
return equal;
}
public String toString()
{
String Result;
Result = String.format("%8d%8d", myId, myInv);
return Result;
}
}
This is the creation of my arraylist.
private ArrayList MyStore = new ArrayList ();
Here is a sample of my text file.
3679 87
196 60
12490 12
18618 14
2370 65
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.mycompany.rosmery;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
*
* #author Sem-6-INGENIERIAINDU
*/
public class aaa {
public static void main(String arg[]) throws FileNotFoundException, IOException{
BufferedReader files=new BufferedReader(new FileReader(new File("")));
List<String> dto=new ArrayList<>();
String line;
while((line= files.readLine())!= null){
line= files.readLine();
dto.add(line);
//Hacer la logica para esos datos
}
}
}
in.nextLine() returns a String.
So, you cannot assign in.nextLine() to an instance of Item.
Your code may need to correct it as:
List<String> myStore = new ArrayList<String>();
private void loadFile(String FileName)
{
Scanner in;
try
{
in = new Scanner(new File(FileName));
while (in.hasNext())
{
myStore.add(in.nextLine());
}
in.close();
}
catch (IOException e)
{
System.out.println("FILE NOT FOUND.");
}
}
If you want to have a list of Item after reading a file, then you need provide the logic that convert given line of information into an instance of Item.
let's say your file content is in the following format.
id1,inv1
id2,inv2
.
.
Then, you can use the type Item as the following.
List<Item> myStore = new ArrayList<Item>();
private void loadFile(String FileName)
{
Scanner in;
String[] line;
try
{
in = new Scanner(new File(FileName));
while (in.hasNext())
{
line = in.nextLine().split(",");
myStore.add(new Item(line[0], line[1]));
}
in.close();
}
catch (IOException e)
{
System.out.println("FILE NOT FOUND.");
}
}
One of the possible solutions (assuming that the data in file lines is separated by a comma), with using streams:
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Main {
public static void main(String[] args) throws IOException {
List<Item> items = loadFile("myfile.txt");
System.out.println(items);
}
private static List<Item> loadFile(String fileName) throws IOException {
try (Stream<String> stream = Files.lines(Paths.get(fileName))) {
return stream
.map(s -> Stream.of(s.split(",")).mapToInt(Integer::parseInt).toArray())
.map(i -> new Item(i[0], i[1]))
.collect(Collectors.toList());
}
}
}
or with foreach:
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Main {
public static void main(String[] args) throws IOException {
List<Item> items = new ArrayList<>();
for (String line : loadFile("myfile.txt")) {
String[] data = line.split(",");
int id = Integer.parseInt(data[0]);
int inv = Integer.parseInt(data[1]);
items.add(new Item(id, inv));
}
System.out.println(items);
}
private static List<String> loadFile(String fileName) throws IOException {
try (Stream<String> stream = Files.lines(Paths.get(fileName))) {
return stream.collect(Collectors.toList());
}
}
}
Can anyone help me with searching for a particular string in HTML file using Jsoup or any other method. There are inbuilt methods but they help in extracting title or script texts inside a specific tags and not string in general.
In this code I have used one such inbuilt method to extract title from the html page.
But I want to search a string instead.
package dynamic_tester;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class tester {
public static void main(String args[])
{
Document htmlFile = null;
{
try {
htmlFile = Jsoup.parse(new File("x.html"), "ISO-8859-1");
}
catch (IOException e)
{
e.printStackTrace();
}
String title = htmlFile.title();
System.out.println("Title = "+title);
}
}
}
Here's a sample. It reads the HTML file as text String and then performs search on that String.
package com.example;
import java.io.FileInputStream;
import java.nio.charset.Charset;
public class SearchTest {
public static void main(String[] args) throws Exception {
StringBuffer htmlStr = getStringFromFile("test.html", "ISO-8859-1");
boolean isPresent = htmlStr.indexOf("hello") != -1;
System.out.println("is Present ? : " + isPresent);
}
private static StringBuffer getStringFromFile(String fileName, String charSetOfFile) {
StringBuffer strBuffer = new StringBuffer();
try(FileInputStream fis = new FileInputStream(fileName)) {
byte[] buffer = new byte[10240]; //10K buffer;
int readLen = -1;
while( (readLen = fis.read(buffer)) != -1) {
strBuffer.append( new String(buffer, 0, readLen, Charset.forName(charSetOfFile)));
}
} catch(Exception ex) {
ex.printStackTrace();
strBuffer = new StringBuffer();
}
return strBuffer;
}
}
I have some url. I want to get all href's from the html url is pointing to and all href from all gotten hrefs(recursively). The point is I want to set depth of that "recursion"
For example, if depth = 1, I need only href's from the HTML. If depth = 2, I need hrefs from HTML(that make suppose list1) and hrefs from each of href from list1 and so on
Here is what I have using jsoup:
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
public class Parser {
private final static String FILE_PATH = "src/main/resources/href.txt";
private List<String> result;
private int currentDepth;
private int maxDepth;
public Parser(int maxDepth) {
result = new ArrayList<String>();
this.maxDepth = maxDepth;
}
public void parseURL(String url) throws IOException {
url = url.toLowerCase();
if (!result.contains(url)) {
Connection connection = Jsoup.connect(url);
Document document = connection.get();
Elements links = document.select("a[href]");
for (Element link : links) {
String href = link.attr("href");
result.add(href);
parseURL(link.absUrl("href"));
currentDepth++;
if (currentDepth == maxDepth)
return;
}
}
}
}
How should I fix recursion condition to make it right?
I think you should check the depth first before calling the recursive function.
if (currentDepth >= maxDepth){
// do nothing
}else{
parseURL(...)
}
public void parseURL(String url) throws IOException {
url = url.toLowerCase();
if (!result.contains(url)) {
Connection connection = Jsoup.connect(url);
Document document = connection.get();
Elements links = document.getElementsByAttribute("href");
// Elements links = document.select("a[href]");
for (Element link : links) {
String href = link.attr("href");
result.add(href);
parseURL(link.absUrl("href"));
currentDepth++;
if (currentDepth == maxDepth)
return;
}
}
}
You can try this in your code, you can get all Elements from method getElementsByAttribute(String attribute) which have specified attribute
I have a code in Jsoup
import java.io.IOException;
import java.util.logging.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import java.util.*;
public class JavaApplication17 {
public static void main(String[] args) {
try {
String str=/* url */;
Document doc = Jsoup.connect(url).get();
Elements paragraphs = doc.select("td");
List<String> text = new ArrayList<>();
for(Element p : paragraphs)
{
text.add(p.text());
}
String name[] = new String[1000];
for(int i=0,j=0;i<100;i++)
{
name[i]=text.get(i);
System.out.println(name[i]);
}
}
catch (IOException ex) {
Logger.getLogger(JavaApplication17.class.getName())
.log(Level.SEVERE, null, ex);
}
}
}
What is the equilivalent NSOUP ie. the similar .NET code...
If NSOUP is not the best option for similar use in .net then what should be used... ??