Crawling amazon.com

Crawling amazon.com - java

I'm crawling amazon products and principle it's going fine.
I have three classes from this nice tutorial:
http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/
I added the files to the following code (class Spider):
import java.io.FileNotFoundException;
import java.util.*;
public class Spider {
public static final int MAX_PAGES_TO_SEARCH = 10000;
private Set<String> pagesVisited = new HashSet<String>();
private List<String> pagesToVisit = new LinkedList<String>();
public void search(String url) {
while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
String currentUrl;
SpiderLeg leg = new SpiderLeg();
if (this.pagesToVisit.isEmpty()) {
//System.out.println("abc");
currentUrl = url;
this.pagesVisited.add(url);
} else {
//System.out.println("def");
currentUrl = this.nextUrl();
}
try {
Thread.sleep(10000);
leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in
} catch (FileNotFoundException e) {
System.out.println("Oops, FileNotFoundException caught");
} catch (InterruptedException e) {
e.printStackTrace();
}
this.pagesToVisit.addAll(leg.getLinks());
//System.out.println("Test");
}
System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");
SpiderLeg leg = new SpiderLeg();
leg.calcAdjMatrix();
for (int i = 0; i < leg.adjMatrix.length; i++) {
System.out.println(Arrays.toString(leg.adjMatrix[i]));
}
}
private String nextUrl() {
String nextUrl;
do {
if (this.pagesToVisit.isEmpty()){
return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
}
nextUrl = this.pagesToVisit.remove(0);
} while (this.pagesVisited.contains(nextUrl));
this.pagesVisited.add(nextUrl);
return nextUrl;
}
}
class SpiderLeg:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.*;
public class SpiderLeg {
// We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";
private static List<String> links = new LinkedList<String>();
private static String graphLink;
private Document htmlDocument;
private static double counter = 0;
static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();
static int[][] adjMatrix;
static List<String> mapping;
public boolean crawl(String url) throws FileNotFoundException {
if (url.isEmpty()) {
return false;
}
try{
Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
Document htmlDocument = connection.get();
this.htmlDocument = htmlDocument;
if(connection.response().statusCode() == 200){
// 200 is the HTTP OK status code
// indicating that everything is great.
counter++;
double progress;
progress = (counter/Spider.MAX_PAGES_TO_SEARCH)*100;
System.out.println("\n**Visiting** Received web page at " + url);
System.out.println("\n**Progress** " + progress + "%");
}
if(!connection.response().contentType().contains("text/html")) {
System.out.println("**Failure** Retrieved something other than HTML");
return false;
}
//Elements linksOnPage = htmlDocument.select("a[href*=/gp/product/]");
Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
Elements category = htmlDocument.select("span.zg_hrsr_ladder a");
String categoryString = category.html();
String salesRankString = salesRank.html();
salesRankString = salesRankString.replace("\n", " ");
categoryString = categoryString.replace("\n", " ");
//System.out.println(categoryString);
System.out.println("Found (" + linksOnPage.size() + ") links");
PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
StringBuilder sb = new StringBuilder();
int beginIndex = url.indexOf(".de/");
int endIndex = url.indexOf("/dp");
String title = url.substring(beginIndex+4,endIndex);
if(!adjMap.containsKey(title)){
if(categoryString.contains("Horror")){
adjMap.put(title, new HashSet<String>());
sb.append(title);
sb.append(',');
sb.append(salesRankString);
sb.append(',');
sb.append(categoryString);
sb.append(',');
for(Element link : linksOnPage){
String graphLink = link.attr("abs:href");
if(!graphLink.contains("one-click")){
if(!graphLink.contains("Kindle")){
if(!graphLink.contains("unsticky")){
this.links.add(graphLink);
//adjMap.get(url).add(graphLink);
adjMap.get(title).add(cutTitle(graphLink));
sb.append(graphLink);
sb.append(',');
}
}
}
}
sb.append('\n');
pw.write(sb.toString());
pw.close();
}
}
System.out.println("done!");
return true;
}
catch(IOException ioe) {
// We were not successful in our HTTP request
System.out.println("Error in out HTTP request " + ioe);
return false;
}
}
public static void calcAdjMatrix(){
Set<String> allMyURLs = new HashSet(adjMap.keySet());
for(String s: adjMap.keySet()){
allMyURLs.addAll(adjMap.get(s));
System.out.println(s + "\t" + adjMap.get(s));
}
int dim = allMyURLs.size();
adjMatrix = new int[dim][dim];
List<String> nodes_list = new ArrayList<>();
for(String s: allMyURLs){
nodes_list.add(s);
}
for(String s: nodes_list){
Set<String> outEdges = adjMap.get(s);
int i = nodes_list.indexOf(s);
if(outEdges != null){
for(String s1: outEdges){
int j = nodes_list.indexOf(s1);
adjMatrix[i][j] = 1;
}
}
}
}
public String cutTitle(String url) throws FileNotFoundException{
int beginIndex = url.indexOf(".de/");
int endIndex = url.indexOf("/dp");
String title;
if(url.contains(".de") && url.contains("/dp")){
title = url.substring(beginIndex+4,endIndex);
}else{
title = "wrong url";
}
return title;
}
public boolean searchForWord(String searchWord) {
if(this.htmlDocument == null) {
System.out.println("ERROR! Call crawl() before performing analysis on the document");
return false;
}
System.out.println("Searching for the word " + searchWord + "...");
String bodyText = this.htmlDocument.body().text();
return bodyText.toLowerCase().contains(searchWord.toLowerCase());
}
public List<String> getLinks(){
return this.links;
}
}
class SpiderTest:
public class SpiderTest {
public static void main(String[] args) {
Spider spider = new Spider();
spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
}
}
Now the problem is, that after 100 URLs I think, that amazon is banning me from the server. The program doesn't find URLs anymore.
Does anyone has an idea how I can fix that?

Well, don't be rude and crawl them then.
Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised if they ban you if you go places they don't want you to go.

The problem is very common when you try to crawl big websites that don't want to be crawled. They basically block you for a period of time to prevent their data being crawled or stolen.
With that being said, you have two options, either make each request from a different IP/server which will make your requests look legit and avoid the ban, or go for the easiest way which is to use a service that does that for you.
I've done both and the first one is complex, takes time and needs maintenance (you have to build a network of servers), the second option is usually not free but very fast to implement and guarantees that all your requests will always return data and you won't be banned.
There are some services on the internet that does that. I've used proxycrawl (which also has a free tier) in the past which works very good. They have an API that you can call and you only can use your same code, just changing the url you call.
This would be an example for amazon:
GET https://api.proxycrawl.com?token=yourtoken&url=https://amazon.com
And you would get always a response, even if you crawl 1000 pages a second, you will never be banned as you will be calling that proxy instead which does all the magic for you.
I hope it helps :)

You can try using proxy servers to prevent being blocked. There are services providing working proxies. I have good experience using https://gimmeproxy.com which specifically has proxies supporting amazon.
To get proxy working with Amazon, you need just to make the following request:
https://gimmeproxy.com/api/getProxy?api_key=your_api_key&websites=amazon
You will get JSON response with all proxy data which you can use later as needed:
{
"supportsHttps": true,
"protocol": "socks5",
"ip": "116.182.122.182",
"port": "1915",
"get": true,
"post": true,
"cookies": true,
"referer": true,
"user-agent": true,
"anonymityLevel": 1,
"websites": {
"example": true,
"google": false,
"amazon": true
},
"country": "BR",
"tsChecked": 1517952910,
"curl": "socks5://116.182.122.182:1915",
"ipPort": "116.182.122.182:1915",
"type": "socks5",
"speed": 37.78,
"otherProtocols": {}
}

Related

How to access the repository names in GitHub with Java?

I have an application, where I have many different repositories in the GitHub. In my application, I need to get the Name of each repository. I searched a lot in the Internet and of course in Stack Overflow to access the GitHub repos but I found no suitable solution for me.
Hopefully somebody have a good idea or has experience with it.
#Value("${github.githubUrl}")
String url;
public void getEach() {
try {
CloseableHttpClient httpClient = HttpClientBuilder.create().build();
HttpGet request = new HttpGet(this.url);
request.addHeader("content-type", "application/json");
HttpResponse result = httpClient.execute(request);
String json = EntityUtils.toString(result.getEntity(), "UTF-8");
System.out.println("Json kommmt============================= "+ json);
JsonElement jelement = new JsonParser().parse(json);
JsonArray jarr = jelement.getAsJsonArray();
for (int i = 0; i < jarr.size(); i++) {
JsonObject jo = (JsonObject) jarr.get(i);
String fullName = jo.get("full_name").toString();
fullName = fullName.substring(1, fullName.length()-1);
System.out.println("fullname kommt ================ " + fullName);
}
} catch (IOException ex) {
System.out.println(ex.getStackTrace());
}
}

I could see an existing code here.
https://github.com/hub4j/github-api/blob/9ab6d570193dc381a0cda7cc4991f471499dcf24/src/main/java/org/kohsuke/github/GHPerson.java#L70-L89
public PagedIterable<GHRepository> listRepositories(final int pageSize) {
return new PagedIterable<GHRepository>() {
public PagedIterator<GHRepository> _iterator(int pageSize) {
return new PagedIterator<GHRepository>(root.retrieve().asIterator("/users/" + login + "/repos?per_page=" + pageSize, GHRepository[].class, pageSize)) {
#Override
protected void wrapUp(GHRepository[] page) {
for (GHRepository c : page)
c.wrap(root);
}
};
}
};
}
As per github documentation, the list of repositories would be like this:
List organization repositories
Lists repositories for the specified organization.
GET /orgs/{org}/repos
Note:
Or if it's just a requirement to list the repos, try this -
curl "https://api.github.com/users/$GHUSER/repos?access_token=$GITHUB_API_TOKEN" | grep -w clone_url
Steps 1,2,3 will help you to create token as in https://crunchify.com/how-to-access-github-content-with-basic-oauth-authentication-in-java-httpclient-or-urlconnection-method/
And if really rely on java, you can try the curl in java! (you should find a way to grep)
URL url = new URL("https://api.github.com/users/" + GHUSER + "/repos?access_token=" + GITHUB_API_TOKEN); //pagesize too
try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"))) {
for (String line; (line = reader.readLine()) != null;) {
System.out.println(line);
}
}

Check out https://github.com/hub4j/github-api, it has all you need. This is a code sample that I assume does what you want - fetches the list of repos within a specified organization.
import org.kohsuke.github.GHRepository;
import org.kohsuke.github.GitHub;
import org.kohsuke.github.PagedIterable;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import java.io.IOException;
#SpringBootApplication
public class GhScanner implements CommandLineRunner {
#Value("${github.oauthToken}")
private String oauthToken;
public static void main(String[] args) {
SpringApplication.run(GhScanner.class, args);
}
#Override
public void run(String... args) throws Exception {
if (args.length < 1) {
throw new IllegalArgumentException("Expected organization name as the 1st argument");
}
var githubOrganization = args[0];
System.out.printf("Fetching repos for organization '%s':%n", githubOrganization);
var iterator = getRepos(githubOrganization).iterator();
while (iterator.hasNext()) {
iterator.nextPage().stream()
.map(GHRepository::getName)
.forEach(System.out::println);
}
}
private PagedIterable<GHRepository> getRepos(String githubOrganization) throws IOException {
GitHub gitHub = GitHub.connectUsingOAuth(oauthToken);
return gitHub.getOrganization(githubOrganization).listRepositories();
}
}

Apache HttpClient doesn't get content of pages using pagination links. I get 200 status but html has no content

I'm trying to web crawl pages of content with Apache HttpClient. I get status 200 when requesting the next page using the link from pagination but HTML shows 500 in the body and no content. Postman works fine and gets content even when using links from pagination.
Main class
public static void main(String[] args) {
String url = "https://www.cochranelibrary.com/cdsr/reviews/topics";
MyContentFetcher myContentFetcher = new MyContentFetcher();
MyParser myParser = new MyParser();
try {
// Load Topic list page
String topicsPage = myContentFetcher.loadHTML(url);
// Getting all the topics.
Map<Integer, MyNode> topics = myParser.getTopicList(topicsPage);
// Print all the topics and ask user to choose one
for (int id : topics.keySet())
System.out.println("-> " + id + " <- " + topics.get(id).getTopic());
System.out.println("********************");
System.out.print("Enter ID number from the list above to get reviews or enter anything else to exit:\n");
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String id = reader.readLine();
// Validate user input, get the link and topic and cout the choice.
if (isNumber(id)) {
int idNum = Integer.parseInt(id);
if (idNum <= topics.size() && idNum > 0) {
String topic = topics.get(idNum).getTopic();
String link = topics.get(idNum).getLink();
System.out.println("You picked: " + topic + link + "\n***************************");
// Loading first page of reviews
myParser.loadReviews(myContentFetcher.loadHTML(link), topic);
// Getting links to other pages
Queue<String> paginationLinks = myParser.getLinks();
// --------------> WORKS FINE UNTIL HERE <--------------
// Problem starts here....
// Load list of reviews for chosen topic
while(!paginationLinks.isEmpty()) {
String page = myContentFetcher.loadHTML(paginationLinks.remove());
myParser.loadReviews(page, topic);
}
}
}
System.out.println("Exiting...");
} catch (IOException e) {
System.out.println("There was a problem...");
}
!!!! Here is the class that fetches HTML. I'm probably doing something wrong here...
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.IOException;
import java.util.Scanner;
public class MyContentFetcher {
public MyContentFetcher() {
}
String loadHTML(String url) throws IOException {
// Create configurations for
RequestConfig config = RequestConfig.custom()
.setCircularRedirectsAllowed(true)
.setCookieSpec(CookieSpecs.STANDARD)
.build();
// Creating a HttpClient object
CloseableHttpClient httpClient = HttpClients.custom()
.setDefaultRequestConfig(config)
.build();
// Creating a HttpGet object
HttpGet httpget = new HttpGet(url);
httpget.setHeader("User-Agent", "Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.021.D1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 EdgA/42.0.0.2057");
CloseableHttpResponse httpResponse = httpClient.execute(httpget);
Scanner sc = new Scanner(httpResponse.getEntity().getContent());
StringBuilder page = new StringBuilder("");
while(sc.hasNext())
page.append(sc.nextLine()).append(" ");
httpResponse.close();
httpClient.close();
return page.toString();
}
}
Here is the parser. Parser doesn't have any problems (Parse perfectly fine and as needed)
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
public class MyParser {
private Map<String, String> topics;
private Document htmlPage;
private Element reviewBlock;
public MyParser(){}
// Loads all topics from the Cochrane Library into a map -> (Topic Name, Link)
public Map<Integer, MyNode> getTopicList(String page) {
Map<Integer, MyNode> topics= new HashMap<Integer, MyNode>();
htmlPage = Jsoup.parse(page);
// Get 'a' element that is inside 'li' with a class name of browse-by-list-item
int i = 1;
MyNode info;
for(Element element : htmlPage.body().select("li.browse-by-list-item > a")) {
info = new MyNode(element.select("button").text(),
element.select("a").attr("href").trim());
topics.put(i, info);
i++;
}
return topics;
}
// Loads Reviews
public void loadReviews(String page, String topic) throws IOException {
htmlPage = Jsoup.parse(page);
// Get all review blocks
System.out.println("**************************\n" + page + "\n**************************\n");
for(Element element : htmlPage.body().select(".search-results-item-body")){
reviewBlock = element;
String review = getLink() + " | " + topic + " | " + getTitle() + " | " + getAuthor() + " | " + getDate();
System.out.println(review);
}
}
Queue<String> getLinks(){
System.out.println("GETTING LINKS");
Queue<String> links = new LinkedList<>();
for(Element element : htmlPage.body().select("li.pagination-page-list-item > a")) {
links.add(element.attr("href"));
}
return links;
}
private String getLink(){
return "https://www.cochranelibrary.com" + reviewBlock.select("a").attr("href");
}
public String getTitle(){
return reviewBlock.selectFirst("a").text();
}
public String getAuthor(){
return reviewBlock.selectFirst("div.search-result-authors").text();
}
public String getDate(){
String result = reviewBlock.select("div.search-result-date > div").text();
try {
SimpleDateFormat fmt = new SimpleDateFormat("dd MMMM yyyy", Locale.US);
Date d = fmt.parse(result);
fmt.applyPattern("yyyy-MM-dd");
result = fmt.format(d);
} catch (ParseException e) {
System.out.println("Failed parsing the date...");
}
return result;
}

Had I had the permissions, this would have been just a comment.
I ran your loadHtml function with the URL you provided and the result I get is more or less equal to the html of the page.
Could you please provide more details about the httpclient library? I'm using Java 12 (I'm positive it would work with Java 8 too) with this dependency
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.11</version>
</dependency>

To solve this problem I created a session and used cookie store and used CloseableHttpResponse to close the response after each time I fetch the page. Here is the fragment of the code that make it work:
RequestConfig config = RequestConfig.custom()
.setCircularRedirectsAllowed(true)
.build();
httpClient = HttpClients.custom()
.setDefaultRequestConfig(config)
.setMaxConnPerRoute(100)
.build();
CookieStore cookieStore = new BasicCookieStore();
httpContext = new BasicHttpContext();
httpContext.setAttribute(HttpClientContext.COOKIE_STORE, cookieStore);
HttpGet httpget = new HttpGet(url);
httpget.setHeader("User-Agent", "Whatever");
StringBuilder page = new StringBuilder("");
try {
CloseableHttpResponse response = httpClient.execute(httpget, httpContext);
System.out.println(response.getStatusLine());
Scanner sc = new Scanner(response.getEntity().getContent());
while (sc.hasNext())
page.append(sc.nextLine()).append(" ");
sc.close();
response.close();
} catch (IOException e) {
e.printStackTrace();
}
return page.toString();

WebCrawler with recursion

So I am working on a webcrawler that is supposed to download all images, files, and webpages, and then recursively do the same for all webpages found. However, I seem to have a logic error.
public class WebCrawler {
private static String url;
private static int maxCrawlDepth;
private static String filePath;
/* Recursive function that crawls all web pages found on a given web page.
* This function also saves elements from the DownloadRepository to disk.
*/
public static void crawling(WebPage webpage, int currentCrawlDepth, int maxCrawlDepth) {
webpage.crawl(currentCrawlDepth);
HashMap<String, WebPage> pages = webpage.getCrawledWebPages();
if(currentCrawlDepth < maxCrawlDepth) {
for(WebPage wp : pages.values()) {
crawling(wp, currentCrawlDepth+1, maxCrawlDepth);
}
}
}
public static void main(String[] args) {
if(args.length != 3) {
System.out.println("Must pass three parameters");
System.exit(0);
}
url = "";
maxCrawlDepth = 0;
filePath = "";
url = args[0];
try {
URL testUrl = new URL(url);
URLConnection urlConnection = testUrl.openConnection();
urlConnection.connect();
} catch (MalformedURLException e) {
System.out.println("Not a valid URL");
System.exit(0);
} catch (IOException e) {
System.out.println("Could not open URL");
System.exit(0);
}
try {
maxCrawlDepth = Integer.parseInt(args[1]);
} catch (NumberFormatException e) {
System.out.println("Argument is not an int");
System.exit(0);
}
filePath = args[2];
File path = new File(filePath);
if(!path.exists()) {
System.out.println("File Path is invalid");
System.exit(0);
}
WebPage webpage = new WebPage(url);
crawling(webpage, 0, maxCrawlDepth);
System.out.println("Web crawl is complete");
}
}
the function crawl will parse the contents of a website storing any found images, files, or links into a hashmap, for example:
public class WebPage implements WebElement {
private static Elements images;
private static Elements links;
private HashMap<String, WebImage> webImages = new HashMap<String, WebImage>();
private HashMap<String, WebPage> webPages = new HashMap<String, WebPage>();
private HashMap<String, WebFile> files = new HashMap<String, WebFile>();
private String url;
public WebPage(String url) {
this.url = url;
}
/* The crawl method parses the html on a given web page
* and adds the elements of the web page to the Download
* Repository.
*/
public void crawl(int currentCrawlDepth) {
System.out.print("Crawling " + url + " at crawl depth ");
System.out.println(currentCrawlDepth + "\n");
Document doc = null;
try {
HttpConnection httpConnection = (HttpConnection) Jsoup.connect(url);
httpConnection.ignoreContentType(true);
doc = httpConnection.get();
} catch (MalformedURLException e) {
System.out.println(e.getLocalizedMessage());
} catch (IOException e) {
System.out.println(e.getLocalizedMessage());
} catch (IllegalArgumentException e) {
System.out.println(url + "is not a valid URL");
}
DownloadRepository downloadRepository = DownloadRepository.getInstance();
if(doc != null) {
images = doc.select("img");
links = doc.select("a[href]");
for(Element image : images) {
String imageUrl = image.absUrl("src");
if(!webImages.containsValue(image)) {
WebImage webImage = new WebImage(imageUrl);
webImages.put(imageUrl, webImage);
downloadRepository.addElement(imageUrl, webImage);
System.out.println("Added image at " + imageUrl);
}
}
HttpConnection mimeConnection = null;
Response mimeResponse = null;
for(Element link: links) {
String linkUrl = link.absUrl("href");
linkUrl = linkUrl.trim();
if(!linkUrl.contains("#")) {
try {
mimeConnection = (HttpConnection) Jsoup.connect(linkUrl);
mimeConnection.ignoreContentType(true);
mimeConnection.ignoreHttpErrors(true);
mimeResponse = (Response) mimeConnection.execute();
} catch (Exception e) {
System.out.println(e.getLocalizedMessage());
}
String contentType = null;
if(mimeResponse != null) {
contentType = mimeResponse.contentType();
}
if(contentType == null) {
continue;
}
if(contentType.toString().equals("text/html")) {
if(!webPages.containsKey(linkUrl)) {
WebPage webPage = new WebPage(linkUrl);
webPages.put(linkUrl, webPage);
downloadRepository.addElement(linkUrl, webPage);
System.out.println("Added webPage at " + linkUrl);
}
}
else {
if(!files.containsValue(link)) {
WebFile webFile = new WebFile(linkUrl);
files.put(linkUrl, webFile);
downloadRepository.addElement(linkUrl, webFile);
System.out.println("Added file at " + linkUrl);
}
}
}
}
}
System.out.print("\nFinished crawling " + url + " at crawl depth ");
System.out.println(currentCrawlDepth + "\n");
}
public HashMap<String, WebImage> getImages() {
return webImages;
}
public HashMap<String, WebPage> getCrawledWebPages() {
return webPages;
}
public HashMap<String, WebFile> getFiles() {
return files;
}
public String getUrl() {
return url;
}
#Override
public void saveToDisk(String filePath) {
System.out.println(filePath);
}
}
The point of using a hashmap is to ensure that I do not parse the same website more than once. The error seems to be with my recursion. What is the issue?
Here is also some sample output for starting the crawl at http://www.google.com
Crawling https://www.google.com/ at crawl depth 0
Added webPage at http://www.google.com/intl/en/options/
Added webPage at https://www.google.com/intl/en/ads/
Added webPage at https://www.google.com/services/
Added webPage at https://www.google.com/intl/en/about.html
Added webPage at https://www.google.com/intl/en/policies/
Finished crawling https://www.google.com/ at crawl depth 0
Crawling https://www.google.com/services/ at crawl depth 1
Added webPage at http://www.google.com/intl/en/enterprise/apps/business/?utm_medium=et&utm_campaign=en&utm_source=us-en-et-nelson_bizsol
Added webPage at https://www.google.com/services/sitemap.html
Added webPage at https://www.google.com/intl/en/about/
Added webPage at https://www.google.com/intl/en/policies/
Finished crawling https://www.google.com/services/ at crawl depth 1
**Crawling https://www.google.com/intl/en/policies/ at crawl depth 2**
Added webPage at https://www.google.com/intl/en/policies/
Added webPage at https://www.google.com/intl/en/policies/terms/
Added webPage at https://www.google.com/intl/en/policies/privacy/
Added webPage at https://www.google.com/intl/en/policies/terms/
Added webPage at https://www.google.com/intl/en/policies/faq/
Added webPage at https://www.google.com/intl/en/policies/technologies/
Added webPage at https://www.google.com/intl/en/about/
Added webPage at https://www.google.com/intl/en/policies/
Finished crawling https://www.google.com/intl/en/policies/ at crawl depth 2
**Crawling https://www.google.com/intl/en/policies/ at crawl depth 3**
Notice that it parses http://www.google.com/intl/en/policies/ twice

You are creating a new map for each web-page. This will ensure that if the same link occurs on the page twice it will only be crawled once but it will not deal with the case where the same link appears on two different pages.
https://www.google.com/intl/en/policies/ appears on both https://www.google.com/ and https://www.google.com/services/.
To avoid this use a single map throughout your crawl and pass it as a parameter into the recursion.
public class WebCrawler {
private HashMap<String, WebPage> visited = new HashMap<String, WebPage>();
public static void crawling(Map<String, WebPage> visited, WebPage webpage, int currentCrawlDepth, int maxCrawlDepth) {
}
}
As you are also holding a map of the images etc you may prefer to create a new object, perhaps call it visited, and make it keep track.
public class Visited {
private HashMap<String, WebPage> webPages = new HashMap<String, WebPage>();
public boolean visit(String url, WebPage page) {
if (webPages.containsKey(page)) {
return false;
}
webPages.put(url, page);
return true;
}
private HashMap<String, WebImage> webImages = new HashMap<String, WebImage>();
public boolean visit(String url, WebImage image) {
if (webImages.containsKey(image)) {
return false;
}
webImages.put(url, image);
return true;
}
}

using dbpedia spotlight in java or scala

Does anyone know where to find a little how to on using dbpedia spotlight in java or scala? Or could anyone explain how it's done? I can't find any information on this...

The DBpedia Spotlight wiki pages would be a good place to start.
And I believe the installation page has listed the most popular ways (using a jar, or set up a web service) to use the application.
It includes instructions on using the Java/Scala API with your own installation, or calling the Web Service.
There are some additional data needed to be downloaded to run your own server for full service, good time to make a coffee for yourself.

you need download dbpedia spotlight (jar file) after that u can use next two classes ( author pablomendes ) i only make some change .
public class db extends AnnotationClient {
//private final static String API_URL = "http://jodaiber.dyndns.org:2222/";
private static String API_URL = "http://spotlight.dbpedia.org:80/";
private static double CONFIDENCE = 0.0;
private static int SUPPORT = 0;
private static String powered_by ="non";
private static String spotter ="CoOccurrenceBasedSelector";//"LingPipeSpotter"=Annotate all spots
//AtLeastOneNounSelector"=No verbs and adjs.
//"CoOccurrenceBasedSelector" =No 'common words'
//"NESpotter"=Only Per.,Org.,Loc.
private static String disambiguator ="Default";//Default ;Occurrences=Occurrence-centric;Document=Document-centric
private static String showScores ="yes";
#SuppressWarnings("static-access")
public void configiration(double CONFIDENCE,int SUPPORT,
String powered_by,String spotter,String disambiguator,String showScores){
this.CONFIDENCE=CONFIDENCE;
this.SUPPORT=SUPPORT;
this.powered_by=powered_by;
this.spotter=spotter;
this.disambiguator=disambiguator;
this.showScores=showScores;
}
public List<DBpediaResource> extract(Text text) throws AnnotationException {
LOG.info("Querying API.");
String spotlightResponse;
try {
String Query=API_URL + "rest/annotate/?" +
"confidence=" + CONFIDENCE
+ "&support=" + SUPPORT
+ "&spotter=" + spotter
+ "&disambiguator=" + disambiguator
+ "&showScores=" + showScores
+ "&powered_by=" + powered_by
+ "&text=" + URLEncoder.encode(text.text(), "utf-8");
LOG.info(Query);
GetMethod getMethod = new GetMethod(Query);
getMethod.addRequestHeader(new Header("Accept", "application/json"));
spotlightResponse = request(getMethod);
} catch (UnsupportedEncodingException e) {
throw new AnnotationException("Could not encode text.", e);
}
assert spotlightResponse != null;
JSONObject resultJSON = null;
JSONArray entities = null;
try {
resultJSON = new JSONObject(spotlightResponse);
entities = resultJSON.getJSONArray("Resources");
} catch (JSONException e) {
//throw new AnnotationException("Received invalid response from DBpedia Spotlight API.");
}
LinkedList<DBpediaResource> resources = new LinkedList<DBpediaResource>();
if(entities!=null)
for(int i = 0; i < entities.length(); i++) {
try {
JSONObject entity = entities.getJSONObject(i);
resources.add(
new DBpediaResource(entity.getString("#URI"),
Integer.parseInt(entity.getString("#support"))));
} catch (JSONException e) {
LOG.error("JSON exception "+e);
}
}
return resources;
}
}
second class
/**
* #author pablomendes
*/
public abstract class AnnotationClient {
public Logger LOG = Logger.getLogger(this.getClass());
private List<String> RES = new ArrayList<String>();
// Create an instance of HttpClient.
private static HttpClient client = new HttpClient();
public List<String> getResu(){
return RES;
}
public String request(HttpMethod method) throws AnnotationException {
String response = null;
// Provide custom retry handler is necessary
method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
new DefaultHttpMethodRetryHandler(3, false));
try {
// Execute the method.
int statusCode = client.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
LOG.error("Method failed: " + method.getStatusLine());
}
// Read the response body.
byte[] responseBody = method.getResponseBody(); //TODO Going to buffer response body of large or unknown size. Using getResponseBodyAsStream instead is recommended.
// Deal with the response.
// Use caution: ensure correct character encoding and is not binary data
response = new String(responseBody);
} catch (HttpException e) {
LOG.error("Fatal protocol violation: " + e.getMessage());
throw new AnnotationException("Protocol error executing HTTP request.",e);
} catch (IOException e) {
LOG.error("Fatal transport error: " + e.getMessage());
LOG.error(method.getQueryString());
throw new AnnotationException("Transport error executing HTTP request.",e);
} finally {
// Release the connection.
method.releaseConnection();
}
return response;
}
protected static String readFileAsString(String filePath) throws java.io.IOException{
return readFileAsString(new File(filePath));
}
protected static String readFileAsString(File file) throws IOException {
byte[] buffer = new byte[(int) file.length()];
#SuppressWarnings("resource")
BufferedInputStream f = new BufferedInputStream(new FileInputStream(file));
f.read(buffer);
return new String(buffer);
}
static abstract class LineParser {
public abstract String parse(String s) throws ParseException;
static class ManualDatasetLineParser extends LineParser {
public String parse(String s) throws ParseException {
return s.trim();
}
}
static class OccTSVLineParser extends LineParser {
public String parse(String s) throws ParseException {
String result = s;
try {
result = s.trim().split("\t")[3];
} catch (ArrayIndexOutOfBoundsException e) {
throw new ParseException(e.getMessage(), 3);
}
return result;
}
}
}
public void saveExtractedEntitiesSet(String Question, LineParser parser, int restartFrom) throws Exception {
String text = Question;
int i=0;
//int correct =0 ; int error = 0;int sum = 0;
for (String snippet: text.split("\n")) {
String s = parser.parse(snippet);
if (s!= null && !s.equals("")) {
i++;
if (i<restartFrom) continue;
List<DBpediaResource> entities = new ArrayList<DBpediaResource>();
try {
entities = extract(new Text(snippet.replaceAll("\\s+"," ")));
System.out.println(entities.get(0).getFullUri());
} catch (AnnotationException e) {
// error++;
LOG.error(e);
e.printStackTrace();
}
for (DBpediaResource e: entities) {
RES.add(e.uri());
}
}
}
}
public abstract List<DBpediaResource> extract(Text text) throws AnnotationException;
public void evaluate(String Question) throws Exception {
evaluateManual(Question,0);
}
public void evaluateManual(String Question, int restartFrom) throws Exception {
saveExtractedEntitiesSet(Question,new LineParser.ManualDatasetLineParser(), restartFrom);
}
}
main()
public static void main(String[] args) throws Exception {
String Question ="Is the Amazon river longer than the Nile River?";
db c = new db ();
c.configiration(0.0, 0, "non", "CoOccurrenceBasedSelector", "Default", "yes");
System.out.println("resource : "+c.getResu());
}

I just add one little fix for your answer.
Your code is running, if you add the evaluate method call:
public static void main(String[] args) throws Exception {
String question = "Is the Amazon river longer than the Nile River?";
db c = new db ();
c.configiration(0.0, 0, "non", "CoOccurrenceBasedSelector", "Default", "yes");
c.evaluate(question);
System.out.println("resource : "+c.getResu());
}
Lamine

In the request method of the second class (AnnotationClient) in Adel's answer, the author Pablo Mendes hasn't finished
TODO Going to buffer response body of large or unknown size. Using getResponseBodyAsStream instead is recommended.
which is an annoying warning that needs to be removed by replacing
byte[] responseBody = method.getResponseBody(); //TODO Going to buffer response body of large or unknown size. Using getResponseBodyAsStream instead is recommended.
// Deal with the response.
// Use caution: ensure correct character encoding and is not binary data
response = new String(responseBody);
with
Reader in = new InputStreamReader(method.getResponseBodyAsStream(), "UTF-8");
StringWriter writer = new StringWriter();
org.apache.commons.io.IOUtils.copy(in, writer);
response = writer.toString();

How to download videos from youtube on java? [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 3 years ago.
Improve this question
how to download videos from youtube on Java?
need class(or piece of code) which describes how to do that.
Thank you.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.logging.Formatter;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
public class JavaYoutubeDownloader {
public static String newline = System.getProperty("line.separator");
private static final Logger log = Logger.getLogger(JavaYoutubeDownloader.class.getCanonicalName());
private static final Level defaultLogLevelSelf = Level.FINER;
private static final Level defaultLogLevel = Level.WARNING;
private static final Logger rootlog = Logger.getLogger("");
private static final String scheme = "http";
private static final String host = "www.youtube.com";
private static final Pattern commaPattern = Pattern.compile(",");
private static final Pattern pipePattern = Pattern.compile("\\|");
private static final char[] ILLEGAL_FILENAME_CHARACTERS = { '/', '\n', '\r', '\t', '\0', '\f', '`', '?', '*', '\\', '<', '>', '|', '\"', ':' };
private static void usage(String error) {
if (error != null) {
System.err.println("Error: " + error);
}
System.err.println("usage: JavaYoutubeDownload VIDEO_ID DESTINATION_DIRECTORY");
System.exit(-1);
}
public static void main(String[] args) {
if (args == null || args.length == 0) {
usage("Missing video id. Extract from http://www.youtube.com/watch?v=VIDEO_ID");
}
try {
setupLogging();
log.fine("Starting");
String videoId = null;
String outdir = ".";
// TODO Ghetto command line parsing
if (args.length == 1) {
videoId = args[0];
} else if (args.length == 2) {
videoId = args[0];
outdir = args[1];
}
int format = 18; // http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
String encoding = "UTF-8";
String userAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13";
File outputDir = new File(outdir);
String extension = getExtension(format);
play(videoId, format, encoding, userAgent, outputDir, extension);
} catch (Throwable t) {
t.printStackTrace();
}
log.fine("Finished");
}
private static String getExtension(int format) {
// TODO
return "mp4";
}
private static void play(String videoId, int format, String encoding, String userAgent, File outputdir, String extension) throws Throwable {
log.fine("Retrieving " + videoId);
List<NameValuePair> qparams = new ArrayList<NameValuePair>();
qparams.add(new BasicNameValuePair("video_id", videoId));
qparams.add(new BasicNameValuePair("fmt", "" + format));
URI uri = getUri("get_video_info", qparams);
CookieStore cookieStore = new BasicCookieStore();
HttpContext localContext = new BasicHttpContext();
localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);
HttpClient httpclient = new DefaultHttpClient();
HttpGet httpget = new HttpGet(uri);
httpget.setHeader("User-Agent", userAgent);
log.finer("Executing " + uri);
HttpResponse response = httpclient.execute(httpget, localContext);
HttpEntity entity = response.getEntity();
if (entity != null && response.getStatusLine().getStatusCode() == 200) {
InputStream instream = entity.getContent();
String videoInfo = getStringFromInputStream(encoding, instream);
if (videoInfo != null && videoInfo.length() > 0) {
List<NameValuePair> infoMap = new ArrayList<NameValuePair>();
URLEncodedUtils.parse(infoMap, new Scanner(videoInfo), encoding);
String token = null;
String downloadUrl = null;
String filename = videoId;
for (NameValuePair pair : infoMap) {
String key = pair.getName();
String val = pair.getValue();
log.finest(key + "=" + val);
if (key.equals("token")) {
token = val;
} else if (key.equals("title")) {
filename = val;
} else if (key.equals("fmt_url_map")) {
String[] formats = commaPattern.split(val);
for (String fmt : formats) {
String[] fmtPieces = pipePattern.split(fmt);
if (fmtPieces.length == 2) {
// in the end, download somethin!
downloadUrl = fmtPieces[1];
int pieceFormat = Integer.parseInt(fmtPieces[0]);
if (pieceFormat == format) {
// found what we want
downloadUrl = fmtPieces[1];
break;
}
}
}
}
}
filename = cleanFilename(filename);
if (filename.length() == 0) {
filename = videoId;
} else {
filename += "_" + videoId;
}
filename += "." + extension;
File outputfile = new File(outputdir, filename);
if (downloadUrl != null) {
downloadWithHttpClient(userAgent, downloadUrl, outputfile);
}
}
}
}
private static void downloadWithHttpClient(String userAgent, String downloadUrl, File outputfile) throws Throwable {
HttpGet httpget2 = new HttpGet(downloadUrl);
httpget2.setHeader("User-Agent", userAgent);
log.finer("Executing " + httpget2.getURI());
HttpClient httpclient2 = new DefaultHttpClient();
HttpResponse response2 = httpclient2.execute(httpget2);
HttpEntity entity2 = response2.getEntity();
if (entity2 != null && response2.getStatusLine().getStatusCode() == 200) {
long length = entity2.getContentLength();
InputStream instream2 = entity2.getContent();
log.finer("Writing " + length + " bytes to " + outputfile);
if (outputfile.exists()) {
outputfile.delete();
}
FileOutputStream outstream = new FileOutputStream(outputfile);
try {
byte[] buffer = new byte[2048];
int count = -1;
while ((count = instream2.read(buffer)) != -1) {
outstream.write(buffer, 0, count);
}
outstream.flush();
} finally {
outstream.close();
}
}
}
private static String cleanFilename(String filename) {
for (char c : ILLEGAL_FILENAME_CHARACTERS) {
filename = filename.replace(c, '_');
}
return filename;
}
private static URI getUri(String path, List<NameValuePair> qparams) throws URISyntaxException {
URI uri = URIUtils.createURI(scheme, host, -1, "/" + path, URLEncodedUtils.format(qparams, "UTF-8"), null);
return uri;
}
private static void setupLogging() {
changeFormatter(new Formatter() {
#Override
public String format(LogRecord arg0) {
return arg0.getMessage() + newline;
}
});
explicitlySetAllLogging(Level.FINER);
}
private static void changeFormatter(Formatter formatter) {
Handler[] handlers = rootlog.getHandlers();
for (Handler handler : handlers) {
handler.setFormatter(formatter);
}
}
private static void explicitlySetAllLogging(Level level) {
rootlog.setLevel(Level.ALL);
for (Handler handler : rootlog.getHandlers()) {
handler.setLevel(defaultLogLevelSelf);
}
log.setLevel(level);
rootlog.setLevel(defaultLogLevel);
}
private static String getStringFromInputStream(String encoding, InputStream instream) throws UnsupportedEncodingException, IOException {
Writer writer = new StringWriter();
char[] buffer = new char[1024];
try {
Reader reader = new BufferedReader(new InputStreamReader(instream, encoding));
int n;
while ((n = reader.read(buffer)) != -1) {
writer.write(buffer, 0, n);
}
} finally {
instream.close();
}
String result = writer.toString();
return result;
}
}
/**
* <pre>
* Exploded results from get_video_info:
*
* fexp=90...
* allow_embed=1
* fmt_stream_map=35|http://v9.lscache8...
* fmt_url_map=35|http://v9.lscache8...
* allow_ratings=1
* keywords=Stefan Molyneux,Luke Bessey,anarchy,stateless society,giant stone cow,the story of our unenslavement,market anarchy,voluntaryism,anarcho capitalism
* track_embed=0
* fmt_list=35/854x480/9/0/115,34/640x360/9/0/115,18/640x360/9/0/115,5/320x240/7/0/0
* author=lukebessey
* muted=0
* length_seconds=390
* plid=AA...
* ftoken=null
* status=ok
* watermark=http://s.ytimg.com/yt/swf/logo-vfl_bP6ud.swf,http://s.ytimg.com/yt/swf/hdlogo-vfloR6wva.swf
* timestamp=12...
* has_cc=False
* fmt_map=35/854x480/9/0/115,34/640x360/9/0/115,18/640x360/9/0/115,5/320x240/7/0/0
* leanback_module=http://s.ytimg.com/yt/swfbin/leanback_module-vflJYyeZN.swf
* hl=en_US
* endscreen_module=http://s.ytimg.com/yt/swfbin/endscreen-vflk19iTq.swf
* vq=auto
* avg_rating=5.0
* video_id=S6IZP3yRJ9I
* token=vPpcFNh...
* thumbnail_url=http://i4.ytimg.com/vi/S6IZP3yRJ9I/default.jpg
* title=The Story of Our Unenslavement - Animated
* </pre>
*/

I know i am answering late. But this code may useful for some one. So i am attaching it here.
Use the following java code to download the videos from YouTube.
package com.mycompany.ytd;
import java.io.File;
import java.net.URL;
import com.github.axet.vget.VGet;
/**
*
* #author Manindar
*/
public class YTD {
public static void main(String[] args) {
try {
String url = "https://www.youtube.com/watch?v=s10ARdfQUOY";
String path = "D:\\Manindar\\YTD\\";
VGet v = new VGet(new URL(url), new File(path));
v.download();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
Add the below Dependency in your POM.XML file
<dependency>
<groupId>com.github.axet</groupId>
<artifactId>vget</artifactId>
<version>1.1.33</version>
</dependency>
Hope this will be useful.

Regarding the format (mp4 or flv) decide which URL you want to use. Then use this tutorial to download the video and save it into a local directory.

Ref :Youtube Video Download (Android/Java)
Edit 3
You can use the Lib : https://github.com/HaarigerHarald/android-youtubeExtractor
Ex :
String youtubeLink = "http://youtube.com/watch?v=xxxx";
new YouTubeExtractor(this) {
#Override
public void onExtractionComplete(SparseArray<YtFile> ytFiles, VideoMeta vMeta) {
if (ytFiles != null) {
int itag = 22;
String downloadUrl = ytFiles.get(itag).getUrl();
}
}
}.extract(youtubeLink, true, true);
They decipherSignature using :
private boolean decipherSignature(final SparseArray<String> encSignatures) throws IOException {
// Assume the functions don't change that much
if (decipherFunctionName == null || decipherFunctions == null) {
String decipherFunctUrl = "https://s.ytimg.com/yts/jsbin/" + decipherJsFileName;
BufferedReader reader = null;
String javascriptFile;
URL url = new URL(decipherFunctUrl);
HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
urlConnection.setRequestProperty("User-Agent", USER_AGENT);
try {
reader = new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
StringBuilder sb = new StringBuilder("");
String line;
while ((line = reader.readLine()) != null) {
sb.append(line);
sb.append(" ");
}
javascriptFile = sb.toString();
} finally {
if (reader != null)
reader.close();
urlConnection.disconnect();
}
if (LOGGING)
Log.d(LOG_TAG, "Decipher FunctURL: " + decipherFunctUrl);
Matcher mat = patSignatureDecFunction.matcher(javascriptFile);
if (mat.find()) {
decipherFunctionName = mat.group(1);
if (LOGGING)
Log.d(LOG_TAG, "Decipher Functname: " + decipherFunctionName);
Pattern patMainVariable = Pattern.compile("(var |\\s|,|;)" + decipherFunctionName.replace("$", "\\$") +
"(=function\\((.{1,3})\\)\\{)");
String mainDecipherFunct;
mat = patMainVariable.matcher(javascriptFile);
if (mat.find()) {
mainDecipherFunct = "var " + decipherFunctionName + mat.group(2);
} else {
Pattern patMainFunction = Pattern.compile("function " + decipherFunctionName.replace("$", "\\$") +
"(\\((.{1,3})\\)\\{)");
mat = patMainFunction.matcher(javascriptFile);
if (!mat.find())
return false;
mainDecipherFunct = "function " + decipherFunctionName + mat.group(2);
}
int startIndex = mat.end();
for (int braces = 1, i = startIndex; i < javascriptFile.length(); i++) {
if (braces == 0 && startIndex + 5 < i) {
mainDecipherFunct += javascriptFile.substring(startIndex, i) + ";";
break;
}
if (javascriptFile.charAt(i) == '{')
braces++;
else if (javascriptFile.charAt(i) == '}')
braces--;
}
decipherFunctions = mainDecipherFunct;
// Search the main function for extra functions and variables
// needed for deciphering
// Search for variables
mat = patVariableFunction.matcher(mainDecipherFunct);
while (mat.find()) {
String variableDef = "var " + mat.group(2) + "={";
if (decipherFunctions.contains(variableDef)) {
continue;
}
startIndex = javascriptFile.indexOf(variableDef) + variableDef.length();
for (int braces = 1, i = startIndex; i < javascriptFile.length(); i++) {
if (braces == 0) {
decipherFunctions += variableDef + javascriptFile.substring(startIndex, i) + ";";
break;
}
if (javascriptFile.charAt(i) == '{')
braces++;
else if (javascriptFile.charAt(i) == '}')
braces--;
}
}
// Search for functions
mat = patFunction.matcher(mainDecipherFunct);
while (mat.find()) {
String functionDef = "function " + mat.group(2) + "(";
if (decipherFunctions.contains(functionDef)) {
continue;
}
startIndex = javascriptFile.indexOf(functionDef) + functionDef.length();
for (int braces = 0, i = startIndex; i < javascriptFile.length(); i++) {
if (braces == 0 && startIndex + 5 < i) {
decipherFunctions += functionDef + javascriptFile.substring(startIndex, i) + ";";
break;
}
if (javascriptFile.charAt(i) == '{')
braces++;
else if (javascriptFile.charAt(i) == '}')
braces--;
}
}
if (LOGGING)
Log.d(LOG_TAG, "Decipher Function: " + decipherFunctions);
decipherViaWebView(encSignatures);
if (CACHING) {
writeDeciperFunctToChache();
}
} else {
return false;
}
} else {
decipherViaWebView(encSignatures);
}
return true;
}
Now with use of this library High Quality Videos Lossing Audio so i use the MediaMuxer for Murging Audio and Video for Final Output
Edit 1
https://stackoverflow.com/a/15240012/9909365
Why the previous answer not worked
Pattern p2 = Pattern.compile("sig=(.*?)[&]");
Matcher m2 = p2.matcher(url);
String sig = null;
if (m2.find()) {
sig = m2.group(1);
}
As of November 2016, this is a little rough around the edges, but
displays the basic principle. The url_encoded_fmt_stream_map today
does not have a space after the colon (better make this optional) and
"sig" has been changed to "signature"
and while i am debuging the code i found the new keyword its
signature&s in many video's URL
here edited answer
private static final HashMap<String, Meta> typeMap = new HashMap<String, Meta>();
initTypeMap(); call first
class Meta {
public String num;
public String type;
public String ext;
Meta(String num, String ext, String type) {
this.num = num;
this.ext = ext;
this.type = type;
}
}
class Video {
public String ext = "";
public String type = "";
public String url = "";
Video(String ext, String type, String url) {
this.ext = ext;
this.type = type;
this.url = url;
}
}
public ArrayList<Video> getStreamingUrisFromYouTubePage(String ytUrl)
throws IOException {
if (ytUrl == null) {
return null;
}
// Remove any query params in query string after the watch?v=<vid> in
// e.g.
// http://www.youtube.com/watch?v=0RUPACpf8Vs&feature=youtube_gdata_player
int andIdx = ytUrl.indexOf('&');
if (andIdx >= 0) {
ytUrl = ytUrl.substring(0, andIdx);
}
// Get the HTML response
/* String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0.1)";*/
/* HttpClient client = new DefaultHttpClient();
client.getParams().setParameter(CoreProtocolPNames.USER_AGENT,
userAgent);
HttpGet request = new HttpGet(ytUrl);
HttpResponse response = client.execute(request);*/
String html = "";
HttpsURLConnection c = (HttpsURLConnection) new URL(ytUrl).openConnection();
c.setRequestMethod("GET");
c.setDoOutput(true);
c.connect();
InputStream in = c.getInputStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(in));
StringBuilder str = new StringBuilder();
String line = null;
while ((line = reader.readLine()) != null) {
str.append(line.replace("\\u0026", "&"));
}
in.close();
html = str.toString();
// Parse the HTML response and extract the streaming URIs
if (html.contains("verify-age-thumb")) {
Log.e("Downloader", "YouTube is asking for age verification. We can't handle that sorry.");
return null;
}
if (html.contains("das_captcha")) {
Log.e("Downloader", "Captcha found, please try with different IP address.");
return null;
}
Pattern p = Pattern.compile("stream_map\":\"(.*?)?\"");
// Pattern p = Pattern.compile("/stream_map=(.[^&]*?)\"/");
Matcher m = p.matcher(html);
List<String> matches = new ArrayList<String>();
while (m.find()) {
matches.add(m.group());
}
if (matches.size() != 1) {
Log.e("Downloader", "Found zero or too many stream maps.");
return null;
}
String urls[] = matches.get(0).split(",");
HashMap<String, String> foundArray = new HashMap<String, String>();
for (String ppUrl : urls) {
String url = URLDecoder.decode(ppUrl, "UTF-8");
Log.e("URL","URL : "+url);
Pattern p1 = Pattern.compile("itag=([0-9]+?)[&]");
Matcher m1 = p1.matcher(url);
String itag = null;
if (m1.find()) {
itag = m1.group(1);
}
Pattern p2 = Pattern.compile("signature=(.*?)[&]");
Matcher m2 = p2.matcher(url);
String sig = null;
if (m2.find()) {
sig = m2.group(1);
} else {
Pattern p23 = Pattern.compile("signature&s=(.*?)[&]");
Matcher m23 = p23.matcher(url);
if (m23.find()) {
sig = m23.group(1);
}
}
Pattern p3 = Pattern.compile("url=(.*?)[&]");
Matcher m3 = p3.matcher(ppUrl);
String um = null;
if (m3.find()) {
um = m3.group(1);
}
if (itag != null && sig != null && um != null) {
Log.e("foundArray","Adding Value");
foundArray.put(itag, URLDecoder.decode(um, "UTF-8") + "&"
+ "signature=" + sig);
}
}
Log.e("foundArray","Size : "+foundArray.size());
if (foundArray.size() == 0) {
Log.e("Downloader", "Couldn't find any URLs and corresponding signatures");
return null;
}
ArrayList<Video> videos = new ArrayList<Video>();
for (String format : typeMap.keySet()) {
Meta meta = typeMap.get(format);
if (foundArray.containsKey(format)) {
Video newVideo = new Video(meta.ext, meta.type,
foundArray.get(format));
videos.add(newVideo);
Log.d("Downloader", "YouTube Video streaming details: ext:" + newVideo.ext
+ ", type:" + newVideo.type + ", url:" + newVideo.url);
}
}
return videos;
}
private class YouTubePageStreamUriGetter extends AsyncTask<String, String, ArrayList<Video>> {
ProgressDialog progressDialog;
#Override
protected void onPreExecute() {
super.onPreExecute();
progressDialog = ProgressDialog.show(webViewActivity.this, "",
"Connecting to YouTube...", true);
}
#Override
protected ArrayList<Video> doInBackground(String... params) {
ArrayList<Video> fVideos = new ArrayList<>();
String url = params[0];
try {
ArrayList<Video> videos = getStreamingUrisFromYouTubePage(url);
/* Log.e("Downloader","Size of Video : "+videos.size());*/
if (videos != null && !videos.isEmpty()) {
for (Video video : videos)
{
Log.e("Downloader", "ext : " + video.ext);
if (video.ext.toLowerCase().contains("mp4") || video.ext.toLowerCase().contains("3gp") || video.ext.toLowerCase().contains("flv") || video.ext.toLowerCase().contains("webm")) {
ext = video.ext.toLowerCase();
fVideos.add(new Video(video.ext,video.type,video.url));
}
}
return fVideos;
}
} catch (Exception e) {
e.printStackTrace();
Log.e("Downloader", "Couldn't get YouTube streaming URL", e);
}
Log.e("Downloader", "Couldn't get stream URI for " + url);
return null;
}
#Override
protected void onPostExecute(ArrayList<Video> streamingUrl) {
super.onPostExecute(streamingUrl);
progressDialog.dismiss();
if (streamingUrl != null) {
if (!streamingUrl.isEmpty()) {
//Log.e("Steaming Url", "Value : " + streamingUrl);
for (int i = 0; i < streamingUrl.size(); i++) {
Video fX = streamingUrl.get(i);
Log.e("Founded Video", "URL : " + fX.url);
Log.e("Founded Video", "TYPE : " + fX.type);
Log.e("Founded Video", "EXT : " + fX.ext);
}
//new ProgressBack().execute(new String[]{streamingUrl, filename + "." + ext});
}
}
}
}
public void initTypeMap()
{
typeMap.put("13", new Meta("13", "3GP", "Low Quality - 176x144"));
typeMap.put("17", new Meta("17", "3GP", "Medium Quality - 176x144"));
typeMap.put("36", new Meta("36", "3GP", "High Quality - 320x240"));
typeMap.put("5", new Meta("5", "FLV", "Low Quality - 400x226"));
typeMap.put("6", new Meta("6", "FLV", "Medium Quality - 640x360"));
typeMap.put("34", new Meta("34", "FLV", "Medium Quality - 640x360"));
typeMap.put("35", new Meta("35", "FLV", "High Quality - 854x480"));
typeMap.put("43", new Meta("43", "WEBM", "Low Quality - 640x360"));
typeMap.put("44", new Meta("44", "WEBM", "Medium Quality - 854x480"));
typeMap.put("45", new Meta("45", "WEBM", "High Quality - 1280x720"));
typeMap.put("18", new Meta("18", "MP4", "Medium Quality - 480x360"));
typeMap.put("22", new Meta("22", "MP4", "High Quality - 1280x720"));
typeMap.put("37", new Meta("37", "MP4", "High Quality - 1920x1080"));
typeMap.put("33", new Meta("38", "MP4", "High Quality - 4096x230"));
}
Edit 2:
Some time This Code Not worked proper
Same-origin policy
https://en.wikipedia.org/wiki/Same-origin_policy
https://en.wikipedia.org/wiki/Cross-origin_resource_sharing
problem of Same-origin policy. Essentially, you cannot download this file from www.youtube.com because they are different domains. A workaround of this problem is [CORS][1].
Ref : https://superuser.com/questions/773719/how-do-all-of-these-save-video-from-youtube-services-work/773998#773998
url_encoded_fmt_stream_map // traditional: contains video and audio stream
adaptive_fmts // DASH: contains video or audio stream
Each of these is a comma separated array of what I would call "stream objects". Each "stream object" will contain values like this
url // direct HTTP link to a video
itag // code specifying the quality
s // signature, security measure to counter downloading
Each URL will be encoded so you will need to decode them. Now the tricky part.
YouTube has at least 3 security levels for their videos
unsecured // as expected, you can download these with just the unencoded URL
s // see below
RTMPE // uses "rtmpe://" protocol, no known method for these
The RTMPE videos are typically used on official full length movies, and are protected with SWF Verification Type 2. This has been around since 2011 and has yet to be reverse engineered.
The type "s" videos are the most difficult that can actually be downloaded. You will typcially see these on VEVO videos and the like. They start with a signature such as
AA5D05FA7771AD4868BA4C977C3DEAAC620DE020E.0F421820F42978A1F8EAFCDAC4EF507DB5
Then the signature is scrambled with a function like this
function mo(a) {
a = a.split("");
a = lo.rw(a, 1);
a = lo.rw(a, 32);
a = lo.IC(a, 1);
a = lo.wS(a, 77);
a = lo.IC(a, 3);
a = lo.wS(a, 77);
a = lo.IC(a, 3);
a = lo.wS(a, 44);
return a.join("")
}
This function is dynamic, it typically changes every day. To make it more difficult the function is hosted at a URL such as
http://s.ytimg.com/yts/jsbin/html5player-en_US-vflycBCEX.js
this introduces the problem of Same-origin policy. Essentially, you cannot download this file from www.youtube.com because they are different domains. A workaround of this problem is CORS. With CORS, s.ytimg.com could add this header
Access-Control-Allow-Origin: http://www.youtube.com
and it would allow the JavaScript to download from www.youtube.com. Of course they do not do this. A workaround for this workaround is to use a CORS proxy. This is a proxy that responds with the following header to all requests
Access-Control-Allow-Origin: *
So, now that you have proxied your JS file, and used the function to scramble the signature, you can use that in the querystring to download a video.

ytd2 is a fully functional YouTube video downloader. Check out its source code if you want to see how it's done.
Alternatively, you can also call an external process like youtube-dl to do the job. This is probably the easiest solution but it isn't in "pure" Java.

Develop Reference

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Crawling amazon.com - java

Well, don't be rude and crawl them then. Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised if they ban you if you go places they don't want you to go.

Related

How to access the repository names in GitHub with Java?

Apache HttpClient doesn't get content of pages using pagination links. I get 200 status but html has no content

WebCrawler with recursion

using dbpedia spotlight in java or scala

How to download videos from youtube on java? [closed]

Categories

Resources