LoveTech Bots

LoveTech Google Bots

Summary

Selenium WebDriver allows us to easily crawl Google and YouTube using Java. Here are some convenience functions for your crawling journey.

Install Selenium WebDriver

Download Eclipse: https://www.eclipse.org/downloads/packages/eclipse-ide-java-developers/mars2

Download the Java JDK: http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html

Download ChromeDriver.exe: https://sites.google.com/a/chromium.org/chromedriver/downloads

Download Selenium Java jar: http://www.seleniumhq.org/download/

Download JSON jar: https://mvnrepository.com/artifact/org.json/json/20170516

Common Bot Methods

These common methods are used by many of our bots:

/**
 * Runs a JavaScript snippet in the page currently loaded by {@code driver}.
 *
 * <p>The page's {@code body} element is looked up first and handed to the script as its
 * first argument (reachable from the script as {@code arguments[0]}).
 *
 * @param driver the browser session; it is cast to {@link JavascriptExecutor}
 * @param script the JavaScript source to execute; use {@code return} to hand a value back
 * @return the script's return value rendered with {@code toString()}, or {@code null}
 *         when the script returns nothing
 */
public static String runJavascript(WebDriver driver, String script) {
	WebElement body = driver.findElement(By.cssSelector("body"));
	Object result = ((JavascriptExecutor) driver).executeScript(script, body);
	// executeScript yields Long/Boolean/List/... for non-string values, so a blind
	// (String) cast would fail with ClassCastException; string and null results are
	// returned exactly as before.
	return result == null ? null : result.toString();
}

Crawl Google Search:

import java.io.File;
import java.io.PrintWriter;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.support.ui.ExpectedConditions;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Crawls Google search results for a phrase and collects them as JSON objects with the
 * keys {@code title}, {@code description} and {@code href}.
 *
 * <p>The class calls {@code runJavascript} and {@code loadjQuery}, which are not defined
 * here; they have to be pasted in from the common bot methods.
 */
public class GoogleSearch {

	/** Highest result page that {@link #main} visits; page 1 is always read. */
	private static final int LAST_PAGE = 9;

	/**
	 * Searches for a fixed phrase, walks the result pages up to {@link #LAST_PAGE} and
	 * prints the number of collected results.
	 *
	 * @param args unused
	 * @throws InterruptedException declared for compatibility with existing callers
	 * @throws JSONException if the scraped result JSON cannot be parsed
	 */
	public static void main(String[] args) throws InterruptedException, JSONException {
		System.setProperty("webdriver.chrome.driver", "C:/Selenium/chromedriver.exe");

		WebDriver driver = new ChromeDriver();
		try {
			googleSearch(driver, "Let Freedom Ring");
			// The form submit returns immediately, so wait for the result entries
			// BEFORE scraping them; scraping first can read a page with no results yet.
			waitForGoogleSearch(driver);
			JSONArray results = getGoogleSearchResults(driver);
			for (int page = 2; page <= LAST_PAGE; page++) {
				if (!nextGoogleSearchPage(driver)) {
					// No further page: stop instead of re-checking on every iteration.
					break;
				}
				waitForGoogleSearch(driver);
				loadjQuery(driver);
				JSONArray newPageResults = getGoogleSearchResults(driver);
				for (int j = 0; j < newPageResults.length(); j++) {
					results.put(newPageResults.getJSONObject(j));
				}
			}
			System.out.println(results.length());
		} finally {
			// Always end the browser session, even when crawling fails half-way.
			driver.quit();
		}
	}

	/**
	 * Opens the Google home page, types {@code searchPhrase} into the search box and
	 * submits the search form.
	 *
	 * @param driver the browser session to drive
	 * @param searchPhrase the text to search for
	 */
	public static void googleSearch(WebDriver driver, String searchPhrase) {
		driver.get("https://google.com");
		loadjQuery(driver);
		runJavascript(driver, "$('#lst-ib').trigger(\"focus\").val('" + escapeForJsString(searchPhrase) + "');f.submit();");
	}

	/**
	 * Escapes {@code text} so it can be embedded in a single-quoted JavaScript string
	 * literal without terminating the literal or breaking the script.
	 */
	private static String escapeForJsString(String text) {
		// String.valueOf keeps the previous behavior for null ("null" is searched for).
		return String.valueOf(text)
				.replace("\\", "\\\\")
				.replace("'", "\\'")
				.replace("\n", "\\n")
				.replace("\r", "\\r");
	}

	/**
	 * Blocks for up to 10 seconds until an element with class {@code rc} (a search
	 * result entry) is visible.
	 */
	private static void waitForGoogleSearch(WebDriver driver) {
		WebDriverWait wait = new WebDriverWait(driver, 10);
		wait.until(ExpectedConditions.visibilityOfElementLocated(By.className("rc")));
	}

	/**
	 * Scrapes every {@code .rc} entry of the current page.
	 *
	 * @param driver the browser session showing a result page
	 * @return one JSON object per entry with the keys {@code title}, {@code description}
	 *         and {@code href}
	 * @throws JSONException if the JSON produced in the page cannot be parsed
	 */
	public static JSONArray getGoogleSearchResults(WebDriver driver) throws JSONException {
		loadjQuery(driver);
		String resultJSON = runJavascript(driver, "var results={results:[]};$('.rc').each(function() { var link = $(this).find('a'); var addObj = {title:link.text(), description:$(this).find('.s').html(), href:link.attr('href')}; results.results.push(addObj);});return JSON.stringify(results);");
		return new JSONObject(resultJSON).getJSONArray("results");
	}

	//Returns true if navigated to next page, false if no next page exists
	private static boolean nextGoogleSearchPage(WebDriver driver) {
		if (driver.findElements(By.id("pnnext")).size() != 1) {
			return false;
		}
		String nextUrl = driver.findElement(By.id("pnnext")).getAttribute("href");
		// Navigate through the driver rather than assigning window.location in the page:
		// the assignment returns at once while the old results are still visible, so the
		// following wait could succeed on the OLD page and scrape it a second time.
		driver.get(nextUrl);
		return true;
	}

	//See Common Bot Methods
}

YouTube View Bot:

import java.io.File;
import java.io.PrintWriter;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.interactions.Actions;

import java.util.ArrayList;
/**
 * Opens a YouTube video in several Chrome windows at once, keeps them open for roughly
 * the length of the video and then shuts every window down.
 *
 * <p>The class calls {@code runJavascript}, which is not defined here; it has to be
 * pasted in from the common bot methods.
 */
public class YouTubeBot {
	/** Longest video duration seen so far, in milliseconds. */
	public static int maxVidMillis = 0;

	/**
	 * Plays a fixed video in four browser windows.
	 *
	 * @param args unused
	 * @throws InterruptedException if the thread is interrupted while waiting for playback
	 */
	public static void main(String[] args) throws InterruptedException {
		System.setProperty("webdriver.chrome.driver", "C:/Selenium/chromedriver.exe");
		String vidUrl = "https://www.youtube.com/watch?v=CDII5dZxDpA";
		openVideoMultipleTimes(vidUrl, 4);
	}


	/**
	 * Opens {@code numToOpen} windows playing {@code vidUrl}, sleeps, then quits them all.
	 *
	 * @param vidUrl the video page to open
	 * @param numToOpen how many browser windows to start
	 * @throws InterruptedException if the sleep is interrupted
	 */
	private static void openVideoMultipleTimes(String vidUrl, int numToOpen) throws InterruptedException {
		ArrayList<WebDriver> drivers = new ArrayList<WebDriver>();
		try {
			for (int i = 0; i < numToOpen; i++) {
				drivers.add(playVideoInNewWindow(vidUrl));
			}
			// At least 5 s; otherwise the longest duration minus 1.5 s per window
			// (presumably compensating for the time spent opening the windows).
			int sleepDur = Math.max(5000, maxVidMillis - 1500*numToOpen);
			System.out.println("Sleeping... " + sleepDur);
			Thread.sleep(sleepDur);
		} finally {
			// Runs on failure and interruption too, so no browser is left behind.
			for (WebDriver driver : drivers) {
				try {
					// quit() ends the session and closes every window of that driver,
					// so a preceding close() is unnecessary.
					driver.quit();
				} catch (RuntimeException e) {
					// Best-effort cleanup: keep shutting down the remaining browsers.
					System.err.println("Failed to quit browser: " + e);
				}
			}
		}
	}


	/**
	 * Starts a new Chrome window on {@code vidUrl}, schedules a mute click, operates the
	 * player's settings menu and records the video duration in {@link #maxVidMillis} when
	 * it is the longest seen so far.
	 *
	 * @param vidUrl the video page to open
	 * @return the driver controlling the new window; the caller must quit it
	 */
	private static WebDriver playVideoInNewWindow(String vidUrl) {
		WebDriver driver = new ChromeDriver();
		try {
			driver.get(vidUrl);
			// After 2 s click the mute button if its title is 'Mute'; immediately return
			// the text of the duration label (e.g. "3:25").
			String durationStr = runJavascript(driver, "window.setTimeout(function() { var muteBtn = document.getElementsByClassName('ytp-mute-button')[0]; if(muteBtn.getAttribute('title') == 'Mute') { muteBtn.click(); } }, 2000); return document.getElementsByClassName('ytp-time-duration')[0].innerHTML;");
			// Opens the settings menu, clicks its fifth entry and then the third-from-last
			// entry (presumably selecting a low playback quality; verify against the player UI).
			runJavascript(driver, "document.getElementsByClassName('ytp-settings-button')[0].click();document.getElementsByClassName('ytp-menuitem')[4].click();var items=document.getElementsByClassName('ytp-menuitem');items[items.length-3].click();");

			int durMilis = parseDurationMillis(durationStr);
			if (durMilis > maxVidMillis) {
				maxVidMillis = durMilis;
			}
			return driver;
		} catch (RuntimeException e) {
			// The caller never receives this driver, so it has to be shut down here.
			driver.quit();
			throw e;
		}
	}

	/**
	 * Converts a {@code m:ss} or {@code h:mm:ss} label into milliseconds.
	 *
	 * @param durationStr the label text; may be {@code null}
	 * @return the duration in milliseconds, or 0 for {@code null} or any other number of
	 *         colon-separated parts
	 * @throws NumberFormatException if a part is not an integer
	 */
	private static int parseDurationMillis(String durationStr) {
		if (durationStr == null) {
			return 0;
		}
		String[] durParts = durationStr.trim().split(":");
		int seconds = 0;
		if (durParts.length == 2) {
			seconds = Integer.parseInt(durParts[0])*60 + Integer.parseInt(durParts[1]);
		} else if (durParts.length == 3) {
			// One hour is 3600 seconds (the previous factor of 360 under-counted hours).
			seconds = Integer.parseInt(durParts[0])*3600 + Integer.parseInt(durParts[1])*60 + Integer.parseInt(durParts[2]);
		}
		return 1000 * seconds;
	}
	//See Common Bot Methods

}

Explore Chakra7 Today