LoveTech Bots

Bing Bots | LoveTech

Summary

Selenium WebDriver allows us to easily crawl Bing using Java. Here are some convenience functions for your crawling journey.

Install Selenium WebDriver

Download Eclipse: https://www.eclipse.org/downloads/packages/eclipse-ide-java-developers/mars2

Download the Java JDK: http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html

Download ChromeDriver.exe: https://sites.google.com/a/chromium.org/chromedriver/downloads

Download Selenium Java jar: http://www.seleniumhq.org/download/

Download JSON jar: https://mvnrepository.com/artifact/org.json/json/20170516

Common Bot Methods

These common methods are used by many of our bots:

/**
 * Runs a JavaScript snippet in the page currently loaded in {@code driver}.
 *
 * <p>The page's {@code <body>} element is looked up first and handed to the script as
 * {@code arguments[0]}.
 *
 * @param driver the browser session; must also implement {@link JavascriptExecutor}
 * @param script JavaScript source to execute; use {@code return ...} to hand a value back
 * @return the script's return value as text, or {@code null} when the script returns nothing
 */
public static String runJavascript(WebDriver driver, String script) {
	WebElement body = driver.findElement(By.cssSelector("body"));
	Object result = ((JavascriptExecutor) driver).executeScript(script, body);
	// A blind (String) cast throws ClassCastException when the script returns a number or
	// boolean, so stringify whatever came back instead.
	return result == null ? null : String.valueOf(result);
}

Crawl Bing Image Search:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.*;
/**
 * Selenium bot that runs a Bing image search, keeps scrolling the result page for as long as
 * it grows, and can download the full-size images behind the thumbnails it finds.
 *
 * <p>The helpers {@code runJavascript}, {@code runJavascriptArrayList} and {@code loadjQuery}
 * are called but not defined here; they are expected to be pasted in at the
 * "See Common Bot Methods" marker at the bottom of the class.
 */
public class BingImageBot {

	/** Directory (relative to the working directory) that downloaded images are written to. */
	private static final String IMAGE_DIR = "images";

	/** Number of leading bytes inspected to work out an image's real content type. */
	private static final int SNIFF_BYTES = 100;

	/**
	 * Starts Chrome, searches Bing Images and keeps loading more results until the page stops
	 * growing.
	 *
	 * @param args unused
	 * @throws InterruptedException if the thread is interrupted during one of the pauses
	 */
	public static void main(String[] args) throws InterruptedException {
		System.setProperty("webdriver.chrome.driver", "C:/Selenium/chromedriver.exe");
		WebDriver driver = new ChromeDriver();

		//A) Download Bing Image Search Results
		searchBingImages(driver, "flight of the conchords meme");
		loadjQuery(driver);
		while(loadMoreContent(driver)) {
			Thread.sleep(randomPause());
			clickingImageMoreResults(driver);
			//downloadAnyNewImages(driver);
			Thread.sleep(randomPause());
		}

		//B) Related Images
		/*String url = "https://www.bing.com/images/search?view=detailV2&ccid=XKw3tHPN&id=0728444CE6B28AE71BB9D9417B3D12632514AD12&thid=OIP.XKw3tHPNWrcKwxZxADUGigEsC7&q=3D+Desktop+Wallpaper&simid=608053717454161832&selectedIndex=0&qft=+filterui%3acolor2-FGcls_BLUE&ajaxhist=0";
		driver.get(url);
		loadjQuery(driver);
		long curThumb = 0;
		long numThumbs = runJavascriptLong(driver, "var thumbs = $('.cico');return thumbs.length;");
		do {
			while(curThumb < numThumbs) {
				curThumb++;
				runJavascript(driver, "$($('.cico')[" + curThumb + "]).click()");
				Thread.sleep(333);
				runJavascript(driver, "window.scrollTo(0, document.body.scrollHeight);$('.expandButton.active').click()");
				Thread.sleep((long) (777*Math.random()+500));
				//downloadAllRelatedImages(driver);
			}
			numThumbs = runJavascriptLong(driver, "thumbs = $('.cico');return thumbs.length;");
		} while(curThumb < numThumbs);*/

		// The browser is intentionally left open here; uncomment to shut it down when done.
		//driver.close();
		//driver.quit();
	}

	/** Returns a randomised pause in milliseconds: 500 plus a random part below 777. */
	private static long randomPause() {
		return (long) (777*Math.random()+500);
	}

	/**
	 * Downloads the full-size image for every thumbnail not handled by an earlier call.
	 * Handled thumbnails are tagged in the page with the CSS class {@code downloaded}.
	 *
	 * @param driver browser session showing a Bing image result page with jQuery loaded
	 */
	private static void downloadAnyNewImages(WebDriver driver) {
		List<?> thumbLinks = runJavascriptArrayList(driver, thumbLinkScript(true));
		downloadImages(thumbLinks);
	}

	/**
	 * Scrolls to the bottom of the document and reports whether the page grew as a result.
	 *
	 * @param driver browser session with jQuery loaded
	 * @return {@code true} if the document height changed after scrolling and waiting 2 seconds
	 * @throws InterruptedException if the thread is interrupted while waiting
	 */
	private static boolean loadMoreContent(WebDriver driver) throws InterruptedException {
		String startHeight = runJavascript(driver, "var startHeight = $(document).height(); return ''+startHeight;");
		runJavascript(driver, "$(document).scrollTop($(document).height());");
		Thread.sleep(2000);
		String endHeight = runJavascript(driver, "var endHeight = $(document).height(); return ''+endHeight");
		return !(startHeight.equals(endHeight));
	}

	/** Clicks the first element with class {@code btn_seemore}, if the page has one. */
	private static void clickingImageMoreResults(WebDriver driver) {
		runJavascript(driver, "var elems=document.getElementsByClassName('btn_seemore'); if(elems.length > 0) { elems[0].click(); }");
	}

	/** Scrolls the window to the bottom of the document body. */
	private static void scrollToBottom(WebDriver driver) {
		runJavascript(driver, "window.scrollTo(0, document.body.scrollHeight);");
	}

	/**
	 * Opens Bing Images, submits {@code search} through the page's search form and waits one
	 * second.
	 *
	 * @param driver browser session; must also implement {@link JavascriptExecutor}
	 * @param search the query text, used verbatim
	 */
	private static void searchBingImages(WebDriver driver, String search) {
		driver.get("https://www.bing.com/images/discover");
		// The query is passed as a script argument rather than spliced into the source, so
		// quotes or backslashes in it cannot break (or inject into) the script.
		((JavascriptExecutor) driver).executeScript(
				"document.getElementById('sb_form_q').value = arguments[0];sb_form.submit();", search);
		try {
			Thread.sleep(1000);
		} catch (InterruptedException e) {
			// Restore the flag so callers can still see that an interrupt was requested.
			Thread.currentThread().interrupt();
		}
	}

	/**
	 * Downloads the full-size image for every thumbnail currently on the page.
	 *
	 * @param driver browser session showing a Bing image page with jQuery loaded
	 */
	public static void downloadAllRelatedImages(WebDriver driver) {
		List<?> thumbLinks = runJavascriptArrayList(driver, thumbLinkScript(false));
		downloadImages(thumbLinks);
	}

	/**
	 * Fills in and submits a login form using jQuery selectors for the {@code user} and
	 * {@code passwd} inputs and a {@code button.btn} submit button.
	 *
	 * @param driver   browser session with jQuery loaded; must implement {@link JavascriptExecutor}
	 * @param username value for the {@code user} input
	 * @param password value for the {@code passwd} input
	 */
	public static void attemptLogin(WebDriver driver, String username, String password) {
		// Credentials travel as script arguments so special characters are handled safely.
		((JavascriptExecutor) driver).executeScript(
				"$('input[name=\"user\"]').val(arguments[0]);$('input[name=\"passwd\"]').val(arguments[1]);$('button.btn[type=\"submit\"]').click()",
				username, password);
	}

	/**
	 * Builds the in-page script that collects one entry per distinct result link, each with
	 * the keys {@code thumb}, {@code href} and {@code imgUrl}.
	 *
	 * @param onlyNew when {@code true}, skip thumbnails already tagged {@code downloaded} and
	 *                tag every thumbnail visited by this run
	 */
	private static String thumbLinkScript(boolean onlyNew) {
		String selector = onlyNew ? "img.mimg:not(.downloaded)" : "img.mimg";
		String markSeen = onlyNew ? "	thumbEl.addClass('downloaded');\n" : "";
		return "var thumbLinks = []; var seenHrefs = [];var imgThumbs = $('" + selector + "');\n"
				+ "for(var i = 0; i < imgThumbs.length; i++) {\n"
				+ "	var addObj = {};\n"
				+ "	var thumbEl = $(imgThumbs[i]);\n"
				+ markSeen
				+ "	var parentLink = thumbEl.closest('a');\n"
				+ "	var meta = parentLink.attr('m');\n"
				// A thumbnail without usable metadata must not abort the whole batch.
				+ "	if(!meta) { continue; }\n"
				+ "	try { addObj.imgUrl = JSON.parse(meta).murl; } catch(e) { continue; }\n"
				+ "	addObj.thumb = thumbEl.attr('src');\n"
				+ "	addObj.href = 'https://www.bing.com/' + parentLink.attr('href');\n"
				+ "	var extraParamsInd = addObj.href.indexOf(\"&selectedIndex=\");\n"
				+ "	if(extraParamsInd != -1) {\n"
				+ "		addObj.href = addObj.href.substring(0, extraParamsInd);\n"
				+ "	}\n"
				+ "	if(seenHrefs.indexOf(addObj.href) == -1) {\n"
				+ "		seenHrefs.push(addObj.href);\n"
				+ "		thumbLinks.push(addObj);\n"
				+ "	}\n"
				+ "}; return thumbLinks;";
	}

	/**
	 * Downloads the {@code imgUrl} of each entry. This is best effort: a failure on one image
	 * is reported on stderr and the remaining images are still attempted.
	 */
	private static void downloadImages(List<?> thumbLinks) {
		if (thumbLinks == null) {
			return;
		}
		for (Object entry : thumbLinks) {
			if (!(entry instanceof Map)) {
				continue;
			}
			Object imgUrl = ((Map<?, ?>) entry).get("imgUrl");
			if (!(imgUrl instanceof String)) {
				continue;
			}
			try {
				downloadImage((String) imgUrl);
			} catch (IOException | RuntimeException e) {
				System.err.println("Could not download " + imgUrl + ": " + e);
			}
		}
	}

	/**
	 * Fetches one URL and stores it under {@link #IMAGE_DIR} if its leading bytes identify it
	 * as a JPEG, PNG or GIF; anything else is skipped. The URL is opened only once: the
	 * sniffed bytes are pushed back and the same stream is copied to disk.
	 */
	private static void downloadImage(String imgUrl) throws IOException {
		try (PushbackInputStream in = new PushbackInputStream(new URL(imgUrl).openStream(), SNIFF_BYTES)) {
			byte[] head = new byte[SNIFF_BYTES];
			int filled = 0;
			// A single read() may return fewer bytes than requested, so loop until the
			// buffer is full or the stream ends.
			while (filled < head.length) {
				int count = in.read(head, filled, head.length - filled);
				if (count < 0) {
					break;
				}
				filled += count;
			}
			if (filled == 0) {
				return;
			}
			// Push back only what was actually read, never the unused tail of the buffer.
			in.unread(head, 0, filled);

			String mimeType = URLConnection.guessContentTypeFromStream(new ByteArrayInputStream(head, 0, filled));
			String extension = extensionFor(mimeType);
			if (extension.isEmpty()) {
				return;
			}

			Path dir = Paths.get(IMAGE_DIR);
			Files.createDirectories(dir);
			Files.copy(in, uniqueTarget(dir, extension));
		}
	}

	/** Maps a sniffed MIME type to a file extension; returns "" for null or unsupported types. */
	private static String extensionFor(String mimeType) {
		if (mimeType == null) {
			return "";
		}
		switch (mimeType) {
			case "image/jpeg":
				return "jpg";
			case "image/png":
				return "png";
			case "image/gif":
				return "gif";
			default:
				return "";
		}
	}

	/**
	 * Picks a {@code <millis>.<extension>} file name in {@code dir} that is not taken yet,
	 * bumping the number when two downloads land in the same millisecond.
	 */
	private static Path uniqueTarget(Path dir, String extension) {
		long stamp = System.currentTimeMillis();
		Path candidate = dir.resolve(stamp + "." + extension);
		while (Files.exists(candidate)) {
			stamp++;
			candidate = dir.resolve(stamp + "." + extension);
		}
		return candidate;
	}

	//See Common Bot Methods
}

Explore Chakra7 Today