// HtmlUnit实现Ajax网络爬虫(转)

/**
 *网上关于网络爬虫实现方式有很多种,但是很多都不支持Ajax,李兄说:模拟才是王道。确实,
 *如果能够模拟一个没有界面的浏览器,还有什么不能做到的呢?关于解析Ajax网站的框架也有不少,
 *我选择了HtmlUnit,官方网站:http://htmlunit.sourceforge.net/ ,HtmlUnit可以说是一个Java
 *版本的无界面浏览器,几乎无所不能,而且很多东西都封装得特别完美。这是这几天来积累下来的心血,记录一下
 */
package com.lanyotech.www.wordbank;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.List;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ScriptResult;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlOption;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;

public class WorldBankCrawl {

	private static String TARGET_URL = “http://databank.worldbank.org/ddp/home.do”;

	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
		//模拟一个浏览器
		WebClient webClient = new WebClient();
		//设置webClient的相关参数
		webClient.setJavaScriptEnabled(true);
		webClient.setCssEnabled(false);
		webClient.setAjaxController(new NicelyResynchronizingAjaxController());
		webClient.setTimeout(35000);
		webClient.setThrowExceptionOnScriptError(false);
		//模拟浏览器打开一个目标网址
		HtmlPage rootPage= webClient.getPage(TARGET_URL);
		//获取第一个数据库
		HtmlSelect hs = (HtmlSelect) rootPage.getElementById("lstCubes");
		//按要求选择第一个数据库
		hs.getOption(0).setSelected(true);
		//模拟点击Next按钮,跳转到第二个页面
		System.out.println("正在跳转…");
		//执行按钮出发的js事件
		ScriptResult sr = rootPage.executeJavaScript("javascript:setCubeData(2,-1,4,’/ddp');");

		//跳转到第二个页面,选择国家
		HtmlPage countrySelect = (HtmlPage) sr.getNewPage();
		//获得包含全部国家信息的选择框页面
		HtmlPage framePage=(HtmlPage)countrySelect.getFrameByName("frmTree1″).getEnclosedPage();
		//获得selectAll按钮,触发js事件
		framePage.executeJavaScript("javascript:TransferListAll('countrylst’,'countrylstselected’,'no');SetSelectedCount('countrylstselected’,'tdcount');");
		//获取Next按钮,触发js事件
		ScriptResult electricityScriptResult = framePage.executeJavaScript("javascript:wrapperSetCube('/ddp')");

		System.out.println("正在跳转…");
		//跳转到下一个页面electricitySelect
		HtmlPage electricitySelect = (HtmlPage) electricityScriptResult.getNewPage();
		//获得electricity选择的iframe
		HtmlPage electricityFrame = (HtmlPage) electricitySelect.getFrameByName("frmTree1″).getEnclosedPage();
		//获得选择框
		HtmlSelect seriesSelect = (HtmlSelect) electricityFrame.getElementById("countrylst");
		//获得所有的选择框内容
		List optionList = seriesSelect.getOptions();
		//将指定的选项选中
		optionList.get(1).setSelected(true);
		//模拟点击select按钮
		electricityFrame.executeJavaScript("javascript:TransferList('countrylst’,'countrylstselected’,'no');SetSelectedCount('countrylstselected’,'tdcount');");
		//获取选中后,下面的选择框
		HtmlSelect electricitySelected = (HtmlSelect) electricityFrame.getElementById("countrylstselected");
		List list = electricitySelected.getOptions();
		//模拟点击Next按钮,跳转到选择时间的页面
		ScriptResult timeScriptResult = electricityFrame.executeJavaScript("javascript:wrapperSetCube('/ddp')");

		System.out.println("正在跳转…");
		HtmlPage timeSelectPage = (HtmlPage) timeScriptResult.getNewPage();
		//获取选中时间的选择框
		timeSelectPage = (HtmlPage) timeSelectPage.getFrameByName("frmTree1″).getEnclosedPage();
		//选中所有的时间
		timeSelectPage.executeJavaScript("javascript:TransferListAll('countrylst’,'countrylstselected’,'no');SetSelectedCount('countrylstselected’,'tdcount');");
		//点击Next按钮
		ScriptResult exportResult = timeSelectPage.executeJavaScript("javascript:wrapperSetCube('/ddp')");

		System.out.println("正在跳转…");
		//转到export页面
		HtmlPage exportPage = (HtmlPage) exportResult.getNewPage();
		//点击页面上的Export按钮,进入下载页面
		ScriptResult downResult = exportPage.executeJavaScript("javascript:exportData('/ddp’ ,’EXT_BULK’ ,’WDI_Time=51||WDI_Series=1||WDI_Ctry=244||’ );");

		System.out.println("正在跳转…");
		HtmlPage downLoadPage = (HtmlPage) downResult.getNewPage();
		//点击Excel图标,开始下载
		ScriptResult downLoadResult = downLoadPage.executeJavaScript("javascript:exportData('/ddp’,'BULKEXCEL');");
		//下载Excel文件
		InputStream is = downLoadResult.getNewPage().getWebResponse().getContentAsStream();

		OutputStream fos = new FileOutputStream("d://test.xls");
		byte[] buffer=new byte[1024*30];
		int len=-1;
		while((len=is.read(buffer))>0){
			fos.write(buffer, 0, len);
		}
		fos.close();
		fos.close();
		System.out.println("Success!");
	}
}

// 相关推荐