版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/XGL1569348/article/details/76602013
使用前先用maven导入HtmlUnit或者在HtmlUnit官网下载相关jar包支持
使用方法见主方法内,传入的ID数据见https://drive.google.com/file/d/0B9zkTpK3eXCGc01XM2xPeHFSdEU/view?usp=sharing
操作:使用了HtmlUnit进行模拟浏览器加载JS后对HTML代码进行解析从而获得相关文本信息。
注意:
- 因为从网站上抓取的,因此可能不是很稳定,但思路大同小异。(中国天气网没有什么反爬虫策略??)
- getByXPath()这个方法在使用以"//"开头的XPath规则时,会在整个网页的HTML代码中进行搜索,无论你是通过page还是HtmlDivision来调用这个方法;如果只想在某个节点内部搜索,需要使用以".//"开头的相对路径。
- 因为这个demo抓取的是纯文本信息,所以其实将page直接转化为text,再去操作字符串的话,速度上可能差距不是很大。
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Created by Xgl on 2017/8/2.
* <p>You can use this file as a lib to get hourly weather forecast in the next 24 hours</p>
* <p>Powered by HtmlUnit</p>
*/
public class Spider {
private List<String> times;
private List<String> weathers;
private List<String> temps;
private String url;
private static int TOTAL_NUM = 24;
/**
* Test
*/
public static void main(String[] args) {
Spider spider = new Spider("101230505");
if (spider.initData()){
//Then you can do everything you want.
}
}
/**
* Initialize url and lists in constructor.
*/
public Spider(String cityID) {
times = new ArrayList<>();
weathers = new ArrayList<>();
temps = new ArrayList<>();
url = new String("http://m.weather.com.cn/mhours/"+cityID+".shtml");
}
/**
* Initialize the hourly weather data( in those lists of Spider)
* @return Successful if true, failed if false
*/
public boolean initData(){
final WebClient webClient = new WebClient();
final HtmlPage page;
//set web client options
WebClientOptions webClientOptions = webClient.getOptions();
webClientOptions.setCssEnabled(false);
webClientOptions.setUseInsecureSSL(true);
webClientOptions.setActiveXNative(false);
webClientOptions.setThrowExceptionOnScriptError(false);
webClient.waitForBackgroundJavaScript(50000);
webClientOptions.setThrowExceptionOnScriptError(false);
webClientOptions.setThrowExceptionOnFailingStatusCode(false);
try {
page = webClient.getPage(url);
List<?> items = page.getByXPath("//li");//TODO:Perhaps it can be improved to reduce the wasted time because of such a violent algorithms.
for (int i = 0; i < TOTAL_NUM; i++) {
HtmlListItem htmlListItem = (HtmlListItem) items.get(i);
String time = htmlListItem.getFirstChild().toString();
times.add(time);
}
for (int i = TOTAL_NUM; i < TOTAL_NUM*2; i++) {
HtmlListItem htmlListItem = (HtmlListItem) items.get(i);
HtmlSpan htmlSpan = (HtmlSpan) htmlListItem.getElementsByTagName("span").get(1);
weathers.add(htmlSpan.getFirstChild().toString());
htmlSpan = (HtmlSpan) htmlListItem.getElementsByTagName("span").get(2);
temps.add(htmlSpan.getFirstChild().toString());
}
} catch (IOException e) {
System.err.println("error");
e.printStackTrace();
return false;
}
webClient.close();
return true;
}
/**
* Get weather type of an hour in the future within 24 hours.
* @param index
* @return
*/
public String getWeather(int index){
return this.weathers.get(index);
}
/**
* Get accurate time of an hour in the future within 24 hours.
* @param index
* @return
*/
public String getTime(int index){
return this.times.get(index);
}
/**
* Get temperature of an hour in the future within 24 hours.
* @param index
* @return
*/
public String getTemperature(int index){
return this.temps.get(index);
}
}