未来24小时逐小时天气预报爬虫(数据来自中国天气网)/Java

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/XGL1569348/article/details/76602013

使用前请先通过Maven引入HtmlUnit依赖,或者从HtmlUnit官网下载相关jar包。
    
使用方法见主方法内,传入的ID数据见https://drive.google.com/file/d/0B9zkTpK3eXCGc01XM2xPeHFSdEU/view?usp=sharing

原理:使用HtmlUnit模拟浏览器加载页面并执行JS,然后对得到的HTML代码进行解析,从而获得相关文本信息。

注意:

  • 因为数据是从网站页面上抓取的,网页结构一旦变化代码就可能失效,因此可能不是很稳定,但思路大同小异。(中国天气网似乎没有什么反爬虫策略?)
  • getByXPath()会按给定的XPath规则进行搜索。以"//"开头的规则(如本例的"//li")会搜索整个网页的HTML代码,无论你是通过page还是通过HtmlDivision来调用这个方法;如果只想在某个节点内部搜索,应使用以"."开头的相对路径(如".//li")。
  • 因为这个demo抓取的是纯文本信息,所以其实将page直接转化为text,再去操作字符串的话,速度上可能差距不是很大。
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.*;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes the hourly weather forecast for the next 24 hours for one city from the mobile
 * pages of weather.com.cn, using HtmlUnit to load the page.
 *
 * <p>Typical use: construct with a city ID, call {@link #initData()}, and if it returned
 * {@code true} read the entries through {@link #getTime(int)}, {@link #getWeather(int)} and
 * {@link #getTemperature(int)} with an index in {@code [0, 24)}.</p>
 *
 * <p>Created by Xgl on 2017/8/2. Powered by HtmlUnit.</p>
 */
public class Spider {

    /** Number of hourly entries read from the page. */
    private static final int TOTAL_NUM = 24;

    /** Timeout, in milliseconds, passed to {@code WebClient.waitForBackgroundJavaScript}. */
    private static final long JS_WAIT_MILLIS = 50000;

    private final List<String> times;
    private final List<String> weathers;
    private final List<String> temps;
    private final String url;

    /**
     * Test
     */
    public static void main(String[] args) {
        Spider spider = new Spider("101230505");
        if (spider.initData()) {
            //Then you can do everything you want,
            //e.g. read spider.getTime(0), spider.getWeather(0), spider.getTemperature(0).
        }

    }

    /**
     * Initialize url and lists in constructor.
     *
     * @param cityID city identifier that is inserted into the forecast page URL
     */
    public Spider(String cityID) {
        times = new ArrayList<>(TOTAL_NUM);
        weathers = new ArrayList<>(TOTAL_NUM);
        temps = new ArrayList<>(TOTAL_NUM);
        url = "http://m.weather.com.cn/mhours/" + cityID + ".shtml";
    }

    /**
     * Loads the forecast page and fills the time, weather and temperature lists.
     *
     * <p>The page is expected to contain at least {@code 2 * 24} {@code <li>} elements: the
     * first 24 provide the times, the next 24 provide the weather (second {@code <span>}) and
     * the temperature (third {@code <span>}). Data is collected into temporary lists and only
     * published when the whole page was parsed, so a failed call leaves previously loaded
     * data untouched and a repeated call replaces the data instead of appending to it.</p>
     *
     * @return Successful if true, failed if false (I/O error or unexpected page structure)
     */
    public boolean initData() {
        final List<String> parsedTimes = new ArrayList<>(TOTAL_NUM);
        final List<String> parsedWeathers = new ArrayList<>(TOTAL_NUM);
        final List<String> parsedTemps = new ArrayList<>(TOTAL_NUM);

        // try-with-resources guarantees the client is closed on every exit path,
        // including I/O failures and unexpected runtime exceptions.
        try (WebClient webClient = new WebClient()) {
            configure(webClient);

            // NOTE(review): this wait runs before any page has been requested, so it
            // presumably has no background JavaScript to wait for. It is kept where the
            // original code had it; moving it after getPage() is probably what was
            // intended but could add up to JS_WAIT_MILLIS of latency - confirm first.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);

            final Object loaded = webClient.getPage(url);
            if (!(loaded instanceof HtmlPage)) {
                return fail("response is not an HTML page");
            }

            //TODO:Perhaps it can be improved to reduce the wasted time because of such a violent algorithms.
            final List<?> items = ((HtmlPage) loaded).getByXPath("//li");
            if (items.size() < TOTAL_NUM * 2) {
                return fail("expected at least " + (TOTAL_NUM * 2)
                        + " <li> elements but found " + items.size());
            }

            // First TOTAL_NUM items: the text of each item is the time.
            for (int i = 0; i < TOTAL_NUM; i++) {
                final String time = firstChildText(items.get(i));
                if (time == null) {
                    return fail("<li> #" + i + " has no content");
                }
                parsedTimes.add(time);
            }

            // Next TOTAL_NUM items: span #1 holds the weather, span #2 the temperature.
            for (int i = TOTAL_NUM; i < TOTAL_NUM * 2; i++) {
                final Object item = items.get(i);
                if (!(item instanceof HtmlListItem)) {
                    return fail("<li> #" + i + " is not a list item");
                }
                final List<?> spans = ((HtmlListItem) item).getElementsByTagName("span");
                if (spans.size() < 3) {
                    return fail("<li> #" + i + " has only " + spans.size() + " <span> elements");
                }
                final String weather = firstChildText(spans.get(1));
                final String temp = firstChildText(spans.get(2));
                if (weather == null || temp == null) {
                    return fail("<li> #" + i + " has an empty weather or temperature <span>");
                }
                parsedWeathers.add(weather);
                parsedTemps.add(temp);
            }

        } catch (IOException e) {
            System.err.println("error: failed to load " + url);
            e.printStackTrace();
            return false;
        }

        // Publish only complete results.
        times.clear();
        times.addAll(parsedTimes);
        weathers.clear();
        weathers.addAll(parsedWeathers);
        temps.clear();
        temps.addAll(parsedTemps);
        return true;
    }

    /**
     * Applies the client options used for scraping: CSS and native ActiveX off, insecure SSL
     * allowed, and no exceptions on script errors or failing HTTP status codes.
     */
    private static void configure(WebClient webClient) {
        final WebClientOptions options = webClient.getOptions();
        options.setCssEnabled(false);
        options.setUseInsecureSSL(true);
        options.setActiveXNative(false);
        options.setThrowExceptionOnScriptError(false);
        options.setThrowExceptionOnFailingStatusCode(false);
    }

    /**
     * Returns the string form of the first child of the given DOM node.
     *
     * @return {@code toString()} of the first child, or {@code null} if {@code node} is not a
     *         DOM node or has no children
     */
    private static String firstChildText(Object node) {
        if (!(node instanceof DomNode)) {
            return null;
        }
        final DomNode child = ((DomNode) node).getFirstChild();
        // Presumably the first child is a text node whose toString() yields its text.
        return child == null ? null : child.toString();
    }

    /** Reports a parsing problem on stderr and returns {@code false} for use in {@code return}. */
    private boolean fail(String reason) {
        System.err.println("error: " + reason + " (" + url + ")");
        return false;
    }

    /**
     * Get weather type of an hour in the future within 24 hours.
     *
     * @param index zero-based position of the hourly entry, in {@code [0, 24)}
     * @return the weather text scraped for that entry
     * @throws IndexOutOfBoundsException if {@code index} is out of range, which includes any
     *         index while no successful {@link #initData()} call has filled the list
     */
    public String getWeather(int index) {
        return this.weathers.get(index);
    }

    /**
     * Get accurate time of an hour in the future within 24 hours.
     *
     * @param index zero-based position of the hourly entry, in {@code [0, 24)}
     * @return the time text scraped for that entry
     * @throws IndexOutOfBoundsException if {@code index} is out of range, which includes any
     *         index while no successful {@link #initData()} call has filled the list
     */
    public String getTime(int index) {
        return this.times.get(index);
    }

    /**
     * Get temperature of an hour in the future within 24 hours.
     *
     * @param index zero-based position of the hourly entry, in {@code [0, 24)}
     * @return the temperature text scraped for that entry
     * @throws IndexOutOfBoundsException if {@code index} is out of range, which includes any
     *         index while no successful {@link #initData()} call has filled the list
     */
    public String getTemperature(int index) {
        return this.temps.get(index);
    }

}

猜你喜欢

转载自blog.csdn.net/XGL1569348/article/details/76602013