A Simple Crawler Tool Built with jsoup and httpclient

I. Inspiration

A couple of days ago a friend came to me with a requirement: scrape data from a web page and automatically fill it into a local Excel spreadsheet. When you have a lot of data to compile, this is very convenient and turns a tedious manual task into an automated one. Isn't that exactly what shell scripting is for? (This article has nothing to do with shell scripting; the thought just came to mind.)

II. Requirements Analysis

First, the requirement needs a rough analysis and design (since this is only a simple test demo, feasibility, maintainability and so on are not a concern). It boils down to the following steps (the sketch after this list shows how they map onto the demo's classes):

  1. To crawl page data we must know the page's URL; this is the entry point for fetching data.
  2. The site responds to the request sent to that URL and returns an HTML response page.
  3. Check whether the returned result is what we need.
  4. Parse the returned HTML page.
  5. Package the parsed data and write it to local disk.
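
Condensed into code, the five steps end up as the short call sequence below. This is only a preview of the classes defined in the next section (UrlToHtml, DataParse, WriteToLocal), not extra functionality:

// Step 1: the entry URL of the page to crawl
String url = "http://tieba.baidu.com/f?kw=%E7%BC%96%E7%A8%8B&ie=utf-8&pn=0";
// Steps 2-4: send the request, check the status code, parse the HTML (DataParse is called inside URLParser)
List<Model> rows = new UrlToHtml().URLParser(url);
// Step 5: package the parsed rows and write them to local disk
new WriteToLocal().writeToExcel(rows, 4, new String[]{"帖子标题", "作者", "帖子内容", "发帖日期"}, "C:\\test6.xls");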

III. Solution

This example scrapes post information from the Baidu Tieba "编程" (programming) forum.

1. Jar packages used: mainly jsoup, Apache HttpClient, and Apache POI (as can be seen from the imports in the code below).
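
Judging from those imports, a Maven setup along these lines should work. The version numbers below are placeholder choices of mine; any jsoup 1.8+, httpclient 4.x, and poi 3.x release should be compatible with the code shown here:

<dependencies>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.17</version>
    </dependency>
</dependencies>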

2. Demo structure

  • bean package: holds the objects to be stored
  • main package: the test entry point
  • parser package: parses the HTML response page
  • service package: writes the parsed results to local disk
  • util package: talks to the web page and returns the HTML response page

The Model class (the object to be stored):

package com.crawler.bean;
public class Model {
    private String cardTitle;   // post title
    private String authorName;  // author
    private String cardContent; // post content
    private String cardDate;    // post date
    public String getCardTitle() {
        return cardTitle;
    }
    public void setCardTitle(String cardTitle) {
        this.cardTitle = cardTitle;
    }
    public String getAuthorName() {
        return authorName;
    }
    public void setAuthorName(String authorName) {
        this.authorName = authorName;
    }
    public String getCardContent() {
        return cardContent;
    }
    public void setCardContent(String cardContent) {
        this.cardContent = cardContent;
    }
    public String getCardDate() {
        return cardDate;
    }
    public void setCardDate(String cardDate) {
        this.cardDate = cardDate;
    }
}

The UrlToHtml class (fetches and returns the HTML response page):

package com.crawler.util;
import com.crawler.bean.Model;
import com.crawler.parser.DataParse;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHttpResponse;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class UrlToHtml {
    public List<Model> URLParser(String url) throws Exception {
        // Initialize an HttpClient
        HttpClient client = new DefaultHttpClient();
        // Holds the parsed data
        List<Model> cardDatas = new ArrayList<>();
        // Fetch the response document (the HTML) with a GET request
        HttpGet getMethod = new HttpGet(url);
        HttpResponse response = new BasicHttpResponse(HttpVersion.HTTP_1_1,
                HttpStatus.SC_OK, "OK");
        try {
            // Execute the GET request
            response = client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
            // The request failed, so there is nothing to parse
            return cardDatas;
        }
        // Read the response status code
        int statusCode = response.getStatusLine().getStatusCode();
        // If the status code is 200, read the HTML (or JSON) entity
        if (statusCode == 200) {
            // Decode the entity as UTF-8
            String entity = EntityUtils.toString(response.getEntity(), "utf-8");
            // Parse the HTML content of the response
            cardDatas = DataParse.getData(entity);
            EntityUtils.consume(response.getEntity());
        } else {
            // Otherwise just consume the entity
            EntityUtils.consume(response.getEntity());
        }
        return cardDatas;
    }
}
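
Side note: DefaultHttpClient has been deprecated since HttpClient 4.3. A minimal sketch of the same fetch-and-parse flow against the newer API is shown below; the method name urlParserModern is mine, not part of the original demo, and it assumes httpclient 4.3+ on the classpath (imports from org.apache.http.impl.client and org.apache.http.client.methods):

    // Sketch only: same GET, status check, and parse, using the non-deprecated client.
    public List<Model> urlParserModern(String url) throws Exception {
        CloseableHttpClient client = HttpClients.createDefault();
        try (CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            // Only parse when the server actually returned 200 OK
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = EntityUtils.toString(response.getEntity(), "utf-8");
                return DataParse.getData(html);
            }
            return new ArrayList<>();
        } finally {
            client.close();
        }
    }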

The DataParse class (parses the HTML response page):

package com.crawler.parser;

import com.crawler.bean.Model;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

public class DataParse {
    public static List<Model> getData(String html) throws Exception {
        // cardDatas holds the results
        List<Model> cardDatas = new ArrayList<Model>();
        // Parse the HTML with jsoup
        Document doc = Jsoup.parse(html);
        // Select the thread entries ("cleafix" is the class name actually used in the Tieba markup)
        Elements elements = doc.select("div[class=content]").select("ul[id=thread_list]").select("div[class=t_con cleafix]");
        // Walk the matched elements
        for (Element ele : elements) {
            // Post title
            String cardName = ele.select("a").text();
            // Author: the title attribute carries a prefix (e.g. "主题作者: "), so drop the first 6 characters
            String authorName = ele.select("div[class=threadlist_author pull_right]").select("span").attr("title");
            String newAuthorName = authorName.substring(6);
            // Post content
            String cardContent = ele.select("div[class=threadlist_text pull_left]").text();
            // Post date
            String cardDate = ele.select("div[class=threadlist_author pull_right]").select("span[class=pull-right is_show_create_time]").text();
            // Copy into the Model bean
            Model cd = new Model();
            cd.setCardTitle(cardName);
            cd.setAuthorName(newAuthorName);
            cd.setCardContent(cardContent);
            cd.setCardDate(cardDate);
            cardDatas.add(cd);
        }
        // Return the parsed rows
        return cardDatas;
    }
}
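
As an aside, jsoup can also fetch the page on its own, so in principle the HttpClient step could be skipped entirely. A hedged sketch, using the jsoup classes already imported above (the user-agent string and timeout are arbitrary values I picked, not part of the original demo):

    // Sketch only: let jsoup perform the HTTP GET and feed the result straight into DataParse.
    Document doc = Jsoup.connect("http://tieba.baidu.com/f?kw=%E7%BC%96%E7%A8%8B&ie=utf-8&pn=0")
            .userAgent("Mozilla/5.0")   // some sites reject requests without a browser-like user agent
            .timeout(5000)              // connect/read timeout in milliseconds
            .get();
    List<Model> cardDatas = DataParse.getData(doc.outerHtml());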

The WriteToLocal class (writes the results to local disk):

package com.crawler.service;
import com.crawler.bean.Model;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hssf.util.HSSFColor;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
public class WriteToLocal {
    public void writeToExcel(List<Model> cardDatas, int columnCount, String[] titles, String path) {
        HSSFWorkbook hssfWorkbook = new HSSFWorkbook();
        HSSFSheet sheet = hssfWorkbook.createSheet("我的表格");
        // Create the header row
        HSSFRow headRow = sheet.createRow(0);
        for (int i = 0; i < columnCount; i++) {
            HSSFCell cell = headRow.createCell(i);
            cell.setCellType(HSSFCell.CELL_TYPE_STRING);
            // Set the column width
            sheet.setColumnWidth(i, 8000);
            HSSFCellStyle style = hssfWorkbook.createCellStyle();
            // Set the font color of the column names
            HSSFFont font = hssfWorkbook.createFont();
            short color = HSSFColor.GREEN.index;
            font.setColor(color);
            style.setFont(font);
            // Write the header cell
            cell.setCellStyle(style);
            cell.setCellValue(titles[i]);
        }
        // Row index counter
        int index = 0;
        // Write the data rows
        for (Model model : cardDatas) {
            HSSFRow hssfRow = sheet.createRow(index + 1);
            for (int n = 0; n < columnCount; n++)
                hssfRow.createCell(n);
            hssfRow.getCell(0).setCellValue(model.getCardTitle());
            hssfRow.getCell(1).setCellValue(model.getAuthorName());
            hssfRow.getCell(2).setCellValue(model.getCardContent());
            hssfRow.getCell(3).setCellValue(model.getCardDate());
            index++;
        }
        // Flush the workbook to disk
        try {
            FileOutputStream fileOutputStream = new FileOutputStream(new File(path));
            hssfWorkbook.write(fileOutputStream);
            fileOutputStream.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
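
One caveat: HSSFWorkbook writes the old binary .xls format, so the output path should use an .xls extension (as the test class below does). If a genuine .xlsx file is preferred, the same logic can be rewritten against the poi-ooxml module instead; a minimal sketch under that assumption (XSSFWorkbook from org.apache.poi.xssf.usermodel, plus the org.apache.poi.ss.usermodel interfaces):

    // Sketch only: the XSSF classes mirror the HSSF ones and produce a real .xlsx file.
    Workbook workbook = new XSSFWorkbook();
    Sheet sheet = workbook.createSheet("我的表格");
    Row head = sheet.createRow(0);
    head.createCell(0).setCellValue("帖子标题");
    try (FileOutputStream out = new FileOutputStream("C:\\test6.xlsx")) {
        workbook.write(out);
    }
    workbook.close();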

The TestCrawler class (test entry point):

package com.crawler.main;
import com.crawler.bean.Model;
import com.crawler.service.WriteToLocal;
import com.crawler.util.UrlToHtml;
import java.util.List;
public class TestCrawler {
    public static void main(String[] args) throws Exception {
        // URL of the page whose data we want to scrape
        String url = "http://tieba.baidu.com/f?kw=%E7%BC%96%E7%A8%8B&ie=utf-8&pn=0";
        UrlToHtml urlToHtml = new UrlToHtml();
        List<Model> cardDatas = urlToHtml.URLParser(url);
        // Write the results to local disk
        WriteToLocal wt = new WriteToLocal();
        // Number of columns in the sheet, i.e. the number of fields in Model
        int columnCount = 4;
        // Column names, i.e. the names of the Model fields
        String[] titles = {"帖子标题", "作者", "帖子内容", "发帖日期"};
        // Local path to write to (HSSFWorkbook produces the binary .xls format)
        String path = "C:\\test6.xls";
        wt.writeToExcel(cardDatas, columnCount, titles, path);
    }
    }
}

Crawl target: (screenshot of the Tieba page omitted)
Crawl result (because of the time between runs, freshly posted data may differ slightly): (screenshot of the generated spreadsheet omitted)

IV. Summary

The case above is only a simple demonstration. It is not very flexible (for example, no regular expressions are used for matching), but it does cover the main functionality, and it will keep improving over time!

Reposted from blog.csdn.net/jackFXX/article/details/79278760