Jsoup爬取CSDN博客

版权声明:本文为博主原创文章,未经博主允许不得转载 https://blog.csdn.net/liujun03/article/details/82832617

个人Jsoup练习之作,仅供参考。下面先给出需要在 pom.xml 中引入的 Jsoup 依赖,随后是完整的爬取代码:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
package CSDN;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Practice crawler that walks the paginated article list under {@link #BASE_URL}
 * and writes one summary entry per article (title, link, and the first three
 * {@code p span} texts of the entry) to a timestamped UTF-8 text file.
 *
 * <p>Fetching is best-effort: a page that cannot be downloaded or an entry that
 * does not have the expected markup is reported/skipped and the run continues.
 *
 * @Author: BaiDing
 * @Date: 2018/9/24 19:14
 */
public class JsopDemo {

    private static final String BASE_URL = "https://blog.csdn.net/liujun03/article/list/";

    /** Directory the report file is written to. */
    private static final String OUTPUT_DIR = "D:/CSDN/";

    /** Running sequence number of the articles written so far by this instance. */
    private int articleSort = 0;

    /**
     * Determines how many list pages exist by reading the {@code pageSize} and
     * {@code listTotal} assignments from the inline script that mentions
     * {@code getAllUrl}.
     *
     * @return the number of pages, or 0 when the page could not be fetched or
     *         no usable {@code pageSize}/{@code listTotal} pair was found
     */
    private int getAllPageCount() {
        int count = 0;
        try {
            Document doc = Jsoup.connect(BASE_URL).get();
            Elements scriptList = doc.select("script");
            for (Element script : scriptList) {
                String text = script.data();
                if (!text.contains("getAllUrl")) {
                    continue;
                }
                int pageSize = 0;
                int listTotal = 0;
                for (String statement : text.split(";")) {
                    if (statement.contains("pageSize")) {
                        pageSize = parseAssignedInt(statement, pageSize);
                    }
                    if (statement.contains("listTotal")) {
                        listTotal = parseAssignedInt(statement, listTotal);
                    }
                }
                // Without a positive page size the division below would throw
                // ArithmeticException; leave the count untouched in that case.
                if (pageSize > 0) {
                    count = listTotal % pageSize == 0
                            ? listTotal / pageSize
                            : listTotal / pageSize + 1;
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to read page count from " + BASE_URL);
            e.printStackTrace();
        }
        return count;
    }

    /**
     * Extracts the integer on the right-hand side of a {@code name = value}
     * script fragment.
     *
     * @param statement one {@code ;}-separated fragment of the script text
     * @param fallback  value to return when the fragment is not a plain integer
     *                  assignment (so an earlier successfully parsed value is kept)
     * @return the parsed integer, or {@code fallback}
     */
    private static int parseAssignedInt(String statement, int fallback) {
        String[] parts = statement.split("=");
        if (parts.length < 2) {
            return fallback;
        }
        try {
            return Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException ignored) {
            // The keyword appeared in a fragment that is not "name = <int>".
            return fallback;
        }
    }

    /**
     * Fetches one list page and appends an entry for every article whose link
     * contains {@code "liujun"}.
     *
     * @param page 1-based page number appended to {@link #BASE_URL}
     * @param bw   writer the entries are appended to; not closed by this method
     */
    private void testJsop(int page, BufferedWriter bw) {
        String url = BASE_URL + page;
        try {
            Document doc = Jsoup.connect(url).get();
            Elements articleDiv = doc.select("div.article-list");
            Elements articleList = articleDiv.select("div.csdn-tracking-statistics");
            for (Element article : articleList) {
                // first() yields null for an empty selection, so check before use.
                Element link = article.select("a").first();
                if (link == null) {
                    continue;
                }
                String linkUrl = link.attr("href");
                if (!linkUrl.contains("liujun")) {
                    continue;
                }
                Elements num = article.select("p span");
                if (num.size() < 3) {
                    // Entry lacks the three spans the report needs; skip it
                    // instead of failing the whole page.
                    continue;
                }
                articleSort++;

                String linkTitle = link.text();
                String date = num.get(0).text();
                String readNum = num.get(1).text();
                String reviewNum = num.get(2).text();
                StringBuilder data = new StringBuilder()
                        .append(articleSort)
                        .append(" 标题: ")
                        .append(linkTitle)
                        .append(" , 链接: ")
                        .append(linkUrl)
                        .append("\t\n")
                        .append("创建时间: ")
                        .append(date)
                        .append(" , ")
                        .append(readNum)
                        .append(" , ")
                        .append(reviewNum)
                        .append("\t\n");
                bw.write(data.toString());
            }
        } catch (IOException e) {
            // Best-effort: report the failing page and let the caller continue.
            System.err.println("Failed to process " + url);
            e.printStackTrace();
        }
    }

    /**
     * Entry point: creates {@code D:/CSDN/<yyyyMMddHHmmss>.txt} and writes the
     * entries of every list page into it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        JsopDemo jsopDemo = new JsopDemo();
        DateFormat bf = new SimpleDateFormat("yyyyMMddHHmmss");
        File file = new File(OUTPUT_DIR + bf.format(new Date()) + ".txt");

        // FileOutputStream creates the file itself but not missing directories.
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            System.err.println("Cannot create output directory: " + parent);
            return;
        }

        // try-with-resources flushes and closes the writer chain even when a
        // page fails part-way through.
        try (FileOutputStream fos = new FileOutputStream(file);
             OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
             BufferedWriter bw = new BufferedWriter(osw)) {
            int count = jsopDemo.getAllPageCount();
            for (int i = 1; i <= count; i++) {
                jsopDemo.testJsop(i, bw);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

猜你喜欢

转载自blog.csdn.net/liujun03/article/details/82832617