爬虫(一) java爬取起点中文网小说

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/QuietHRH/article/details/82794063

思维导图

获取数据

1. 原生JDK

  • 创建URL对象
  • 获取连接
  • 设置请求方式
    • post方式要打开输出流,因为参数在请求体中. conn.setDoOutput(true);
  • 流的方式获取数据
package com.hrh.jdk;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;

/**
 * Demonstrates sending an HTTP POST request using only the JDK's
 * {@link HttpURLConnection} and printing the response body to standard output.
 */
public class JdkPost {

    /**
     * Posts a small form-encoded body to a fixed URL and prints the response.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any network I/O fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the URL object.
        URL url = new URL("http://www.itcast.cn");
        // 2. Open the connection.
        HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();

        try {
            // 3. Choose the request method (use "GET" here for a GET request).
            httpURLConnection.setRequestMethod("POST");

            // 4. Send the request parameters. For POST they travel in the request body,
            //    so output must be enabled explicitly (it is off by default).
            httpURLConnection.setDoOutput(true);
            try (OutputStream out = httpURLConnection.getOutputStream()) {
                // Explicit charset: do not depend on the platform default encoding.
                out.write("username=zs&password=123".getBytes(StandardCharsets.UTF_8));
            }

            // 5. Read the response. Decoding through a Reader keeps multi-byte characters
            //    intact even when they straddle a buffer boundary.
            //    NOTE(review): assumes the server answers in UTF-8 - confirm against the
            //    response Content-Type if other sites are targeted.
            try (Reader in = new InputStreamReader(
                    httpURLConnection.getInputStream(), StandardCharsets.UTF_8)) {
                char[] buffer = new char[1024];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    // print (not println) so that no artificial line breaks are injected
                    // between buffer-sized chunks of the page.
                    System.out.print(new String(buffer, 0, len));
                }
                System.out.println();
            }
        } finally {
            // Release the underlying connection even when the request fails.
            httpURLConnection.disconnect();
        }
    }
}
	

2. httpClient

  • 导入依赖
  • 获取httpClient对象
  • 设置请求方式
  • 设置请求参数
  • 发送请求,获取数据
package com.itheima.httpClient;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;


/**
 * Demonstrates sending a form-encoded HTTP POST request with Apache HttpClient
 * and printing the status code, the Content-Type header and the response body.
 */
public class HttpClientPost {

    /**
     * Posts two form fields to a fixed URL and prints details of the response.
     *
     * @param args unused
     * @throws IOException if the request cannot be sent or the response cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 1. Obtain the client. try-with-resources closes it (and its connections) on exit.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // 2. Choose the request method (an HttpGet would be used for a GET request).
            HttpPost httpPost = new HttpPost("http://www.itcast.cn");

            // 3. Set the request parameters. For POST they are carried in the request body.
            List<BasicNameValuePair> list = new ArrayList<BasicNameValuePair>();
            list.add(new BasicNameValuePair("username", "zhangsan"));
            list.add(new BasicNameValuePair("age", "18"));

            // Encode the form explicitly as UTF-8 so non-ASCII values are not mangled.
            HttpEntity entity = new UrlEncodedFormEntity(list, "utf-8");
            httpPost.setEntity(entity);

            // 4. Send the request; the response is closed when this block exits.
            try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                // 5.1 Status code.
                int statusCode = response.getStatusLine().getStatusCode();
                System.out.println(statusCode);

                // 5.2 Response header. The header may be absent, so guard the array access.
                Header[] headers = response.getHeaders("Content-Type");
                if (headers.length > 0) {
                    System.out.println(headers[0].getValue());
                }

                // 5.3 Response body. Some responses carry no entity at all.
                HttpEntity responseEntity = response.getEntity();
                if (responseEntity != null) {
                    String html = EntityUtils.toString(responseEntity, "utf-8");
                    System.out.println(html);
                }
            }
        }
    }
}

解析数据

jsoup

  • 导入依赖
  • 获取dom对象
    • Jsoup.parse( String html ); 常用
    • Jsoup.connect( url ).get(); 指定url
    • Jsoup.parse( File in, String charset); 指定file文件路径
    • Jsoup.parseBodyFragment( String html ) 解析html片段
  • 根据选择器获取Elements
package com.hrh.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;

/**
 * Demonstrates fetching a page with jsoup and extracting elements with a CSS selector.
 */
public class ItcastParseSelector {

    /**
     * Downloads the home page and prints the text of every navigation link.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // 1. Fetch the page and parse it into a DOM document.
        Document page = Jsoup.connect("http://www.itcast.cn").get();

        // 2. Select the navigation anchors and print the text of each one in order.
        Elements navLinks = page.select(".nav_txt>ul>li>a");
        java.util.Iterator<Element> cursor = navLinks.iterator();
        while (cursor.hasNext()) {
            Element link = cursor.next();
            System.out.println(link.text());
        }
    }
}

jsoup的常用方法:

  • parse(String html); 用来得到dom对象
  • select(选择器);
  • text(); 获取指定元素的内容体(只能获取文本内容)
  • html();获取指定元素的内容体(可以将html代码也一并获取到)
  • attr(name); 根据指定的属性名称获取其对应属性的值

保存数据

当解析完数据以后, 要将解析后的数据保存到对应的容器中(如 MySQL、文件)。目前采用的是 MySQL, 后期可以使用 Hadoop、HBase。

  • 使用数据库来进行保存数据的操作有几种方式:
    • JDBC: 七大步
    • DbUtils: queryRunner
    • mybatis
    • spring中jdbcTemplate

案例 爬取起点中文网的榜单小说

package com.hrh.anli;


import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;

/**
 * Crawls one ranking list on the qidian.com home page and saves the free chapters
 * of every listed book as a text file, one file per book.
 */
public class QiDianDemo {

    /**
     * Downloads the page at the given URL and parses it into a jsoup document.
     *
     * <p>The HTTP client and response are closed before returning, so repeated calls
     * (one per chapter) do not accumulate open connections.
     *
     * @param url absolute URL of the page to fetch
     * @return the parsed document
     * @throws Exception if the request fails or the response cannot be read
     */
    public static Document getDocument(String url) throws Exception {
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            return Jsoup.parse(html);
        }
    }

    /**
     * Entry point: walks the ranking list and saves each book found in it.
     *
     * @param args unused
     * @throws Exception if any page cannot be fetched or any file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Home page of the site.
        Document homePage = getDocument("https://www.qidian.com/");

        // Book links of one ranking list (the list selected by this class attribute).
        Elements bookLinks = homePage.select(
                "[class=rank-list mr0] li a[href*=book.qidian.com][class!=link]");

        for (Element bookLink : bookLinks) {
            // The hrefs are written without a scheme, hence the "https:" prefix.
            Document bookPage = getDocument("https:" + bookLink.attr("href"));

            String bookName = bookPage.select(".book-info h1 em").text();
            String readHref = bookPage.select("#readBtn").attr("href");
            if (readHref.isEmpty()) {
                // No "start reading" link on this page: nothing to crawl for this book.
                continue;
            }

            saveBook(bookName, "https:" + readHref);
        }
    }

    /**
     * Follows the "next chapter" links starting at the given chapter and writes every
     * chapter title and paragraph to {@code E://<bookName>.txt}.
     *
     * @param bookName        title of the book, used for the output file name
     * @param firstChapterUrl absolute URL of the first chapter
     * @throws Exception if a chapter cannot be fetched or the file cannot be written
     */
    private static void saveBook(String bookName, String firstChapterUrl) throws Exception {
        String path = "E://" + toSafeFileName(bookName) + ".txt";

        // try-with-resources closes the file even if a chapter download fails midway.
        // The charset is explicit so the output does not depend on the platform default.
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), "utf-8"))) {
            String url = firstChapterUrl;
            while (true) {
                Document chapter = getDocument(url);

                // Chapter title.
                bw.write(chapter.select(".j_chapterName").text());
                bw.newLine();

                // Chapter body, one paragraph per line.
                for (Element p : chapter.select("[class=read-content j_readContent] p")) {
                    bw.write(p.text());
                    bw.newLine();
                }
                // Flush once per chapter so finished chapters reach the disk promptly.
                bw.flush();

                // The selector only matches a next-chapter link that points at
                // read.qidian.com; when nothing matches, this book is finished.
                Elements chapterNext = chapter.select("#j_chapterNext[href*=read.qidian.com]");
                if (chapterNext.isEmpty()) {
                    break;
                }
                url = "https:" + chapterNext.attr("href");
            }
        }
    }

    /**
     * Replaces characters that are not allowed in Windows file names with underscores.
     *
     * @param name raw name taken from the page
     * @return the name with each of {@code \ / : * ? " < > |} replaced by {@code _}
     */
    private static String toSafeFileName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}

猜你喜欢

转载自blog.csdn.net/QuietHRH/article/details/82794063
今日推荐