java爬虫-HttpClient小练习

本文记录前几天参照视频写的几个 Java 使用 HttpClient 爬取数据的例子（文末附 Jsoup 解析 HTML 的例子），主要内容包括：

GET请求

带参数的GET请求

POST请求

带参数的POST请求

连接池

请求参数配置

项目目录如下图所示：
pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawler practice project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>cn.mlnt</groupId>
    <artifactId>mlnt-crawler-first</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Apache HttpClient: used by the GET/POST/connection-pool/request-config examples -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <!-- SLF4J binding for log4j 1.2; the test scope is commented out below, so the default scope applies -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <!--<scope>test</scope>-->
        </dependency>
        <!-- jsoup: HTML parsing, used by JsoupFirstTest -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <!-- JUnit 4: only on the test classpath -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <!-- Commons IO: FileUtils is used to read the sample HTML file into a string -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- Commons Lang 3: NOTE(review) not referenced by any example shown in this article; confirm it is needed -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.11</version>
        </dependency>
    </dependencies>
</project>

log4j.properties

# Root logger: DEBUG level, output goes to the appender named A1
log4j.rootLogger = DEBUG,A1
# Loggers under the cn.mlnt package are set to DEBUG as well
log4j.logger.cn.mlnt = DEBUG

# A1 writes to the console using a pattern layout
log4j.appender.A1 = org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout = org.apache.log4j.PatternLayout
# Pattern: timestamp [thread] [logger name]-[level] message, followed by a line separator
log4j.appender.A1.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

GET请求

package cn.mlnt.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;

/**
 * HttpClient example: a plain GET request.
 */
public class HttpGetTest {

    /**
     * Sends a GET request to http://www.itcast.cn and, if the status code is 200,
     * prints the length of the response body (read with "utf8" as the default charset).
     * I/O failures are reported via their stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the GET request for the target URL
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        CloseableHttpResponse response = null;
        try {
            // Execute the request and obtain the response
            response = httpClient.execute(httpGet);

            // Only read the body of a successful response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before response was assigned; without this
            // guard a NullPointerException would also skip closing the client below
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

带参数的GET请求

package cn.mlnt.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;

/**
 * HttpClient example: a GET request with query parameters.
 */
public class HttpGetParamTest {

    /**
     * Builds the URI http://yun.itheima.com/search with the query parameter keys=Java,
     * sends a GET request to it and, if the status code is 200, prints the length of the
     * response body (read with "utf8" as the default charset).
     *
     * @param args command-line arguments (unused)
     * @throws URISyntaxException if the request URI cannot be built
     */
    public static void main(String[] args) throws URISyntaxException {
        // Build the request first: if the URI is invalid we fail before any
        // client has been created, so nothing is left unclosed
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "Java");
        HttpGet httpGet = new HttpGet(uriBuilder.build());

        System.out.println("发起请求的信息："+httpGet);

        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();

        CloseableHttpResponse response = null;
        try {
            // Execute the request and obtain the response
            response = httpClient.execute(httpGet);

            // Only read the body of a successful response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before response was assigned; without this
            // guard a NullPointerException would also skip closing the client below
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

POST请求

package cn.mlnt.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;

/**
 * HttpClient example: a plain POST request without a body.
 */
public class HttpPostTest {

    /**
     * Sends a POST request to http://www.itcast.cn and, if the status code is 200,
     * prints the length of the response body (read with "utf8" as the default charset).
     * I/O failures are reported via their stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the POST request for the target URL
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");

        CloseableHttpResponse response = null;
        try {
            // Execute the request and obtain the response
            response = httpClient.execute(httpPost);

            // Only read the body of a successful response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before response was assigned; without this
            // guard a NullPointerException would also skip closing the client below
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

带参数的POST请求

package cn.mlnt.crawler.test;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

/**
 * HttpClient example: a POST request carrying URL-encoded form parameters.
 */
public class HttpPostParamTest {

    /**
     * Posts the form field keys=Java to http://www.itcast.cn/search and, if the status
     * code is 200, prints the length of the response body (read with "utf8" as the
     * default charset).
     *
     * @param args command-line arguments (unused)
     * @throws UnsupportedEncodingException if the form entity cannot be encoded with the given charset
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Build the request first: if encoding the form fails we bail out before
        // any client has been created, so nothing is left unclosed
        HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");

        // Form fields are sent in the request body, not in the query string
        ArrayList<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("keys", "Java"));

        // First argument: the form fields; second argument: the charset used to encode them
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        httpPost.setEntity(formEntity);

        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();

        CloseableHttpResponse response = null;
        try {
            // Execute the request and obtain the response
            response = httpClient.execute(httpPost);

            // Only read the body of a successful response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before response was assigned; without this
            // guard a NullPointerException would also skip closing the client below
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

连接池

package cn.mlnt.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;

/**
 * HttpClient example: sharing one pooling connection manager between requests.
 */
public class HttpClientPoolTest {

    /**
     * Creates a pooling connection manager (at most 100 connections in total, at most
     * 10 per route), performs two GET requests through it and finally shuts it down.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the pooling connection manager
        PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();

        // Maximum number of connections in the whole pool
        manager.setMaxTotal(100);

        // Maximum number of connections per route (target host)
        manager.setDefaultMaxPerRoute(10);

        try {
            // Issue the requests through the shared connection manager
            doGet(manager);
            doGet(manager);
        } finally {
            // Release the pooled connections once all requests are done
            manager.shutdown();
        }
    }

    /**
     * Sends a GET request to http://www.itcast.cn using connections from the given
     * manager and prints the response body if the status code is 200.
     *
     * @param manager the shared connection manager that supplies the connections
     */
    private static void doGet(PoolingHttpClientConnectionManager manager) {
        // A client object is built per call, but the connections it uses come from
        // the shared manager rather than being opened from scratch each time
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(manager).build();

        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown before response was assigned
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Deliberately do NOT close httpClient here: the connection manager is
            // shared with later calls and is shut down by main() when they are done
        }
    }
}

请求参数配置

package cn.mlnt.crawler.test;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;

/**
 * HttpClient example: configuring timeouts on a request.
 */
public class HttpConfigTest {

    /**
     * Sends a GET request to http://www.itcast.cn with explicit timeouts and, if the
     * status code is 200, prints the length of the response body (read with "utf8" as
     * the default charset).
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the GET request for the target URL
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        // Per-request configuration; all values are in milliseconds
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)          // max time to establish the connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection from the manager
                .setSocketTimeout(10 * 1000)      // max time to wait for data to be transferred
                .build();
        // Attach the configuration to the request
        httpGet.setConfig(config);

        CloseableHttpResponse response = null;
        try {
            // Execute the request and obtain the response
            response = httpClient.execute(httpGet);

            // Only read the body of a successful response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may have thrown (e.g. on a timeout) before response was assigned;
            // without this guard a NullPointerException would also skip closing the client
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

附Jsoup解析html的例子：

package jsoup;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.File;
import java.net.URL;
import java.util.Set;

/**
 * jsoup examples: parsing HTML from a URL, a string and a file, and locating elements
 * through the DOM-style API and through CSS selectors.
 *
 * <p>jsoup can fetch pages itself, but real crawlers usually need multithreading,
 * connection pooling, proxies and the like, which jsoup does not support well. It is
 * therefore normally used only as the HTML parsing tool, with HttpClient doing the
 * fetching.
 */
public class JsoupFirstTest {

    /** Location of the sample HTML file used by the file-based examples. */
    private static final String TEST_HTML_PATH = "C:\\Users\\18476\\Desktop\\test.html";

    /**
     * Parses a page directly from a URL and prints its title.
     *
     * @throws Exception if the page cannot be fetched or parsed
     */
    @Test
    public void testUrl() throws Exception {
        // First argument: the URL to fetch; second argument: the timeout in milliseconds
        Document document = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);

        // Tag lookup: text of the first <title> element
        String title = document.getElementsByTag("title").first().text();

        System.out.println(title);
    }

    /**
     * Parses HTML held in a string and prints its title.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testString() throws Exception {
        // Read the whole file into a string
        String content = FileUtils.readFileToString(new File(TEST_HTML_PATH), "utf8");

        // Parse the string
        Document document = Jsoup.parse(content);

        // Tag lookup: text of the first <title> element
        String title = document.getElementsByTag("title").first().text();

        System.out.println(title);
    }

    /**
     * Parses HTML straight from a file and prints its title.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testFile() throws Exception {
        // Parse the file
        Document document = Jsoup.parse(new File(TEST_HTML_PATH), "utf8");

        // Tag lookup: text of the first <title> element
        String title = document.getElementsByTag("title").first().text();

        System.out.println(title);
    }

    /**
     * Locates elements through the DOM-style lookup methods and prints their text.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testDom() throws Exception {
        // Parse the file into a Document
        Document document = Jsoup.parse(new File(TEST_HTML_PATH), "utf8");

        // 1. By id: getElementById
        Element byId = document.getElementById("s_strpx_span1");

        // 2. By tag name: getElementsByTag
        Elements byTag = document.getElementsByTag("span");

        // 3. By class name: getElementsByClass
        Element byClass = document.getElementsByClass("wrapper_new").first();

        // 4. By attribute name / attribute value: getElementsByAttribute[Value]
        Element byAttribute = document.getElementsByAttribute("style").first();
        Element byAttributeValue = document.getElementsByAttributeValue("id", "s_strpx_span1").first();

        // Print the text of each match
        System.out.println("获取到的元素内容是："+byId.text());
        System.out.println("获取到的元素内容是："+byTag.text());
        System.out.println("获取到的元素内容是："+byClass.text());
        System.out.println("获取到的元素内容是："+byAttribute.text());
        System.out.println("获取到的元素内容是："+byAttributeValue.text());
    }

    /**
     * Shows the ways of reading data out of an element: id, class names, attributes
     * and text.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testData() throws Exception {
        // Parse the file into a Document
        Document document = Jsoup.parse(new File(TEST_HTML_PATH), "utf8");

        // Look the element up by id
        Element element = document.getElementById("s_strpx_span1");

        // str is reassigned by each accessor in turn; only the last value (the text) is printed
        String str = "";

        // 1. The element's id
        str = element.id();

        // 2. The element's class attribute, as one string and as a set of names
        str = element.className();
        Set<String> classSet = element.classNames();
        for (String s : classSet) {
            System.out.println(s);
        }

        // 3. The value of a single attribute
        str = element.attr("id");
        str = element.attr("class");

        // 4. All attributes of the element
        Attributes attributes = element.attributes();
        System.out.println(attributes.toString());

        // 5. The element's text content
        str = element.text();

        System.out.println("获取到的内容是："+str);
    }

    /**
     * Locates elements with the basic CSS selectors.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testSelector() throws Exception {
        // Parse the file into a Document
        Document document = Jsoup.parse(new File(TEST_HTML_PATH), "utf8");

        // tagname: select by tag, e.g. span
        Elements elements = document.select("span");
        for (Element element : elements) {
            System.out.println(element.text());
        }

        // #id: select by id, e.g. #s_strpx_span1
        Element element1 = document.select("#s_strpx_span1").first();
        System.out.println("获取到的结果1是："+element1);

        // .class: select by class name, e.g. .wrapper_new
        Element element2 = document.select(".wrapper_new").first();
        System.out.println("获取到的结果2是："+element2);

        // [attribute]: select by attribute presence, e.g. [style]
        Element element3 = document.select("[style]").first();
        System.out.println("获取到的结果3是："+element3);

        // [attr=value]: select by attribute value, e.g. [id=s_strpx_span1]
        Element element4 = document.select("[id=s_strpx_span1]").first();
        System.out.println("获取到的结果4是："+element4);
    }

    /**
     * Locates elements with combined CSS selectors.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testSelector2() throws Exception {
        // Parse the file into a Document
        Document document = Jsoup.parse(new File(TEST_HTML_PATH), "utf8");

        // el#id: tag + id, e.g. span#s_strpx_span1
        Element element1 = document.select("span#s_strpx_span1").first();
        System.out.println("获取到的结果1是："+element1);

        // el.class: tag + class, e.g. span.wrapper_new
        Element element2 = document.select("span.wrapper_new").first();
        System.out.println("获取到的结果2是："+element2);

        // el[attr]: tag + attribute name, e.g. span[style]
        Element element3 = document.select("span[style]").first();
        System.out.println("获取到的结果3是："+element3);

        // Any combination, e.g. span[style].s_name
        Element element4 = document.select("span[style].s_name").first();
        System.out.println("获取到的结果4是："+element4);

        // ancestor child: descendants of an element, e.g. ".city_con li" finds every li under city_con
        Element element5 = document.select(".city_con li").first();
        System.out.println("获取到的结果5是："+element5);

        // parent > child: direct children only, e.g. ".city_con > ul > li" finds the ul elements
        // that are direct children of city_con, then the li elements directly under those
        Elements directListItems = document.select(".city_con > ul > li");
        for (Element item : directListItems) {
            System.out.println(item);
        }

        // parent > *: every direct child of the parent, whatever its tag
        Elements allDirectChildren = document.select(".city_con > ul > *");
        for (Element child : allDirectChildren) {
            System.out.println(child);
        }
    }

}

java爬虫-HttpClient小练习

猜你喜欢