JAVA——基于HttpComponents(HttpClient)的简单网络爬虫DEMO

基本概念

HttpComponents(HttpClient)

超文本传输协议(HTTP)可能是当今Internet上使用的最重要的协议。Web服务、支持网络的设备以及网络计算的增长,不断将HTTP协议的作用扩展到用户驱动的Web浏览器之外,同时也增加了需要HTTP支持的应用程序的数量。

HttpComponents是为扩展而设计的,同时提供了对基本HTTP协议的强大支持,适用于构建HTTP感知的客户端和服务器应用程序(例如Web浏览器、Web Spider、HTTP代理、Web服务传输库,或者利用、扩展HTTP协议进行分布式通信的系统)。

官网

官网地址:http://hc.apache.org/ 

Maven

        <!--
          Dependencies for the demo below.
          NOTE(review): the HTTPClientUtil class in this article also imports org.slf4j
          (Logger/LoggerFactory) and org.springframework.util.StringUtils, neither of which is
          declared in this snippet; add the corresponding slf4j and Spring artifacts (or make sure
          they are inherited from a parent POM) before compiling.
          commons-collections4 and jsoup are declared here but are not referenced by the
          classes shown in this article.
        -->
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4.10</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-collections4</artifactId>
            <version>4.1</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

源代码

HTTPClientPool 

package club.zstuca.httpclient;

import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;


import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;




/**
 * Factory for {@link CloseableHttpClient} instances that all share one pooled
 * connection manager.
 *
 * <p><strong>Security note:</strong> the HTTPS socket factory built here trusts every
 * certificate and disables hostname verification. That is convenient for a crawler demo
 * but removes all protection against man-in-the-middle attacks; do not reuse this
 * configuration for traffic that carries credentials or other sensitive data.
 */
public class HTTPClientPool {
    private static final String HTTP = "http";
    private static final String HTTPS = "https";

    /** Upper bound on connections across all routes. */
    private static final int MAX_TOTAL_CONNECTIONS = 200;

    /**
     * Upper bound on connections per route (host). Without this the pool keeps the
     * library default of 2 per route, which throttles a crawler hitting a single site
     * no matter how large the total limit is.
     */
    private static final int MAX_CONNECTIONS_PER_ROUTE = 20;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36";

    /** Connection pool shared by every client handed out by {@link #getHttpClient()}. */
    private static final PoolingHttpClientConnectionManager poolingHttpClientConnectionManager =
            createConnectionManager();

    /**
     * Builds the shared connection manager with a plain HTTP socket factory and a
     * trust-all, TLSv1.2-only HTTPS socket factory.
     *
     * @return the configured connection manager, never {@code null}
     * @throws IllegalStateException if the SSL context cannot be created; failing here
     *         makes class initialisation fail loudly instead of leaving a {@code null}
     *         pool that would only surface later as a NullPointerException
     */
    private static PoolingHttpClientConnectionManager createConnectionManager() {
        try {
            SSLContextBuilder sslContextBuilder = new SSLContextBuilder().loadTrustMaterial(null,
                    new TrustStrategy() {
                        @Override
                        public boolean isTrusted(X509Certificate[] x509Certificates, String s)
                                throws CertificateException {
                            // Trust every site unconditionally (see the class-level security note).
                            return true;
                        }
                    });
            // Other protocol names that could be enabled: "SSLv2Hello", "SSLv3", "TLSv1".
            SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(
                    sslContextBuilder.build(),
                    new String[]{"TLSv1.2"},
                    null,
                    NoopHostnameVerifier.INSTANCE);
            Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder
                    .<ConnectionSocketFactory>create()
                    .register(HTTP, new PlainConnectionSocketFactory())
                    .register(HTTPS, sslConnectionSocketFactory)
                    .build();
            PoolingHttpClientConnectionManager manager =
                    new PoolingHttpClientConnectionManager(socketFactoryRegistry);
            manager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
            manager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);
            return manager;
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("Unable to initialise the HTTPS connection pool", e);
        } catch (KeyStoreException e) {
            throw new IllegalStateException("Unable to initialise the HTTPS connection pool", e);
        } catch (KeyManagementException e) {
            throw new IllegalStateException("Unable to initialise the HTTPS connection pool", e);
        }
    }

    /**
     * Creates a new client backed by the shared connection pool.
     *
     * <p>Each client gets its own cookie store. Because the connection manager is marked
     * as shared, closing the returned client does not shut the pool down.
     *
     * @return a new client; the caller is responsible for closing it
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static CloseableHttpClient getHttpClient() throws Exception {
        // No setSSLSocketFactory(...) here: the builder ignores it once an explicit
        // connection manager is supplied, and the manager's registry already carries
        // the HTTPS socket factory.
        return HttpClients.custom()
                .setConnectionManager(poolingHttpClientConnectionManager)
                .setConnectionManagerShared(true)
                .setDefaultCookieStore(new BasicCookieStore())
                .setUserAgent(USER_AGENT)
                .build();
    }
}

Web Crawler(HTTPClientUtil)

package club.zstuca.httpclient;


import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;


import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Map;

/**
 * Helper for issuing HTTP/HTTPS GET and POST requests through clients obtained from
 * {@link HTTPClientPool}.
 *
 * <p>All per-request state (client, request, response) is held in local variables, so
 * concurrent calls do not interfere with each other. Every public method returns the
 * response body for a {@code 200 OK} answer and an empty string for anything else
 * (blank URL, unsupported method, non-200 status, or any failure, which is logged).
 */
public class HTTPClientUtil {
    private static final Logger logger = LoggerFactory.getLogger(HTTPClientUtil.class);

    /** Default request configuration: 5 s timeouts, redirects not followed. */
    private static final RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(5000)
            .setConnectionRequestTimeout(5000)
            .setSocketTimeout(5000)
            .setRedirectsEnabled(false)
            .build();

    /**
     * Sends a request of the given type.
     *
     * @param HttpRequestType {@code "GET"} or {@code "POST"} (case-insensitive)
     * @param url             target URL; a null/empty value yields an empty result
     * @param header          request headers, may be null or empty
     * @param params          query parameters appended to the URL, may be null or empty
     * @param httpEntity      request body, only used for POST; may be null
     * @return the response body on {@code 200 OK}, otherwise an empty string
     */
    public static String doRequest(
            String HttpRequestType,
            String url,
            Map<String, String> header,
            Map<String, String> params,
            HttpEntity httpEntity) {
        if (StringUtils.isEmpty(url)) {
            return "";
        }
        HttpRequestBase request = createRequest(HttpRequestType);
        if (request == null) {
            logger.warn("Unsupported HTTP request type: {}", HttpRequestType);
            return "";
        }
        // Only POST carries a body; checking the concrete type avoids a blind cast.
        if (request instanceof HttpPost && httpEntity != null) {
            ((HttpPost) request).setEntity(httpEntity);
        }
        return execute(request, url, header, params);
    }

    /**
     * Sends a POST request.
     *
     * @param url        target URL; a null/empty value yields an empty result
     * @param header     request headers, may be null or empty
     * @param httpEntity request body (form, JSON, XML, ...), may be null
     * @return the response body on {@code 200 OK}, otherwise an empty string
     */
    public static String doPostRequest(String url, Map<String, String> header, HttpEntity httpEntity) {
        if (StringUtils.isEmpty(url)) {
            return "";
        }
        HttpPost httpPost = new HttpPost();
        if (httpEntity != null) {
            httpPost.setEntity(httpEntity);
        }
        return execute(httpPost, url, header, null);
    }

    /**
     * Sends a GET request.
     *
     * @param url    target URL; a null/empty value yields an empty result
     * @param header request headers, may be null or empty
     * @param params query parameters appended to the URL, may be null or empty
     * @return the response body on {@code 200 OK}, otherwise an empty string
     */
    public static String doGetRequest(String url, Map<String, String> header, Map<String, String> params) {
        if (StringUtils.isEmpty(url)) {
            return "";
        }
        return execute(new HttpGet(), url, header, params);
    }

    /**
     * Creates an empty request object for the given method name.
     *
     * @param httpRequestType {@code "GET"} or {@code "POST"} (case-insensitive)
     * @return a new request, or {@code null} if the method is not supported
     */
    private static HttpRequestBase createRequest(String httpRequestType) {
        if ("GET".equalsIgnoreCase(httpRequestType)) {
            return new HttpGet();
        }
        if ("POST".equalsIgnoreCase(httpRequestType)) {
            return new HttpPost();
        }
        return null;
    }

    /**
     * Completes the request (URI, headers, config), executes it and reads the body.
     * Any failure is logged and reported to the caller as an empty string.
     */
    private static String execute(
            HttpRequestBase request,
            String url,
            Map<String, String> header,
            Map<String, String> params) {
        CloseableHttpClient httpClient = null;
        CloseableHttpResponse httpResponse = null;
        try {
            request.setURI(buildUri(url, params));
            applyHeaders(request, header);
            request.setConfig(requestConfig);

            httpClient = HTTPClientPool.getHttpClient();
            httpResponse = httpClient.execute(request);
            return readBody(httpResponse);
        } catch (Exception e) {
            // Boundary catch: HTTPClientPool.getHttpClient() declares Exception, and the
            // public contract of this class is "empty string on failure".
            logger.error("HTTP {} request to {} failed", request.getMethod(), url, e);
            return "";
        } finally {
            // Release the response (and its connection) before the client that produced it.
            closeQuietly(httpResponse);
            closeQuietly(httpClient);
        }
    }

    /**
     * Builds the request URI from the base URL plus optional query parameters.
     *
     * @throws URISyntaxException if {@code url} is not a valid URI
     */
    private static java.net.URI buildUri(String url, Map<String, String> params) throws URISyntaxException {
        URIBuilder uriBuilder = new URIBuilder(url);
        if (params != null) {
            for (Map.Entry<String, String> param : params.entrySet()) {
                uriBuilder.setParameter(param.getKey(), param.getValue());
            }
        }
        return uriBuilder.build();
    }

    /** Copies the given headers onto the request; a null or empty map is a no-op. */
    private static void applyHeaders(HttpRequestBase request, Map<String, String> header) {
        if (header == null) {
            return;
        }
        for (Map.Entry<String, String> headerEntry : header.entrySet()) {
            request.setHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }

    /**
     * Extracts the body of a {@code 200 OK} response, decoding it as UTF-8 when the
     * response does not declare a charset. For any other status the status line and
     * headers are logged, the entity is drained so the connection can be reused, and
     * an empty string is returned.
     *
     * @throws IOException if reading or draining the entity fails
     */
    private static String readBody(CloseableHttpResponse httpResponse) throws IOException {
        HttpEntity entity = httpResponse.getEntity();
        if (httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            // EntityUtils.toString rejects a null entity, so guard against body-less replies.
            return entity == null ? "" : EntityUtils.toString(entity, "UTF-8");
        }

        StringBuilder headers = new StringBuilder();
        HeaderIterator headerIterator = httpResponse.headerIterator();
        while (headerIterator.hasNext()) {
            headers.append('\t').append(headerIterator.next());
        }
        logger.warn("Unexpected response {}; headers:{}", httpResponse.getStatusLine(), headers);
        EntityUtils.consume(entity);
        return "";
    }

    /** Closes the resource if present; a failure to close is logged, never propagated. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("Failed to close HTTP resource", e);
        }
    }

}


TEST 

package clua.zstuca;

import club.zstuca.httpclient.HTTPClientUtil;

import java.util.HashMap;

/**
 * Minimal manual check for {@link HTTPClientUtil}: issues two GET requests and prints
 * whatever body comes back (an empty line means the request did not return 200 OK).
 */
public class HTTPTEST {

    public static void main(String[] args) {
        // Plain GET without headers or query parameters.
        String homePage = HTTPClientUtil.doGetRequest("http://www.baidu.com", null, null);
        System.out.println(homePage);

        // GET with one query parameter. A regular HashMap is used instead of
        // double-brace initialisation, which would create an anonymous subclass.
        HashMap<String, String> weatherParams = new HashMap<String, String>();
        weatherParams.put("id", "101060101");
        String weather = HTTPClientUtil.doGetRequest(
                "http://api.help.bj.cn/apis/weather/", null, weatherParams);
        System.out.println(weather);
    }
}

教学资源

https://www.bilibili.com/video/av68932809 

参考文章

https://blog.csdn.net/qwe86314/article/details/91450098

发布了1362 篇原创文章 · 获赞 231 · 访问量 31万+

猜你喜欢

转载自blog.csdn.net/weixin_43272781/article/details/104071242