在做爬虫的时候,如何生成一个靠谱可用的httpclient对象是非常关键的。在踩了无数的坑之后,总结出一个较为完善的httpclient生成方式。
可以解决以下问题:
1、设置代理问题
2、设置默认的cookiestore对象,用来保存请求中的cookie。以便进行深层次访问。
3、请求失败时的重试策略问题
4、默认useragent的问题
5、https及自签名证书的验证问题
/** * 新建一个通用httpclientbuider * 使用代理时,必须一起传入host对象。 * 不传入host对象的时候,代理不会生效 */ public static HttpClientBuilder getInstanceClientBuilder(boolean isNeedProxy, CookieStore store, HttpHost host, HttpRequestRetryHandler handler, String userAgent) { org.apache.http.ssl.SSLContextBuilder context_b = SSLContextBuilder.create(); SSLContext ssl_context = null; try { context_b.loadTrustMaterial(null, (x509Certificates, s) -> true); //信任所有证书,解决https证书问题 ssl_context = context_b.build(); } catch (Exception e) { e.printStackTrace(); } ConnectionSocketFactory sslSocketFactory = null; Registry<ConnectionSocketFactory> registry = null; if (ssl_context != null) { sslSocketFactory = new SSLConnectionSocketFactory(ssl_context, new String[]{"TLSv1", "TLSv1.1", "TLSv1.2"}, null, (s, sslSession) -> true); //应用多种tls协议,解决偶尔握手中断问题 registry = RegistryBuilder.<ConnectionSocketFactory>create().register("https", sslSocketFactory).register("http", new PlainConnectionSocketFactory()).build(); } PoolingHttpClientConnectionManager manager = null; if (registry != null) { manager = new PoolingHttpClientConnectionManager(registry); } else { manager = new PoolingHttpClientConnectionManager(); } manager.setMaxTotal(150); manager.setDefaultMaxPerRoute(200); HttpClientBuilder builder = HttpClients.custom().setRetryHandler(handler) .setConnectionTimeToLive(6000, TimeUnit.SECONDS) .setUserAgent(userAgent); if (store != null) { builder.setDefaultCookieStore(store); } if (isNeedProxy && host != null) { // HttpHost proxy = new HttpHost("127.0.0.1", 1080);// 代理ip DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(host); builder = builder.setRoutePlanner(routePlanner); } builder.setConnectionManager(manager);//httpclient连接池 builder.setRedirectStrategy(new AllowAllRedirectStrategy());//默认重定向所有302和307,否则httpclient只自动处理get请求导致的302和307 return builder; }