网络爬虫并设置代理

通过 HttpClient 来模拟登录与设置代理

  • 通过HttpClient模拟浏览器请求
  • 通过jsoup来解析页面
  • 通过代理隐藏自己IP
  • 去大型免费代理平台去拉取代理

需要导入的依赖

<!-- HttpClient :: 发起请求 -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.10</version>
</dependency>

<!-- Httpmime :: 请求时间 -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpmime</artifactId>
    <version>4.5.10</version>
</dependency>

<!-- jsoup :: 解析html内容 -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>

<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.3.2</version>
</dependency>

使用HttpClient模拟请求并设置代理访问

public class HttpProxyClientUtils {

    // 通过访问这个网站获取自己请求的IP地址
    public static final String TEST_PROXY_URL = "http://pv.sohu.com/cityjson?ie=utf-8";

    // 设置(请求,连接,传输)超时
    public static final Integer SOCKET_TIMEOUT = 1000 * 5;
    public static final Integer CONNECT_TIMEOUT = SOCKET_TIMEOUT;
    public static final Integer CONNECTION_REQUEST_TIMEOUT = SOCKET_TIMEOUT;

    // HttpClient 客户端
    private static CloseableHttpClient httpClient = HttpClients.createDefault();
    // 代理服务器
    private HttpHost proxy;
    // 保存的 Cookie
    private String cookies = "";

    /**
     * 模拟表单提交来进行登录
     */
    public String verifyLogin(String url, String username, String password) {
        log.debug("验证登录({}), username: {}, password: {}", url, username, password);

        String result = "";
        HttpPost httpPost = new HttpPost(url);
        // 请求时携带 Cookie
        httpPost.setHeader("Cookie", "PHPSESSID=" + getPHPSSID() + ";e=0|0");
        List<NameValuePair> nvps = Arrays.asList(
                new BasicNameValuePair("username", username),
                new BasicNameValuePair("password", password)
        );

        HttpEntity reqEntity = new UrlEncodedFormEntity(nvps, Consts.UTF_8);
        httpPost.setEntity(reqEntity);
       
        // 设置代理与请求超时时间
        httpPost.setConfig(RequestConfig.custom().setProxy(proxy).setSocketTimeout(SOCKET_TIMEOUT).setConnectTimeout(CONNECT_TIMEOUT).setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT).setStaleConnectionCheckEnabled(true).build());

        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            HttpEntity respEntity = response.getEntity();
            result = EntityUtils.toString(respEntity, Charset.forName("utf-8"));
            EntityUtils.consume(respEntity);
        } catch (org.apache.http.NoHttpResponseException n) {
            log.error("代理不能使用了::{}", n);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    public String getData(String url) {
        log.debug("GetData::{}", url);
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(RequestConfig.custom().setProxy(proxy).setSocketTimeout(SOCKET_TIMEOUT).setConnectTimeout(CONNECT_TIMEOUT).setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT).setStaleConnectionCheckEnabled(true).build());
        
        httpGet.setHeader("Cookie", "cookie1=val;cookie2=val2");

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode > 299 || statusCode < 200) {
                throw new IOException("statusCode error : " + statusCode);
            }
            return entityToString(response.getEntity());
        } catch (org.apache.http.NoHttpResponseException n) {
            log.error("代理不能使用了::{}", n);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    public static String entityToString(HttpEntity entity) throws IOException {
        StringBuilder stringBuilder = new StringBuilder();
        BufferedReader br = new BufferedReader(new InputStreamReader(entity.getContent()));
        String line = null;
        while ((line = br.readLine()) != null) {
            stringBuilder.append(line);
        }
        EntityUtils.consume(entity);
        return stringBuilder.toString();
    }
    
}

去免费代理网站中获取代理

网站 正则表达式
http://www.ip3366.net/free/?stype=1&page=2 <td>(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>
http://www.iphai.com/free/ng <td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>\s+<td>\s+(\d+)\s+</td>
http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip (\d+\.\d+\.\d+\.\d+):(\d+)<br/>
http://www.xicidaili.com/nn/1 <td>(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>
发布了49 篇原创文章 · 获赞 39 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_42920045/article/details/102969854