Java使用HttpClient爬取数据

1. 建立 HTTP 连接并返回 HTML 页面:

 /**
  * Sends an HTTP request to {@code url} and returns the response body as a string.
  *
  * <p>A new client is built for every call; both the client and the response are closed
  * before the method returns.
  *
  * @param url    address to request; parsed with {@code URIBuilder}
  * @param method {@code "POST"}, {@code "PUT"} or {@code "DELETE"} (exact, case-sensitive match);
  *               any other value, including {@code null}, results in a GET request
  * @return the body decoded as UTF-8 when the status code is 200; otherwise an empty string.
  *         An empty string is also returned when the URL is malformed or an I/O error occurs
  *         (the stack trace is printed and nothing is rethrown).
  */
 public static String doRequest(String url, String method) {
        String html = "";      // response body; stays empty on any failure
        // The client is a closeable resource as well: close it on every path, not just the response.
        try (CloseableHttpClient client = HttpClientBuilder.create().build()) {
            URIBuilder uriBuilder = new URIBuilder(url);
            HttpUriRequest request;
            // Switching on a null String throws NullPointerException, so null falls back to GET.
            switch (method == null ? "GET" : method) {
                case "POST":
                    request = new HttpPost(uriBuilder.build());
                    break;
                case "PUT":
                    request = new HttpPut(uriBuilder.build());
                    break;
                case "DELETE":
                    request = new HttpDelete(uriBuilder.build());
                    break;
                default:
                    request = new HttpGet(uriBuilder.build());
                    break;
            }
            try (CloseableHttpResponse response = client.execute(request)) {
                boolean ok = response.getStatusLine().getStatusCode() == HttpStatus.SC_OK;
                // Only a 200 response that actually carries a body is decoded.
                if (ok && response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "utf-8");
                }
            }
        } catch (URISyntaxException | IOException e) {
            e.printStackTrace();
        }
        return html;
    }

2.解析页面获取想要的数据:

 /**
  * Parses a listing page and appends one {@code Company} per matched entry to
  * {@code companyList}.
  *
  * <p>Entries are the {@code div[class=pr0]} elements inside {@code div[class=leftbox]}.
  * The name is the text of the first link under the entry's {@code pr2} element and the
  * address is the text of its {@code pr4} element. An entry without such a link is still
  * recorded, with an empty name.
  *
  * @param html page markup; {@code null} or empty input adds nothing
  */
    public static void analysisHtml(String html){
        if (html == null || html.isEmpty()) {
            return;
        }
        // Step 1: parse the markup into a Document.
        Document document = Jsoup.parse(html);
        // Step 2: select the elements that hold one company entry each.
        Elements elements = document.select("div[class=leftbox]").select("div[class=pr0]");
        for (Element e : elements) {
            // first() yields null when nothing matched; guard it so one odd entry
            // does not abort the whole page with a NullPointerException.
            Element nameLink = e.getElementsByClass("pr2").select("ul").select("li").select("a").first();
            String name = nameLink == null ? "" : nameLink.text();
            String address = e.getElementsByClass("pr4").text();
            companyList.add(new Company(name, address));
        }
    }

3. 通过入口方法启动爬取:

    /**
     * Crawls listing pages 1 through 10: each page is fetched with a GET request via
     * {@code doRequest} and handed to {@code analysisHtml}. Progress is printed per page,
     * and the accumulated list is printed once at the end.
     *
     * @return {@code companyList}, the same list that {@code analysisHtml} appends to
     */
    public static List<Company> start() {
        final String baseUrl = "http://www.chinawj.com.cn/qiye/wujinjidian/c1_1_0_";
        int page = 1;
        while (page <= 10) {
            System.out.println("开始爬取数据[页码:" + page + "]");
            String pageHtml = doRequest(baseUrl + page + ".html", "GET");
            analysisHtml(pageHtml);
            page++;
        }
        System.out.println(companyList);
        return companyList;
    }

猜你喜欢

转载自www.cnblogs.com/yanghe123/p/11956730.html