【Java爬虫】Jsoup基本使用

直接上代码：下面的示例先用 HttpClient 抓取博客园首页的 HTML，再用 Jsoup 解析，并通过 CSS 选择器提取文章标题及其链接。

/**
 * Minimal Jsoup demo: downloads a page with Apache HttpClient, parses the
 * HTML with Jsoup, and prints the text and link of every element matched by
 * a CSS selector.
 */
public class JsoupDemo {

    /**
     * Fetches {@code http://www.cnblogs.com}, prints the HTTP status code,
     * then prints the inner HTML and {@code href} of each matched anchor.
     *
     * @param args unused
     * @throws IOException if the request fails or the response body cannot be read
     */
    public static void main(String[] args) throws IOException {
        String url = "http://www.cnblogs.com";
        HttpGet httpGet = new HttpGet(url);
        // Send a browser-like User-Agent header with the request.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36");

        // try-with-resources closes the response first and then the client,
        // even when reading or parsing throws (the original leaked both on error).
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            // Read the response entity as a UTF-8 string.
            HttpEntity entity = response.getEntity();
            String content = EntityUtils.toString(entity, "utf-8");
            System.out.println("status:" + response.getStatusLine().getStatusCode());
            //System.out.println(content);

            Document dom = Jsoup.parse(content);
            /* Alternative: look elements up by tag name instead of a selector.
            Elements title = dom.getElementsByTag("title");
            for (Element t : title) {
                System.out.println(t.text());
            }*/

            // Find every post title link with a CSS selector.
            Elements links = dom.select("#post_list .post_item .post_item_body h3 a");
            for (Element link : links) {
                System.out.println(link.html());
                System.out.println(link.attr("href")); // value of the href attribute
            }

            // Example selector only: images whose src ends with ".png".
            // The result is intentionally discarded, as in the original demo.
            dom.select("img[src$=.png]");
        }
    }
}
发布了130 篇原创文章 · 获赞 151 · 访问量 19万+

猜你喜欢

转载自blog.csdn.net/haohulala/article/details/104866723