Web Crawling: Filtering Elements

It's the 618 shopping festival and my phone screen has cracked again. Being short on money, I decided on a whim to check the prices and store information for the iPhone 11 on JD.com, which was also a good excuse to try out some web crawling techniques.

The main techniques used this time are:
1. Jsoup for parsing an HTML string into a document
2. HttpClient for simulating client (browser) access
3. Getting the HTML text content of an element: element.html();
4. Element lookup, done here with two methods that represent the two broad styles of querying: select(), the general-purpose CSS selector method, and getElementsByTag(), one of the narrower getElementsByXxx() helpers (see the sketch after this list)
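
To make the difference between the two lookup styles concrete, here is a minimal, self-contained Jsoup sketch; the HTML fragment and class name are made up purely for illustration:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupSelectDemo {
    public static void main(String[] args) {
        // A tiny HTML fragment shaped like a JD search result item
        String html = "<div class=\"p-price\"><i>4999</i></div>"
                + "<div class=\"p-shop\"><a href=\"#\">Apple京东自营旗舰店</a></div>";
        Document doc = Jsoup.parse(html);

        // Style 1: select() accepts any CSS selector (class, id, tag, attribute, ...)
        Elements priceDivs = doc.select(".p-price");

        // Style 2: the getElementsByXxx() helpers, here lookup by tag name
        for (Element div : priceDivs) {
            Elements italics = div.getElementsByTag("i");
            System.out.println(italics.html()); // prints 4999
        }
    }
}

select() is the "universal" option because one selector string can express class, tag, id, and attribute conditions at once; the getElementsByXxx() methods each cover a single criterion but read a little more plainly.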

package crawler;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;

/**
 * @author 谢磊
 * @description HttpClientDemo05
 * @date 2020/6/18 18:46
 * Crawls the 618-sale price and store name for the iPhone 11 on JD, fetching the HTML with HttpClient
 */
public class HttpClientDemo05 {
    /**
     * Create the client and execute the GET request
     *
     * @param url the page to request
     * @return the HTTP response
     */
    public static HttpResponse createClient(String url) {
        // Create the client
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the GET request instance
        HttpGet httpGet = new HttpGet(url);
        // Add headers to mimic a real browser
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate, br");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/83.0.4103.97 Safari/537.36 Edg/83.0.478.45");
        CloseableHttpResponse closeableHttpResponse = null;
        try {
            // Execute the GET request and return the response
            closeableHttpResponse = httpClient.execute(httpGet);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return closeableHttpResponse;
    }

    public static void main(String[] args) throws Exception {
        // Create a connection and get the response
        HttpResponse response = createClient("https://search.jd.com/Search?keyword=iphone%2011&enc=utf-8&suggest=3.def.0.V00--38s0&wq=iphone&pvid=3a95b0fa309f40c2aa0e8f7fb55ab17c");

        String entity = EntityUtils.toString(response.getEntity(), "utf-8");
        // Parse the HTML string into a Document with Jsoup
        Document document = Jsoup.parse(entity);
        // Use select() with a class selector to collect the prices into a list
        Elements prices = document.select(".p-price");
        ArrayList<String> priceList = new ArrayList<>();
        for (Element price : prices) {
            Elements i = price.getElementsByTag("i");
            priceList.add(i.html());
        }
        // Same idea for the store information
        Elements shops = document.select(".p-shop");
        ArrayList<String> shopList = new ArrayList<>();
        for (Element shop : shops) {
            Elements i = shop.getElementsByTag("a");
            shopList.add(i.html());
        }

        // Pair stores with prices; use the shorter size in case the two lists differ
        int count = Math.min(priceList.size(), shopList.size());
        for (int i = 0; i < count; i++) {
            System.out.println(shopList.get(i) + " price: " + priceList.get(i));
        }
    }
}
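
One thing this demo glosses over is resource cleanup: neither the CloseableHttpClient nor the CloseableHttpResponse is ever closed, so the underlying connection is leaked on every call. A minimal sketch of how the fetch could be wrapped in try-with-resources instead (the method and class names here are illustrative, not part of the original code):

import java.io.IOException;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class FetchHtmlDemo {
    // Fetch the page body as a string, closing both the client and the response
    public static String fetchHtml(String url) throws IOException {
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(url);
            httpGet.setHeader("User-Agent", "Mozilla/5.0");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Read the body before the response is closed by try-with-resources
                return EntityUtils.toString(response.getEntity(), "utf-8");
            }
        }
    }
}

Returning the body string directly also avoids handing a response object out of the method after the client that produced it has gone out of scope.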

Reposted from blog.csdn.net/layAlex/article/details/106843094