练手之爬取某电商IPHONE信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/12/30 10:51
# @Site    : 
# @File    : jd_iphone.py
# @Software: PyCharm

import json
import time
import urllib3
import logging
import requests
from pyquery import PyQuery
from selenium import webdriver

# Silence urllib3's InsecureRequestWarning. The original author added this
# while capturing traffic through Fiddler; requests in this file are made with
# certificate verification turned off (verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Request headers imitating a desktop Chrome browser arriving from the JD
# search page.
headers = {
        "Referer": "https://search.jd.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    }
# Console logging: INFO level, with timestamp, logger name and level in each line.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Separate named loggers for the search-result crawl and the detail-page crawl.
logger_page = logging.getLogger("jd_iphone_page")
logger_detail = logging.getLogger("jd_iphone_detail")

# Highlighted search-term fragments that mark a listing as an Apple iPhone.
_IPHONE_KEYWORDS = (u'苹果', u'iphone', u'Apple', u'apple')

# Script run in the page: scrolls down in small steps every 20 ms until roughly
# 90% of the document height is reached.
_SCROLL_JS = '''
        timer = setInterval(function(){
           var scrollTop=document.documentElement.scrollTop||document.body.scrollTop;
           var ispeed=Math.floor(document.body.scrollHeight / 100);
           if(scrollTop > document.body.scrollHeight * 90 / 100){
               clearInterval(timer);
           }
           console.log('scrollTop:'+scrollTop)
           console.log('scrollHeight:'+document.body.scrollHeight)
           window.scrollTo(0, scrollTop+ispeed)
        }, 20)
        '''


def _new_headless_driver():
    """Start a Chrome WebDriver session in headless (windowless) mode."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword, which
    # newer Selenium releases no longer accept.
    return webdriver.Chrome(options=chrome_options)


def _strip_scheme(href):
    """Return *href* as a protocol-relative URL (leading ``http:``/``https:`` removed)."""
    for scheme in ('http:', 'https:'):
        if href.startswith(scheme):
            return href[len(scheme):]
    return href


def _extract_product(item):
    """Pull one product row out of a search-result ``<li>`` element.

    Returns ``[href, price, shop, tag]`` when the listing's highlighted
    keywords identify it as an iPhone and it has a link; otherwise ``None``.
    """
    data_sku = item.attr('data-sku')
    price = item.find(".J_%s" % data_sku).text()
    shop = item.find('.J_im_icon').text()
    tag = item.find("#J_pro_%s" % data_sku).text()

    anchors = item.find(".gl-i-wrap div a")
    # The <font> children hold the highlighted keyword matches; they are used
    # to drop listings that carry no iPhone/Apple keyword.
    highlighted = [font.text() for font in anchors.find('font').items()]
    if not any(word in highlighted for word in _IPHONE_KEYWORDS):
        return None

    href = anchors.attr('href')
    if not href:
        # No usable link: nothing for the detail crawl to follow.
        return None
    return [_strip_scheme(href), price, shop, tag]


def get_page_detail(maxp):
    """Collect iPhone listings from the first *maxp* JD search-result pages.

    Each result page is loaded in headless Chrome, scrolled so that more of
    the page is rendered, and parsed with PyQuery.

    :param maxp: number of result pages to crawl; values below 1 crawl nothing.
    :return: list of ``[href, price, shop, tag]`` rows, where ``href`` is a
        protocol-relative product URL.
    """
    product_list = []
    # The page parameter takes odd numbers: 1, 3, 5, ... for result pages 1, 2, 3, ...
    pages = range(1, 2 * maxp, 2)
    if not pages:
        return product_list

    p_no = 0
    # One browser session is shared by all pages and always shut down, so no
    # Chrome processes are left behind.
    driver = _new_headless_driver()
    try:
        driver.implicitly_wait(10)
        for page in pages:
            page_no = (page + 1) // 2  # integer division keeps the log free of "1.0"
            url = 'https://search.jd.com/Search?keyword=iphoneapple&page=' + str(page) + '&click=0'
            driver.get(url)
            driver.execute_script(_SCROLL_JS)
            # Give the scrolling script time to run before grabbing the page source.
            time.sleep(5)
            # Author's note: without parser="html" PyQuery could not parse this
            # page source.
            doc = PyQuery(driver.page_source, parser="html")
            logger_page.info("正在获取%s页数据......" % page_no)
            for item in doc("#J_goodsList li").items():
                row = _extract_product(item)
                if row is None:
                    continue
                product_list.append(row)
                p_no += 1
                logger_page.info('正在获取%s页,第%s个产品信息......' % (page_no, p_no))
    finally:
        driver.quit()
    return product_list

def product_detail(list):
    """Fetch each product's detail page and build one info dict per product.

    :param list: rows of ``[href, price, shop, tag]`` as returned by
        ``get_page_detail``. (The parameter name shadows the builtin; it is
        kept unchanged for compatibility with existing callers.)
    :return: list of dicts with keys ``title``, ``jd_price``, ``shop``,
        ``tag``, ``colour``, ``ram`` and ``style_buy``.
    :raises requests.RequestException: if a detail page cannot be fetched,
        including when the request times out.
    """
    product_info = []
    for no, link in enumerate(list, 1):
        href, price, shop, tag = link[0], link[1], link[2], link[3]
        # Rows normally hold protocol-relative links; only add a scheme when
        # the href does not already have one.
        if href.startswith(('http:', 'https:')):
            url = href
        else:
            url = 'http:' + href
        logger_detail.info("正在获取第%s条信息......" % no)
        # Send the module's browser-like headers and bound the wait so one
        # stalled connection cannot hang the whole crawl.
        detail_html = requests.get(url, headers=headers, timeout=10, verify=False)
        doc = PyQuery(detail_html.text, parser="html")

        product_info.append({
            "title": doc(".itemInfo-wrap div.sku-name").text(),
            "jd_price": price,
            "shop": shop,
            "tag": tag,
            "colour": doc("#choose-attr-1 div.item").text(),
            "ram": doc("#choose-attr-2 div.item").text(),
            "style_buy": doc("#choose-attr-3 div.item").text(),
        })
    return product_info


if __name__ == '__main__':
    # Crawl the first search-result page, then enrich every hit from its detail page.
    products = get_page_detail(1)
    result = product_detail(products)
    # ensure_ascii=False keeps Chinese text readable in the output. The
    # ``encoding`` keyword is not passed: Python 3's json.dumps rejects it, and
    # on Python 2 UTF-8 is already the default.
    print(json.dumps(result, ensure_ascii=False))

猜你喜欢

转载自www.cnblogs.com/East-fence/p/12129371.html
今日推荐