Scraping JD.com: Smart Speakers

import json
import re
import time

import demjson
import requests
import urllib3
import xlwt
from lxml import etree

urllib3.disable_warnings()
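# demjson is a third-party package (pip install demjson); a stdlib-only
# alternative for this script's use of it is sketched after the listing.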

class spiders():
    # initialisation
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        self.data_sku = []            # data-sku ids needed for the ajax requests
        self.data_comment = []        # comment counts
        self.data_money = []          # current prices
        self.data_money_before = []   # original (pre-discount) prices
        self.data_name = []           # product names
        self.data_url = []            # product page urls

    # Fetch the html source of one page
    def get_html(self, url):
        try:
            res = requests.get(url, headers=self.headers)
            res.encoding = res.apparent_encoding
            if res.status_code == 200:
                return res.text
            else:
                time.sleep(0.1)
                return self.get_html(url)   # retry; note this recursion is unbounded on persistent errors
        except Exception as e:   # e carries the reason for the failure
            print("request failed:", e)



    def get_sku(self, html):   # parse data-sku ids, names and urls from the listing page
        r = etree.HTML(html)
        node_list = r.xpath('//ul[@class="gl-warp clearfix"]/li')
        self.data_sku.clear()   # skus are per page; the other lists accumulate across pages
        for node in node_list:
            self.data_sku.append(node.xpath('./div/@data-sku'))
            # xpath() returns a list of text fragments, so join them into one name
            self.data_name.append(''.join(node.xpath('./div/div[@class="p-name"]/a/em/text()')))
            s = str(node.xpath('./div/div[@class="p-img"]/a/@href'))
            s = s.replace("['", "").replace("']", "")   # strip the list wrapper from the href
            self.data_url.append(s)

        # flatten [['7621084'], ['6946605'], ['7357933']] into ['7621084', '6946605', '7357933']
        self.data_sku = [i[0] for i in self.data_sku]


    def parse_comment(self, html):   # parse the comment counts
        json_comment = json.loads(html)   # loads() turns the json string into a dict
        comment_list = json_comment["CommentsCount"]
        for comment in comment_list:
            self.data_comment.append(comment['CommentCountStr'])
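    # Judging from the keys used above, the summaries endpoint returns json
    # shaped roughly like {"CommentsCount": [{"CommentCountStr": "...", ...}, ...]};
    # the surrounding fields are not documented here.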

    def join_url_comment(self):   # build the comment-count url
        url_comment_start = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
        comment_sku = ','.join(self.data_sku)
        return url_comment_start + comment_sku
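    # e.g. with the skus above, the finished url ends in
    # ...referenceIds=7621084,6946605,7357933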

    def join_url_money_up(self):   # build the price url for the first batch
        url_money_start_up = 'https://p.3.cn/prices/mgets?callback=jQuery873263&ext=11000000&pin=&type=1&area=6_303_304_36864&skuIds=J_'
        money_sku_up = ','.join(self.data_sku[:30])   # the price api is queried 30 skus at a time
        return url_money_start_up + money_sku_up

    def join_url_money_down(self):   # build the price url for the remaining skus
        global num
        if num == 3:   # signal __main__ to stop: the fourth page gets no second batch
            return 0
        num += 1
        url_money_start_down = 'https://p.3.cn/prices/mgets?callback=jQuery873263&ext=11000000&pin=&type=1&area=6_303_304_36864&skuIds=J_'
        money_sku_down = ','.join(self.data_sku[30:])
        return url_money_start_down + money_sku_down

    def parse_money(self, html):   # parse the prices out of the jsonp response
        # strip the jQuery873263(...) callback wrapper
        s = re.findall(r'873263\((.*?)\)', html)[0]
        json_s = demjson.decode(s)   # decode the string into a list of dicts
        for money in json_s:
            self.data_money.append(money['p'])          # current price
            self.data_money_before.append(money['m'])   # original price

class Excel(spiders):   # storage
    def __init__(self):
        spiders.__init__(self)
        # create an xlwt workbook
        self.f = xlwt.Workbook(encoding='utf-8')
        # create a single sheet, sheet1, to write into
        self.sheet1 = self.f.add_sheet(u'sheet1', cell_overwrite_ok=True)

    def write_jd(self):
        # column 0: name, column 1: current price, column 2: comment count
        for j, name in enumerate(self.data_name):
            self.sheet1.write(j, 0, name)

        for m, money in enumerate(self.data_money):
            self.sheet1.write(m, 1, money)

        for n, comment in enumerate(self.data_comment):
            self.sheet1.write(n, 2, comment)

        self.f.save(r'jd_spider606.xls')
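    # note: xlwt writes legacy .xls workbooks (65,536-row limit); openpyxl
    # would be needed for .xlsx output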


if __name__ == '__main__':
    e = Excel()   # Excel inherits from spiders, so one instance both crawls and stores
    num = 0       # page counter used by join_url_money_down()

    li_url = ['https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&sort=sort_totalsales15_desc&trans=1&JL=3_%E5%88%86%E7%B1%BB_%E6%99%BA%E8%83%BD%E9%9F%B3%E7%AE%B1#J_crumbsBar',
              'https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=2&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main',
              'https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=3&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main',
              'https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=4&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main']
    for url in li_url:
        e.get_sku(e.get_html(url))               # collect this page's skus
        url_comment = e.join_url_comment()       # build the comment-count url
        comment_html = e.get_html(url_comment)   # fetch it
        e.parse_comment(comment_html)            # parse the comment counts

        url_money_up = e.join_url_money_up()     # build the price url, first batch
        money_html_up = e.get_html(url_money_up)
        e.parse_money(money_html_up)

        url_money_down = e.join_url_money_down() # build the price url, second batch
        if url_money_down == 0:                  # last page: no second batch, stop crawling
            break
        money_html_down = e.get_html(url_money_down)
        e.parse_money(money_html_down)

    e.write_jd()
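
A note on parse_money: the payload inside the jQuery873263(...) wrapper is plain JSON, so the standard-library json module can decode it as well; demjson is a third-party package that is awkward to install on recent Python 3 releases. A minimal sketch of that swap, using an illustrative payload in the format the code above expects (a list of objects with 'p' and 'm' keys), not real JD data:

import json
import re

html = 'jQuery873263([{"id": "J_7621084", "p": "299.00", "m": "399.00"}]);'
payload = re.findall(r'873263\((.*?)\)', html)[0]   # strip the callback wrapper
for item in json.loads(payload):
    print(item['p'], item['m'])   # current price, original price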

Reposted from www.cnblogs.com/yuanjia8888/p/9145151.html