Python自动化采集京东商品信息,你要不要了解一下?

这里写图片描述
本篇文章爬取内容有:商品内容【价格、名称】、商品评价、商品卖家等存入Excel里。

**环境:**Windows+Python3.6
IDE:根据个人喜欢 自行选择
模块

import requests
from bs4 import BeautifulSoup
import os #os模块包含普遍的操作系统功能
import csv
import re
import json
import time

以上模块都可以pip安装或者自带
这里写图片描述

具体步骤

  1. 爬取页面链接
  2. 查找每件商品的页面链接
  3. 链接单页面
  4. 爬取商品价格
  5. 爬取商品评论
  6. 爬取商品名称
  7. 爬取卖家
  8. 保存到Excel(xlsx)
#抓取京东商品详情页数据
import  requests
from bs4 import  BeautifulSoup
import openpyxl
import  time
import  re
import json
#搜索商品列表页的每个商品的链接
#Python学习交流群:125240963,群内每天分享干货,包括最新的python企业案例学习资料和零基础入门教程,欢迎各位小伙伴入群学习交流
def make_a_link(keyword, page):
    """Fetch one JD search-results page and return the product <li> tags.

    Args:
        keyword: the search term entered by the user.
        page: 1-based logical page number; JD's URL parameter counts
              half-pages, hence ``page * 2 - 1``.

    Returns:
        A list of BeautifulSoup Tag objects, one per product on the page.

    Raises:
        requests.HTTPError: if the search page responds with an error status.
    """
    url = ('https://search.jd.com/Search?keyword=' + keyword
           + '&enc=utf-8&page=' + str(page * 2 - 1))
    res = requests.get(url)
    res.raise_for_status()
    # Re-decode with the detected charset to avoid mojibake in product names.
    res.encoding = res.apparent_encoding
    print('正在爬取第' + str(page) + '页:' + url)
    soup = BeautifulSoup(res.text, 'lxml')
    # Each product on the listing page is a <li class="gl-item"> element.
    # find_all already returns an iterable; the original wrapped it in a
    # redundant generator expression for no benefit.
    return soup.find_all('li', class_='gl-item')
#详情页
def detail_link(purl):
    """Download *purl* and return the response body, or '' on any failure.

    Args:
        purl: absolute URL of a JD API/detail endpoint.

    Returns:
        The response text decoded as GBK, or '' if the request failed.
    """
    try:
        r = requests.get(purl)
        # BUG FIX: the original wrote ``r.raise_for_status`` without calling
        # it, so HTTP errors were silently ignored.
        r.raise_for_status()
        # These JD endpoints serve GBK-encoded payloads.
        r.encoding = 'gbk'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP failures are
        # expected here; anything else should surface as a real error.
        print('此页无法链接!!!')
        return ''
#商品的名字和价格
def get_name_price(uid, row, sheet):
    """Write a product's name (column 2) and price (column 3) into *sheet*.

    Args:
        uid: JD product SKU id (string).
        row: 1-based worksheet row to write into.
        sheet: an openpyxl worksheet.

    Returns:
        '' when the response is empty or malformed (cells left blank),
        otherwise None. Best-effort by design.
    """
    content = detail_link('https://c.3.cn/recommend?&methods=accessories&sku='
                          + uid + '&cat=9987%2C653%2C655')
    try:
        data = json.loads(content)['accessories']['data']
        # Use the public .value attribute; the original poked the private
        # ._value, which bypasses openpyxl's assignment logic.
        sheet.cell(row=row, column=2).value = data['wName']
        sheet.cell(row=row, column=3).value = data['wMaprice']
        print(data['wName'])
    except (ValueError, KeyError, TypeError):
        # Empty body from detail_link or an unexpected JSON shape — skip.
        return ""
#店铺
def get_shop(uid, row, sheet):
    """Write the seller name into column 4 of *sheet* (best effort).

    Args:
        uid: JD product SKU id (string).
        row: 1-based worksheet row to write into.
        sheet: an openpyxl worksheet.

    Returns:
        '' when the response is empty/malformed or lacks a seller field;
        otherwise None.
    """
    content = detail_link('https://chat1.jd.com/api/checkChat?pid=' + uid
                          + '&returnCharset=utf-8')
    try:
        # The endpoint returns JSONP shaped like ``null({...});`` — strip
        # the wrapper before parsing.
        jd = json.loads(content.lstrip('null(').rstrip(');'))
        # Public .value instead of the private ._value attribute.
        sheet.cell(row=row, column=4).value = jd['seller']
    except (ValueError, KeyError, TypeError):
        # The original swallowed everything via bare excepts (one of them a
        # do-nothing string expression); keep the best-effort behavior but
        # catch only the expected parse/lookup failures.
        return ''
#商品的评论
def get_comments(uid, row, sheet):
    """Write comment statistics into columns 6-8 of *sheet* (best effort).

    Column 6: total comment count, column 7: good-review count,
    column 8: good-review rate.

    Args:
        uid: JD product SKU id (string).
        row: 1-based worksheet row to write into.
        sheet: an openpyxl worksheet.
    """
    content = detail_link('https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + uid)
    try:
        stats = json.loads(content)['CommentsCount'][0]
        # Public .value instead of the private ._value attribute.
        sheet.cell(row=row, column=6).value = stats['CommentCountStr']  # 总评
        sheet.cell(row=row, column=7).value = stats['GoodCountStr']     # 好评
        sheet.cell(row=row, column=8).value = stats['GoodRate']         # 好评率
    except (ValueError, KeyError, IndexError, TypeError):
        # BUG FIX: the original had no error handling here, so a single bad
        # response (detail_link returns '' on failure) crashed the whole
        # crawl. Skip the row instead, consistent with the sibling helpers.
        return ''
def main():
    """Drive the crawl: prompt for keyword/page count, scrape, save to .xlsx.

    Creates a workbook, writes a header row, then for each listing page
    records per-product id, name, price, shop, link, and comment stats.
    """
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = "京东抓取商品数据.xlsx"
    # Header row. NOTE: column 8 holds the good-review rate (GoodRate), so
    # the label is '好评率' — the original mislabelled it '评论率'.
    headers = ['商品ID', '商品名称', '价格', '店铺', '链接', '评论数', '好评数', '好评率']
    for col, title in enumerate(headers, start=1):
        sheet.cell(row=1, column=col).value = title
    row = 2
    keyword = input("请输入要抓取的商品:")
    pages = int(input("要抓取的页数:"))  # input() returns str; convert once
    starttime = time.time()
    for page in range(1, pages + 1):
        for item in make_a_link(keyword, page):
            # JD stores the product SKU id on the <li data-sku="..."> tag.
            uid = item['data-sku']
            sheet.cell(row=row, column=1).value = uid
            purl = item.find('div', class_='p-name p-name-type-2').a['href']
            # Listing links are often protocol-relative ("//item.jd.com/...").
            if 'http' not in purl:
                purl = 'http:' + purl
            sheet.cell(row=row, column=5).value = purl
            get_name_price(uid, row, sheet)
            get_shop(uid, row, sheet)
            get_comments(uid, row, sheet)
            row += 1
    wb.save('京东抓取' + keyword + '数据.xlsx')
    print('耗时{}秒。'.format(time.time() - starttime))  # total crawl time


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/qq_40925239/article/details/80601342