Scraping Dianping Data

Anyone who has scraped Dianping (大众点评) knows that much of the information on the site is obfuscated. Here are the main obstacles I ran into while crawling it:

1. Proxy IPs

2. CSS anti-scraping (SVG vector sprites)

3. Font anti-scraping via .woff files

(A handy site for inspecting font files: http://fontstore.baidu.com/static/editor/index.html)

Once you understand how these pages are put together, extracting the information is straightforward: work out which real character each obfuscated glyph stands for, then substitute the real characters back in. A sketch of the .woff half of this follows below.
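For the .woff digits, the mapping can be recovered from the font file itself. Here is a minimal sketch (not part of the original script) that lists the private-use codepoints, assuming the fonttools package is installed and the font has been saved to a hypothetical local file dianping_num.woff:

from fontTools.ttLib import TTFont

font = TTFont("dianping_num.woff")   # hypothetical file name
cmap = font["cmap"].getBestCmap()    # unicode codepoint -> glyph name
for codepoint, glyph_name in cmap.items():
    print(hex(codepoint), glyph_name)

The glyph names alone don't tell you which digit each codepoint draws; you still inspect the font once (e.g. in the Baidu editor linked above) and record the result, which is exactly what number_dict at the bottom of the script holds.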

The code isn't long either. I won't spell out every step here; plenty of people have already posted walkthroughs with screenshots.

Here is my test code:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author   : zsc
# @FILE     : 主页面.py
# @Time     : 2019/6/11 18:24
# @Software : PyCharm
import re
import requests
import pandas as pd
from lxml import etree


def index():
    # url = "https://www.dianping.com/shanghai/ch0/r812"
    # headers = {
    #     'Accept':'text/html,application/xhtml+xml,application/xmlq=0.9,image/webp,image/apng,*/*q=0.8,application/signed-exchangev=b3',
    #     'Accept-Encoding':'gzip, deflate',
    #     'Cookie':'_lxsdk_cuid=16b0299d3eec8-026ce4179cdbae-3c644d0e-1fa400-16b0299d3eec8 _lxsdk=16b0299d3eec8-026ce4179cdbae-3c644d0e-1fa400-16b0299d3eec8 _hc.v=7e54b0f1-e26f-3e9f-5ab6-2fb2ef034df1.1559116764 s_ViewType=10 cy=1 cye=shanghai _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic _lxsdk_s=16b4eeeeb16-882-d53-724%7C%7C45',
    #     'Host':'www.dianping.com',
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    #     }
    # response = requests.get(url=url, headers=headers)
    # with open("b.html", "w", encoding="utf-8") as f:
    #     f.write(response.text)
    # html= etree.HTML(response.text)
    with open("b.html", "r", encoding="utf-8") as f:
        a = f.read()
    html = etree.HTML(a)
    ul = html.xpath("//div[@id='shop-all-list']/ul")[0]
    ul_li = html.xpath("//div[@id='shop-all-list']/ul/li")  # number of <li> entries, one per shop
    number = len(ul_li)
    # print("number:", number)
    df = pd.DataFrame()
    for i in range(1, number + 1):
        title = ul.xpath("//li[{0}]//div[@class='tit']/a[1]/h4/text()".format(i))
        tuangou = ul.xpath("//li[{0}]//div[@class='si-deal d-packup']/a[2]/@title".format(i))
        youhui = [ul.xpath("string(//li[{0}]//div[@class='svr-info']/a[@class='tuan privilege'])".format(i)).strip()]
        star = ul.xpath("//li[{0}]/div[@class='txt']/div[@class='comment']/span/@title".format(i))
        dianpu = [ul.xpath("string(//li[{0}]//a[@class='shop-branch'])".format(i)).strip()]
        more = ul.xpath("//li[{}]/div[@class='svr-info']//a[contains(@class, 'J_more')]/text()".format(i))

        # Review count (the digits are obfuscated .woff glyphs; map them back):
        comment = ul.xpath("li[{}]//a[@class='review-num']/b//text()".format(i))
        comment = [number_dict.get(k, k) for k in comment]
        comment = ["".join(comment)]
        print("comment:", comment)

        # Average spend per person (same .woff digit mapping):
        people_money = ul.xpath("//ul/li[{}]/div[@class='txt']/div[@class='comment']/a[@class='mean-price']/b//text()".format(i))
        people_money = [number_dict.get(j, j) for j in people_money]
        people_money = ["".join(people_money)]
        print("people_money:", people_money)

        # Category tag (characters hidden behind SVG sprite classes):
        targets = ul.xpath("//li[{}]//div[@class='tag-addr']/a[1]/span[@class='tag']/svgmtsi/@class".format(i))
        targets = ["".join([get_font(target).get(target) for target in targets])]
        print("标签:", targets)

        # District:
        areas = ul.xpath("//li[{}]//div[@class='tag-addr']/a[2]/span[@class='tag']/svgmtsi/@class".format(i))
        areas = ["".join([get_font(area).get(area) for area in areas])]
        print("区域:", areas)

        # Address (a mix of SVG-obfuscated characters and plain text nodes):
        address = ul.xpath("//li[{}]//div[@class='tag-addr']/span[@class='addr']/svgmtsi/@class | //li[{}]//div[@class='tag-addr']/span[@class='addr']/text()".format(i, i))
        addr = ["".join(get_font(k).get(k, k) for k in address)]
        print("地址:", addr)
        
        df1 = pd.DataFrame([title, tuangou, youhui, star, dianpu, more, comment, people_money, targets, areas, addr]).T
        df1.columns = ["标题", "团购", "优惠", "星级", "店铺", "更多信息", "点评量", "人均消费", "标签", "区域", "地址"]
        # DataFrame.append was removed in pandas 2.0; concat does the same job here.
        df = pd.concat([df, df1], ignore_index=True)
    df.to_excel("大众点评3.xlsx")
    return df



def svg_parser(url):
    """
    Parse one SVG sprite: returns the string of characters in each row.
    y value: identifies which row a character belongs to.
    font-size: the pixel width of each character.
    """
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding':'gzip, deflate',
        'Host':'s3plus.meituan.net',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        }
    r=requests.get(url,headers=headers)
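    # Rough shape of the SVG being parsed (reconstructed from the regexes
    # below, so treat it as an assumption; two variants turn up):
    #   variant 1: <text x="0" y="36">污计枕眨...</text>
    #   variant 2: <textPath xlink:href="#1" textLength="672">污计枕眨...</textPath>
    #              with the row's y carried by a matching <path id="1" d="M0 36 H600"/>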
    font = re.findall(r'" y="(\d+)">(\w+)</text>', r.text, re.M)
    if not font:
        font = []
        z = re.findall(r'" textLength.*?(\w+)</textPath>', r.text, re.M)
        y = re.findall(r'id="\d+" d="\w+\s(\d+)\s\w+"', r.text, re.M)
        for a, b in zip(y, z):
            font.append((a, b))
    width = re.findall(r"font-size:(\d+)px", r.text)[0]  # pixel width of one character, e.g. 12px
    new_font = []
    for i in font:
        new_font.append((int(i[0]), i[1]))
    return new_font,int(width)
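# Example of svg_parser()'s return value (row text taken from the sample
# printed in get_rel() below; URL elided, width from the 12px font-size):
#   ([(36, '污计枕眨多础虽钞同鹅昂盘遍重酱控传船他都规姥保归梢衡载属枪句顶绝典枯贫悲程凡演乡惜礼纵患兼渐诞另关菊'), ...], 12)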


def get_rel():
    # Fetch the SVG sprite links from the CSS, then recover the characters in
    # each sprite, keyed by y value so the rows can be told apart.
    with open("b.html", "r", encoding="utf-8") as f:
        a = f.read()
    css_url = "http:" + re.search('(//.+svgtextcss.+\.css)', a).group()
    # print(css_url)
    css_res = requests.get(css_url)
    # # Save the CSS locally: it holds the background coordinates that
    # # get_background() reads from c.html, so uncomment this on the first run.
    # with open("c.html", "w", encoding="utf-8") as f:
    #     f.write(css_res.text)
    svg_url = re.findall(r'class\^="(\w+)".*?(//s3plus.*?\.svg)', css_res.text)
    # print(svg_url)
    s_parser = []
    for c, u in svg_url:
        # print("http:" + u)
        f, w = svg_parser("http:" + u)
        s_parser.append({"code": c, "font": f, "fw": w})
    # print(s_parser)  # {'code': 'xat', 'font': [(36, '污计枕眨多础虽钞同鹅昂盘遍重酱控传船他都规姥保归梢衡载属枪句顶绝典枯贫悲程凡演乡惜礼纵患兼渐诞另关菊')
    return s_parser


def get_background():
    with open("c.html", "r", encoding="utf-8") as f:
        css_cont = f.read()
    # The regex from the write-up I borrowed from:
    # css_list = re.findall('(\w+){background:.*?(\d+).*?px.*?(\d+).*?px;', '\n'.join(css_cont.split('}')))
    # css_list = [(i[0], int(i[1]), int(i[2])) for i in css_list]
    # My own version: capture the class name and the two background offsets.
    css_list = re.findall(r"\.(.+?){background:(.+?)px(.+?)px;}", css_cont)
    css_list = [(i[0], abs(int(float(i[1]))), abs(int(float(i[2])))) for i in css_list]
    css_dict = dict()
    for i in css_list:
        css_dict[i[0]] = i[1:]
    return css_dict
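# get_background() maps each CSS class to its (x, y) background offsets: a
# rule like ".xat5k{background:-84.0px -244.0px;}" (hypothetical values)
# becomes {'xat5k': (84, 244)}. get_font() below converts that pair into a
# row (first sprite row with y > 244) and a column (84 // font-size).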


def get_font(css):
    s_parser = get_rel()
    # print("字体获取值:", s_parser)
    # print(len(s_parser))
    css_dict = get_background()
    # print("css_dict:", css_dict)
    font_dict = dict()
    for i in s_parser:
        if i["code"] in css:
            # print("css", css)
            font = i.get("font")
            # print("font:", font)
            background_x, background_y = css_dict.get(css)
            # print( background_x, background_y)
            font_size = i.get("fw")
            x = background_x // font_size  # column index of the character within its row
            flag = False
            for k in font:
                # the first row whose y exceeds the background y-offset holds the character
                if k[0] > background_y and flag is False:
                    font_dict[css] = k[1][x]
                    flag = True
    return font_dict


if __name__ == '__main__':
    # Digit mapping for the .woff font: private-use codepoints -> digits.
    # These codes typically change when the site rotates the font file, so
    # they have to be re-extracted then (see the fontTools sketch above).
    number_dict = {'\uee53': '0', '\ue573': '1', '\ue3a3': '2', '\uf759': '3', '\uf831': '4', '\ue2ba': '5', '\ue96b': '6', '\ue7d4': '7', '\uf8d6': '8', '\ueb25': '9'}
    df = index()
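To make get_font()'s lookup concrete, here is the arithmetic on one made-up example (class name, offsets, and the single-row sprite are hypothetical; only the mechanism mirrors the code above):

# Hypothetical inputs shaped like get_rel()/get_background() output:
rows = [(36, "污计枕眨多础虽钞同鹅昂盘遍重酱控传船他都规姥保归梢衡载属枪句顶绝典枯贫悲程凡演乡惜礼纵患兼渐诞另关菊")]
font_size = 12                        # from "font-size:12px" in the SVG
background_x, background_y = 84, 30   # from a CSS rule like "background:-84.0px -30.0px;"

x = background_x // font_size         # 84 // 12 = 7, i.e. the 8th character in the row
row = next(chars for y, chars in rows if y > background_y)  # first row past the y offset
print(row[x])                         # -> '钞'

A real page has many rows per sprite and several sprites per page, which is why get_rel() returns a list of {"code": ..., "font": ..., "fw": ...} dicts and get_font() first matches the class prefix.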

Reposted from blog.csdn.net/chang995196962/article/details/92773580