Scraping Dianping (大众点评) Data with the pyspider Framework

Requirements

Crawl shop information for food merchants in every city nationwide.

Code

Do not run this without IP proxies.
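The crawl_config in the script below leaves 'proxy' empty. pyspider accepts a proxy in username:password@hostname:port form (HTTP proxies only), so with a proxy pool in hand it plugs in like this (a minimal sketch; the address and credentials are placeholders):

    crawl_config = {
        'proxy': 'user:pass@proxy.example.com:8080',  # placeholder proxy
        'retries': 6,
    }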

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-29 14:37:07
# Project: dianping_vi

from pyspider.libs.base_handler import *
import datetime
import re
import json
import copy

from pymongo import MongoClient

# Remote database connection (host/port removed from the original post)
#DB_IP =
#DB_PORT =

# Local fallback so the script runs as-is
DB_IP = '127.0.0.1'
DB_PORT = 27017

client = MongoClient(host=DB_IP, port=DB_PORT)

# The admin database holds the account: connect, authenticate, then switch databases.
# (Credentials removed from the original post. Database.authenticate() was removed
# in PyMongo 4; pass username/password to MongoClient instead on newer versions.)
db_auth = client.admin
#db_auth.authenticate(username, password)

DB_NAME = 'research'
DB_COL = 'dianping'
db = client[DB_NAME]
col = db[DB_COL]



# Desktop-browser headers for the review_all detail pages (the rest of the crawl
# uses the mobile User-Agent set in crawl_config).
detail_headers = {
    'Host': 'www.dianping.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    # Note: these Cookie values are session-bound and expire; capture fresh ones
    # from a browser before running.
    'Cookie': 'hc.v=c433e5ea-ff94-9d82-2544-871b013c64eb.1536116212; _lxsdk_cuid=165a7a93ffcc8-0885d455e400d4-3b7b0d58-1aeaa0-165a7a93ffcc8; _lxsdk=165a7a93ffcc8-0885d455e400d4-3b7b0d58-1aeaa0-165a7a93ffcc8; _lxsdk_s=165a8c7b269-b24-ec-e63%7C%7C135',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
}

# Decode an obfuscated score. num_map (an anti-scraping glyph -> digit-string map)
# is assumed to be defined elsewhere; it is not shown in the original post, and
# this helper is never called in this listing.
def parse_score(taste, doc):
    if len(taste) == 2:
        taste_score = [num_map.get(x, '0') for x in taste]
        taste = float('.'.join(taste_score))
    elif len(taste) == 1:
        taste_score = num_map.get(taste[0], '0')
        _flag = doc.xpath('//span[@id="comment_score"]/span[1]/text()')[1]
        if _flag.startswith('1'):
            taste = float('1.' + taste_score)
        else:
            # the original appended '1.' here, yielding e.g. 41.0; '.1' is the likely intent
            taste = float(taste_score + '.1')
    else:
        taste = 0
    return taste
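
# A hypothetical sketch of the missing num_map: it maps the glyphs of Dianping's
# anti-scraping web font back to digit strings. The codepoints below are made up
# for illustration, and the map has to be rebuilt whenever the site rotates the font:
#
#   num_map = {
#       u'\ue1a4': '1',
#       u'\ue5b7': '2',
#       # ...one entry per obfuscated digit glyph
#   }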



# Midnight of the current day; used as the per-day dedupe key in on_result().
def get_today():
    return datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')


class Handler(BaseHandler):
    crawl_config = {
        'proxy': '',  # fill in username:password@hostname:port (see note above)
        "headers": {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        },
        'retries': 6
    }

    @every(minutes=24 * 60)
    def on_start(self):
        url = 'https://m.dianping.com/citylist'
        
        self.crawl(url, callback=self.get_city_id)
        
        
    @config(age=60)
    def get_city_id(self, response):
        # URL template for each city's food channel
        url = 'https://m.dianping.com/{}/ch10/d1?from=m_nav_1_meishi'
        
        result = re.findall(r'window\.PAGE_INITIAL_STATE = ({.*})', response.text)[0]
        print(result)
        
        city_data = json.loads(result)
        # hot cities first, then the full A-Z city list below
        for each in city_data['hotcity']['data']['hotCity']:
            item = {
                'city_name': each['cityName'],
                'city_id': each['cityId'],
                'city_pyname': each['cityEnName'],
            }
            city_pyname = each['cityEnName']

            print(each['cityName'])

            self.crawl(url.format(city_pyname), validate_cert=False, callback=self.get_area_food, save=item)

        for each in city_data['list']['data']['cityData']:
            for i in each['list']:
                item = {
                    'city_name': i['cityName'],
                    'city_id': i['cityId'],
                    'city_pyname': i['cityEnName'],
                }
                city_pyname = i['cityEnName']

                print(i['cityName'], '-----', city_pyname)
                self.crawl(url.format(city_pyname), validate_cert=False, callback=self.get_area_food, save=item)

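    # Both get_city_id above and get_area_food below pull the JSON blob Dianping
    # embeds as window.PAGE_INITIAL_STATE. A slightly more defensive version of
    # that extraction (a sketch) would be:
    #
    #   m = re.search(r'window\.PAGE_INITIAL_STATE\s*=\s*({.*})', response.text)
    #   data = json.loads(m.group(1)) if m else {}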
                
    @config(age=60)
    def get_area_food(self, response):
        result = re.findall(r'window\.PAGE_INITIAL_STATE = ({.*})', response.text)[0]
        item = response.save
        print(item)
        city_pyname = item.pop('city_pyname')

        data = json.loads(result)
        # top-level regions (parentId == 0) that actually contain shops
        areas = data['mapiSearch']['data']['regionNavs']
        areas = [x for x in areas if x['parentId'] == 0 and x['count'] > 0]
        # sample region: {u'count': 9828, u'name': u'\u897f\u57ce\u533a', u'regionType': 0, u'parentId': 0, u'lat': 0, u'lng': 0, u'id': 16}

        # food subcategories under the food channel (id 10), excluding the channel itself
        foods = data['mapiSearch']['data']['categoryNavs']
        foods = [x for x in foods if x['parentId'] == 10 and x["id"] != 10]
        

        city_id = item.pop('city_id')
        for area in areas:
            for food in foods:
                _item = copy.deepcopy(item)
                _item['region'] = area['name']
                _item['category'] = food['name']
                category_id = food['id']
                region_id = area['id']
                print(area['name'], food['name'], category_id, region_id)
                _url = 'http://www.dianping.com/{}/ch10/g{}r{}p1'.format(city_pyname, category_id, region_id)
                print(_url)
                self.crawl(_url, callback=self.get_next_page, save={'item': _item, 'url': _url})  # the original passed item here, dropping the region/category just set on _item

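    # List-page URLs follow /{city}/ch10/g{category}r{region}p{page}: ch10 is the
    # food channel, g<n> a category id, r<n> a region id, and p<n> the page number,
    # e.g. http://www.dianping.com/beijing/ch10/g110r16p1 (illustrative ids).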

    @config(age=60)
    def get_next_page(self, response):
        _item = response.save['item']
        doc = response.etree
        
        ## Pagination: once this branch stops firing, paging for this list is done
        if response.save.get('url'):
            url = response.save['url']
            pages = doc.xpath('//a[@class="PageLink"]/text()')
            # the original used 'or' here, which raises IndexError when pages is empty
            if len(pages) > 0 and pages[-1] != '1':
                max_page = int(pages[-1])
                for each in range(2, max_page + 1):
                    _url = url.replace('p1', 'p{}'.format(each))
                    self.crawl(_url, callback=self.get_next_page, save={'item': _item})
                
        shops = doc.xpath('//div[@id="shop-all-list"]/ul/li')
        for shop in shops:
            item = copy.deepcopy(_item)
            shop_id = shop.xpath('.//div[@class="tit"]/a[1]/@data-shopid')[0]
            item['shop_id'] = shop_id  # shop id
            
            name = shop.xpath('.//div[@class="tit"]/a[1]/@title')[0]
            item['name'] = name  # shop name
            
            is_ad = shop.xpath('.//div[@class="tit"]/a[2]/text()')
            item['ad_shop'] = len(is_ad) > 0 and is_ad[0] == '广告'  # promoted ("广告") listing?
            
            addr = shop.xpath('.//span[@class="addr"]/text()')
            item['addr'] = addr[0] if len(addr) > 0 else ''  # address
            
            region_tag = shop.xpath('.//div[@class="tag-addr"]/a[2]/span[@class="tag"]/text()')
            item['area'] = region_tag[0] if len(region_tag) > 0 else ''  # fine-grained area name
            
            review = shop.xpath('.//div[@class="comment"]/a[1]/b/text()')
            item['review_count'] = int(review[0]) if len(review) > 0 else 0  # number of reviews
            
            price = shop.xpath('.//div[@class="comment"]/a[2]/b/text()')
            item['price_text'] = int(price[0].replace('¥', '')) if len(price) > 0 else 0  # average price (yuan)
            
            scores = shop.xpath('.//span[@class="comment-list"]/span/b/text()')
            if len(scores) == 3:
                taste, surrounding, service = scores
                item['taste'] = float(taste)
                item['surrounding'] = float(surrounding)
                item['service'] = float(service)
            else:
                item['taste'] = 0
                item['surrounding'] = 0
                item['service'] = 0
                
            star = shop.xpath('.//div[@class="comment"]/span/@class')[0]
            # class looks like "sml-rank-stars sml-str40" -> 40 -> 4.0 stars
            item['star'] = float(star.replace('sml-rank-stars sml-str', '')) / 10
            
            print(item)

            _url = "http://www.dianping.com/shop/{}/review_all".format(item['shop_id'])

            self.crawl(_url, callback=self.get_detail_page, save={'item': item}, headers=detail_headers)
            
    @config(age=60)
    def get_detail_page(self, response):
       
        item = response.save['item']
        doc = response.etree

        tags_list = doc.xpath("//div[@class='reviews-tags']/div[@class='content']/span")
        
        review_tags = []
        for each in tags_list:
            review_tags.append(''.join([i.strip() for i in each.xpath("./a/text()")[0].split('\n')]))
           
        item["review_tags"] = review_tags    
        print(review_tags)
        
        # Review-filter counts. The labels stay in Chinese because they are stored
        # verbatim: 图片 = with photos, 好评 = positive, 中评 = neutral, 差评 = negative.
        if doc.xpath("//label[@class='filter-item filter-pic']"):
            item["pic"] = u'图片' + doc.xpath("//label[@class='filter-item filter-pic']/span[@class='count']/text()")[0]
        else:
            item["pic"] = ''
        
        print(item["pic"])
        if doc.xpath("//label[@class='filter-item filter-good']"):
            item["good"] = u'好评' + doc.xpath("//label[@class='filter-item filter-good']/span[@class='count']/text()")[0]
        else:
            item["good"] = ''
          
        print(item["good"])
        if doc.xpath("//label[@class='filter-item filter-middle']"):
            item["middle"] = u'中评' + doc.xpath("//label[@class='filter-item filter-middle']/span[@class='count']/text()")[0]
        else:
            item["middle"] = ''
        
        print(item["middle"])
        if doc.xpath("//label[@class='filter-item filter-bad']"):
            item["bad"] = u'差评' + doc.xpath("//label[@class='filter-item filter-bad']/span[@class='count']/text()")[0]
        else:
            item["bad"] = ''
        
        print(item["bad"])
        
       
        item["date"] = get_today()
        item["update_time"] = datetime.datetime.now()

        print(item)

        yield item
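        # pyspider routes each yielded result to on_result() below, which upserts it into MongoDB.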

 
                
    def on_result(self, result):
        super(Handler, self).on_result(result)
        if not result:
            return

        # Upsert keyed on (name, shop_id, date), so each shop is stored once per day.
        update_key = {
            'name': result['name'],
            'shop_id': result['shop_id'],
            'date': result["date"]
        }

        col.update_one(update_key, {'$set': result}, upsert=True)  # update() is deprecated in PyMongo 3+
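
Once a run finishes, the results land in the research.dianping collection. A minimal sketch for pulling back what was written today (assuming the same local MongoDB as above):

    from datetime import datetime
    from pymongo import MongoClient

    col = MongoClient(host='127.0.0.1', port=27017)['research']['dianping']
    today = datetime.strptime(datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')
    for shop in col.find({'date': today}).limit(5):
        print(shop['name'], shop.get('star'), shop.get('review_count'))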
Reposted from blog.csdn.net/qq_36653505/article/details/82759503