Python Scrapy 爬取易车品牌 车系 车款(在售 停售)及车辆参数

基于 Scrapy 爬取易车的品牌、车系、车款(在售/停售)及车辆参数

源码地址:https://download.csdn.net/download/wpaycn/11548531

# -*- coding: utf-8 -*-
import scrapy
import re
import json
import logging
from copy import deepcopy
from yicar.items import YicarItem

logger = logging.getLogger(__name__)


# Substitution callback that turns a bare JSON key into a quoted one.
def replacea(matched):
    """Return the text of the ``value`` group wrapped as ``"<key>":``.

    Intended for use as the replacement function of ``re.sub`` with a pattern
    that defines a named group ``value`` followed by a colon.
    """
    key = matched.group('value')
    return ''.join(('"', key, '":'))


class YcSpider(scrapy.Spider):
    """Spider for brands, series, trims and trim parameters on bitauto.com.

    Callback chain::

        parse                 brand tree (JavaScript-style payload)
        parse_serial          series listed on one brand page
        parse_vehicle         on-sale trims and discontinued years of a series
        parse_old_vehicle     discontinued trims of one year (JSON)
        parse_vehicle_params  parameter tables of one trim

    A plain dict travels between callbacks in ``meta["item"]``. The callbacks
    mutate that dict inside loops, so every scheduled request receives a deep
    copy; otherwise later loop iterations would overwrite the data of
    requests that are already queued.
    """

    name = 'yc'
    allowed_domains = ['bitauto.com']
    start_urls = [
        'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=0']

    # Prefix prepended to hrefs scraped from the pages (the hrefs are assumed
    # to be site-relative, i.e. to start with "/").
    base_url = 'http://car.bitauto.com'

    def parse(self, response):
        """Parse the brand tree and request the series page of every brand.

        The payload uses unquoted object keys, so it is first rewritten into
        valid JSON: the outermost ``{...}`` is cut out and every ``word:`` is
        turned into ``"word":`` via ``replacea``.

        Every brand of every initial is followed. To cap a trial crawl, use
        Scrapy's ``CLOSESPIDER_PAGECOUNT`` setting instead of returning early.

        Yields:
            scrapy.Request: one request per brand, handled by ``parse_serial``.
        """
        text = response.text
        # "https:" is removed before quoting keys because the key regex would
        # otherwise match it (presumably inside URL values) and corrupt it.
        payload = text[text.find('{'):text.rfind('}') + 1].replace('https:', '')
        data = json.loads(re.sub(r'(?P<value>\w+):', replacea, payload))

        brands_by_initial = data['brand']
        for char in data['char']:
            # An initial without any brand is simply skipped.
            for brand in brands_by_initial.get('%s' % char, ()):
                try:
                    brand_id = brand['id']
                    brand_name = brand['name']
                except KeyError:
                    logger.warning("skipping malformed brand entry: %r", brand)
                    continue

                url = '{}/tree_chexing/mb_{}/'.format(self.base_url, brand_id)
                # A fresh dict is built per brand, so no copy is needed here.
                item = {
                    "params": {},
                    "id": brand_id,
                    "name": brand_name,
                    "initial": char,
                    "url": url,
                }
                yield scrapy.Request(
                    url,
                    callback=self.parse_serial,
                    meta={"item": item},
                )

    def parse_serial(self, response):
        """Parse one brand page and request every listed series.

        The container ``div#divCsLevel_0`` holds ``h5`` headings (sub-brand
        names); the ``div`` child at the same position as a heading holds the
        series of that sub-brand. Series whose price text is ``未上市`` are
        skipped.

        Yields:
            scrapy.Request: one request per series, handled by
            ``parse_vehicle``.
        """
        item = response.meta["item"]
        brands_div = response.xpath("//div[@id='divCsLevel_0']")

        # XPath positions are 1-based.
        for position, heading in enumerate(brands_div.xpath('./h5'), start=1):
            item["brand_item"] = heading.xpath('./a/text()').extract_first()
            columns = brands_div.xpath(
                "./div[%d]" % position).xpath("./div[@class='col-xs-3']")

            for column in columns:
                brand_name = column.xpath(
                    "./div/ul/li[contains(@class,'name')]/a/text()").extract_first()
                brand_price = column.xpath(
                    "./div/ul/li[@class='price']/a/text()").extract_first()
                href = column.xpath(
                    "./div/ul/li[contains(@class,'name')]/a/@href").extract_first()

                item["model"] = str(brand_name)
                item["price"] = str(brand_price)

                if item["price"] == "未上市":
                    continue
                if href is None:
                    # Without a link there is no series page to follow.
                    logger.warning("series %s has no link, skipped", item["model"])
                    continue

                logger.warning("##### %s #######", item["model"])
                yield scrapy.Request(
                    self.base_url + href,
                    callback=self.parse_vehicle,
                    meta={"item": deepcopy(item)},
                )

    def parse_vehicle(self, response):
        """Parse one series page: on-sale trims and discontinued years.

        On-sale trims are read from the first cell of every table row whose
        id contains ``car_filter_id``. Discontinued trims are fetched per
        year through an AJAX endpoint that needs the value of the hidden
        ``csHid`` input (presumably the series id).

        Yields:
            scrapy.Request: requests for ``parse_vehicle_params`` (on-sale
            trims) and for ``parse_old_vehicle`` (one per discontinued year).
        """
        item = response.meta["item"]

        # On-sale trims.
        for cell in response.xpath("//tr[contains(@id,'car_filter_id')]/td[1]"):
            vehicle_name = cell.xpath("./a/text()").extract_first()
            href = cell.xpath("./a/@href").extract_first()
            if href is None:
                continue
            vehicle_url = self.base_url + href
            # Lazy %-formatting also tolerates a missing trim name.
            logger.warning("%s  ---  %s", vehicle_name, vehicle_url)
            item["vehicle"] = vehicle_name
            item["issell"] = "在售"
            yield scrapy.Request(
                vehicle_url,
                callback=self.parse_vehicle_params,
                meta={"item": deepcopy(item)},
            )

        # Discontinued trims.
        cs_id = response.xpath("//input[@id='csHid']/@value").extract_first()
        if cs_id is None:
            # The AJAX URL cannot be built without this value.
            return
        for year_label in response.xpath("//div[@class='drop-layer']/a/text()").extract():
            # Only the first four characters of the label are used as the year.
            year_url = "{}/AjaxNew/GetNoSaleSerailListByYear.ashx?csID={}&year={}".format(
                self.base_url, cs_id, year_label[0:4])
            yield scrapy.Request(
                year_url,
                callback=self.parse_old_vehicle,
                meta={"item": deepcopy(item)},
            )

    def parse_old_vehicle(self, response):
        """Parse the JSON list of discontinued trims of one year.

        The body is iterated as a sequence of objects, each with a
        ``carList`` array whose entries provide ``Name``, ``CarID``,
        ``YearType`` and ``Spell``.

        Yields:
            scrapy.Request: one request per trim, handled by
            ``parse_vehicle_params``.
        """
        item_model = response.meta["item"]
        try:
            data = json.loads(response.body.decode())
        except ValueError:
            # Covers both undecodable bytes and invalid JSON.
            logger.warning("unparsable discontinued-trim response: %s", response.url)
            return

        for year_group in data:
            for car in year_group.get("carList") or ():
                brand_url = "{}/{}/m{}".format(self.base_url, car["Spell"], car["CarID"])
                item_model["vehicle"] = str(car["YearType"]) + " " + str(car["Name"])
                item_model["issell"] = "停产"
                logger.warning("%s  *** %s", item_model["vehicle"], brand_url)

                yield scrapy.Request(
                    brand_url,
                    callback=self.parse_vehicle_params,
                    meta={"item": deepcopy(item_model)},
                )

    def parse_vehicle_params(self, response):
        """Parse the parameter tables of one trim and log the full record.

        Each ``div.special-layout-18.layout-1`` block is one parameter
        category; its title comes from the ``div.caption-1`` at the same
        position. Inside a block the table cells alternate between a key cell
        and a value cell. The result is stored in ``item["params"]`` as a
        list of ``{"param_cate": ..., "param_cate_value": [{"key", "value"}]}``.
        """
        item_model = response.meta["item"]
        params = []
        item_model["params"] = params

        captions = response.xpath("//div[@class='caption-1']")
        layouts = response.xpath("//div[@class='special-layout-18 layout-1']")

        for position, layout in enumerate(layouts):
            # A block without a matching caption keeps a "None" title instead
            # of aborting the whole page.
            if position < len(captions):
                big_cate = captions[position].xpath("./h6/text()").extract_first()
            else:
                big_cate = None

            # All cells of the block in document order: key, value, key, ...
            # A trailing key cell without a value cell is dropped by zip().
            cells = layout.xpath("./table/tbody/tr/td")
            entries = []
            for key_cell, value_cell in zip(cells[0::2], cells[1::2]):
                key = key_cell.xpath("./span/text()").extract_first()
                entries.append({
                    "key": key,
                    "value": self._extract_param_value(key, value_cell),
                })

            params.append({
                "param_cate": str(big_cate),
                "param_cate_value": entries,
            })

        # NOTE: the assembled record is only logged; nothing is yielded, so no
        # item reaches the item pipelines from this callback.
        logger.warning(json.dumps(item_model, ensure_ascii=False))

    @staticmethod
    def _extract_param_value(key, cell):
        """Return the value held by one value cell of a parameter table.

        Args:
            key: Text of the preceding key cell (may be ``None``).
            cell: Selector of the value ``td``.

        Returns:
            A list of colour titles for the exterior-colour row, a list of
            ``{"isSelect", "value"}`` dicts when the cell contains a ``div``
            (option lists), otherwise the text of the cell's ``span``.
        """
        # Exterior colours: the titles of the colour links.
        if key == "外观颜色:":
            return cell.xpath("./div/ul/li/a/@title").extract()

        # Option lists: each entry has a marker (<i>) and a text node.
        if cell.xpath("./div"):
            return [
                {
                    "isSelect": option.xpath("./i/text()").extract_first(),
                    "value": option.xpath("./text()").extract_first(),
                }
                for option in cell.xpath("./div/div/div[@class='l']")
            ]

        # Plain text value.
        return cell.xpath("./span/text()").extract_first()
发布了10 篇原创文章 · 获赞 2 · 访问量 1008

猜你喜欢

转载自blog.csdn.net/wpaycn/article/details/99597459