爬虫实战项目--优信二手车--天眼查

import requests, time, random
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser

from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning
from multiprocessing import Pool


# Silence urllib3's HTTPS warnings (the requests in this script pass verify=False).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
# Shared fake_useragent generator; `ua.random` is read when building each request's headers.
ua = UserAgent()

# Counter read by get_detail_page() (a session refresh is triggered when
# number % 9 == 0) and incremented by parse_detail_page().
number = 1
# Initial value for the `session_xin` cookie; get_detail_page() replaces it
# with the result of get_session_xin().
new_session_xin = 'k8935l0tr72p6dfngdfnuiukoo4n6jfn'
# Initial value for the `XIN_anti_uid` cookie; get_list_page() overwrites it
# when a response sets a new one.
anti_uid = '8F932282-2E08-FA10-DDDC-841EEF3E0BF3'


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        dict: a ``proxies`` mapping suitable for ``requests``. It maps both the
        ``http`` and ``https`` schemes to the fetched proxy, or is empty when
        the service returned no address (requests then connects directly).
    """
    # A timeout keeps a stalled local service from blocking the worker forever.
    address = requests.get('http://localhost:5010/get/', timeout=10).text.strip()
    if not address:
        return {}

    proxy_url = 'http://' + address
    # requests picks the proxy by the scheme of the *target* URL. Every page
    # fetched in this script is https://, so an 'http'-only mapping would
    # never be applied.
    return {'http': proxy_url, 'https': proxy_url}


def get_session_xin():
    """Request a fresh ``session_xin`` cookie value from xin.com.

    Calls the ``get_wishlist_token`` endpoint and reads ``session_xin`` from
    the cookies set by the response.

    Returns:
        str: the new session id, or the current module-level
        ``new_session_xin`` when the response did not set one.
    """
    headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': 'XIN_bhv_oc=1233; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9'.format(anti_uid)
    }

    response = requests.get('https://www.xin.com/search/get_wishlist_token', headers=headers, proxies=get_proxy(), verify=False)
    # Take session_xin from the cookies set by the response (Set-Cookie header).
    session_xin = response.cookies.get('session_xin')
    if not session_xin:
        # The old fallback returned the placeholder text itself. The caller
        # splices the return value into a Cookie header, and that text cannot
        # be encoded as an HTTP header value, so the next request would fail.
        # Log the placeholder and keep the session currently in use instead.
        print('没有')
        return new_session_xin

    print(session_xin)
    return session_xin


def get_list_page(page_num):
    """Download one listing page and return its HTML text.

    Args:
        page_num: page index substituted into the ``/zhengzhou/baoma/i{}`` URL.

    Returns:
        str: body text of the response.

    Side effect: when the response sets an ``XIN_anti_uid`` cookie, the
    module-level ``anti_uid`` is replaced with that value.
    """
    global anti_uid

    # Whole-second part of the current timestamp, used for the Hm_lpvt_* cookie.
    now_seconds = str(time.time()).partition('.')[0]
    cookie = 'RELEASE_KEY=; XIN_bhv_oc=1233; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9; Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447; Hm_lpvt_ae57612a280420ca44598b857c8a9712={}; session_xin={}; SEO_REF=https://www.xin.com/zhengzhou/baoma/'.format(anti_uid, now_seconds, new_session_xin)
    request_headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        'Cookie': cookie,
    }

    page_url = 'https://www.xin.com/zhengzhou/baoma/i{}'.format(page_num)
    response = requests.get(page_url, headers=request_headers, verify=False, proxies=get_proxy())

    # Adopt the anti-crawler uid handed back by the server, if there is one.
    fresh_uid = response.cookies.get('XIN_anti_uid', '')
    if not fresh_uid:
        print('uid 不存在')
    else:
        print('uid = ', fresh_uid)
        anti_uid = fresh_uid

    return response.text


def parse_list_page(list_page):
    """Extract detail links from a listing page and fetch them via a pool.

    Args:
        list_page: HTML text of one listing page.

    Each element matched by ``h2 .tit`` contributes one detail URL, which is
    submitted to ``get_detail_page``; results go to ``parse_detail_page``.
    The function returns after all submitted tasks have finished.
    """
    workers = Pool(4)

    document = etree.HTML(list_page, parser=HTMLParser(encoding='utf-8'))
    for anchor in document.cssselect('h2 .tit'):
        # The href values get 'https:' prepended to form the request URL.
        full_url = 'https:' + anchor.attrib['href']
        workers.apply_async(get_detail_page, args=(full_url,), callback=parse_detail_page)

    # No more submissions; wait for every detail task to complete.
    workers.close()
    workers.join()


def get_detail_page(detail_url):
    """Download one car detail page.

    Args:
        detail_url: absolute URL of the detail page.

    Returns:
        tuple: ``(html_text, detail_url)``, the shape ``parse_detail_page``
        consumes.

    Side effect: when the module-level ``number`` is a multiple of 9, it is
    incremented and ``new_session_xin`` is replaced via ``get_session_xin()``
    before the page is requested.
    """
    global number, new_session_xin

    # Before requesting a detail page, check `number`: after a few detail
    # pages the session_xin value is swapped for a fresh one.
    # The earlier version re-entered this function here and then returned
    # None, dropping the fetched page and handing None to the callback.
    # Refreshing first and falling through yields the same request and keeps
    # its result.
    if number % 9 == 0:
        number += 1
        new_session_xin = get_session_xin()

    # Candidate values for the XIN_bhv_oc cookie, offset by the counter.
    number_list = [base + number for base in (
        1525, 1319, 1262, 1436, 1561, 1452, 1618, 1624,
        1632, 1631, 1646, 1742, 1814, 1891, 1847, 2286,
    )]
    tm = str(time.time()).split('.')[0]

    headers = {
        'User-Agent': ua.random,
        'Host': 'www.xin.com',
        'Referer': 'https://www.xin.com/zhengzhou/baoma/',
        # Placeholder order: XIN_bhv_oc, XIN_anti_uid, Hm_lpvt_*, session_xin,
        # XIN_bhv_pc. The numeric pick goes to XIN_bhv_oc and the uid to
        # XIN_anti_uid (these two arguments used to be swapped).
        'Cookie': 'RELEASE_KEY=; XIN_bhv_oc={}; XIN_anti_uid={}; XIN_LOCATION_CITY=%7B%22cityid%22%3A%221001%22%2C%22areaid%22%3A%224%22%2C%22big_areaid%22%3A%222%22%2C%22provinceid%22%3A%2210%22%2C%22cityname%22%3A%22%5Cu90d1%5Cu5dde%22%2C%22ename%22%3A%22zhengzhou%22%2C%22shortname%22%3A%22ZN%22%2C%22service%22%3A%221%22%2C%22near%22%3A%22201%2C501%2C2101%2C2117%2C1010%2C1002%2C601%2C2401%2C901%2C1201%22%2C%22tianrun_code%22%3A%220371%22%2C%22zhigou%22%3A%221%22%2C%22longitude%22%3A%22113.6253680%22%2C%22latitude%22%3A%2234.7465990%22%2C%22direct_rent_support%22%3A%221%22%2C%22salvaged_support%22%3A%221%22%2C%22isshow_c%22%3A%221%22%7D; uid=rBAKEls5vG1giwDiR4LWAg==; NSC_20.eqppmxfc.yjo.dpn=ffffffffaf18140345525d5f4f58455e445a4a423660; XIN_UID_CK=5e21beea-146c-a405-2a32-2df07fc0eac9; Hm_lvt_ae57612a280420ca44598b857c8a9712=1530510447; Hm_lpvt_ae57612a280420ca44598b857c8a9712={}; session_xin={}; SEO_REF=https://www.xin.com/zhengzhou/baoma/; XIN_CARBROWSE_IDS=%5B67720293%5D; XIN_bhv_pc={}; XIN_bhv_expires=1530597119591'.format(random.choice(number_list), anti_uid, tm, new_session_xin, number)
    }
    response = requests.get(detail_url, headers=headers, verify=False, proxies=get_proxy())

    return response.text, detail_url


def parse_detail_page(detail_tuple):
    """Print URL, title and price for one fetched detail page.

    Args:
        detail_tuple: sequence whose item 0 is the page HTML and item 1 the
            page URL (the value returned by ``get_detail_page``).

    Side effect: increments the module-level ``number`` counter.
    """
    global number
    page_html, page_url = detail_tuple[0], detail_tuple[1]
    tree = etree.HTML(page_html, parser=HTMLParser(encoding='utf-8'))

    # Prefer the fourth text node of the title span; when there are fewer
    # than four, use the first one with surrounding whitespace removed.
    title_texts = tree.xpath('//span[@class="cd_m_h_tit"]//text()')
    if len(title_texts) > 3:
        title = title_texts[3]
    else:
        title = title_texts[0].strip()

    price_texts = tree.xpath('//span[@class="cd_m_info_jg"]/b/text()')
    price = price_texts[0].strip()

    print(page_url, title, price)

    number = number + 1


if __name__ == '__main__':
    # Fetch listing pages 1 through 50 with four worker processes; each page's
    # HTML is passed to parse_list_page as the task callback.
    pool = Pool(4)

    first_page, last_page = 1, 50
    for page_num in range(first_page, last_page + 1):
        print('开始获取第{}...'.format(page_num))
        pool.apply_async(get_list_page, args=(page_num,), callback=parse_list_page)

    # Stop accepting tasks and block until all of them are done.
    pool.close()
    pool.join()
 
 

天眼查

import requests,time
from lxml.html import etree
from fake_useragent import UserAgent
from urllib.parse import quote

from requests.packages.urllib3.exceptions import InsecureRequestWarning, InsecurePlatformWarning
from multiprocessing import Pool
# from fontTools.ttLib import TTFont

# Silence urllib3's HTTPS warnings (the list-page request passes verify=False).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)


# Shared fake_useragent generator; `ua.random` is read when building each request's headers.
ua = UserAgent()
# First character-substitution table applied by parse_detail_page() to the
# text of the `.tyc-num` element (page character -> decoded character).
number_dict1 = {
    '0': '9',
    '1': '2',
    '2': '1',
    '3': '4',
    '4': '7',
    '5': '8',
    '6': '3',
    '7': '5',
    '8': '6',
    '9': '0',
    '-': '-'
}

# Sample pairs (text on the page: decoded value).
# The first two agree with number_dict1, the last two with number_dict2.
# 8935: 6048
# 8936: 6043
# 8936-94-90: 2017-08-09
# 8936-95-84: 2017-06-28

# Second substitution table; parse_detail_page() falls back to it when the
# result of number_dict1 does not start with '2'.
number_dict2 = {
    '0': '9',
    '1': '4',
    '2': '5',
    '3': '1',
    '4': '8',
    '5': '6',
    '6': '7',
    '7': '3',
    '8': '2',
    '9': '0',
    '-': '-'
}

# Search keyword; URL-quoted into the `key` query parameter by get_list_page().
KEY_WORD = '智游'

# response = requests.get('https://static.tianyancha.com/fonts-styles/fonts/49/49631975/tyc-num.woff').text


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service.

    Returns:
        dict: a ``proxies`` mapping suitable for ``requests``. It maps both the
        ``http`` and ``https`` schemes to the fetched proxy, or is empty when
        the service returned no address (requests then connects directly).
    """
    # A timeout keeps a stalled local service from blocking the worker forever.
    address = requests.get('http://localhost:5010/get/', timeout=10).text.strip()
    if not address:
        return {}

    proxy_url = 'http://' + address
    # requests picks the proxy by the scheme of the *target* URL. Every page
    # fetched in this script is https://, so an 'http'-only mapping would
    # never be applied.
    return {'http': proxy_url, 'https': proxy_url}


def get_list_page(page_num):
    """Download one search-result page for ``KEY_WORD`` and return its HTML.

    Args:
        page_num: page index substituted into the ``/search/p{}`` URL.

    Returns:
        str: body text of the response.
    """
    # Whole-second part of the current timestamp, used for the Hm_lpvt_* cookie.
    now_seconds = str(time.time()).partition('.')[0]
    cookie = 'TYCID=2b902090793a11e8bbf42fcb3431841d; undefined=2b902090793a11e8bbf42fcb3431841d; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1530015137,1530061830,1530104465,1530519246; ssuid=4009891320; aliyungf_tc=AQAAAP+boVulnQoAg6cPqxTilju98D0f; csrfToken=yg6QXuv2Dch1Abfr-giP-AH4; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758={}; RTYCID=24412db0b3da41c5be4439ba8f942ce8; bannerFlag=true; token=1675836c554a48fe9bcc18cfc45cb4d0; _utm=788b0bb711164fda9a5e6b1964bb5bf9; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252213037677318%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA'.format(now_seconds)
    request_headers = {
        'User-Agent': ua.random,
        'Host': 'www.tianyancha.com',
        'Cookie': cookie,
    }

    # The keyword is percent-encoded before it goes into the query string.
    search_url = 'https://www.tianyancha.com/search/p{}?key={}'.format(page_num, quote(KEY_WORD))
    return requests.get(search_url, headers=request_headers, verify=False, proxies=get_proxy()).text


def parse_list_page(list_page):
    """Parse one search-result page and fetch each company's detail page.

    Args:
        list_page: HTML text of a search-result page.

    For each ``.search_row_new`` row (paired by position with the
    ``.query_name`` links) the legal-person name, registered capital and
    registration date are read and passed, together with the detail URL, to
    ``get_detail_page``; results go to ``parse_detail_page``. Rows missing
    any of these fields are skipped. Returns after all tasks have finished.
    """
    list_html = etree.HTML(list_page, parser=etree.HTMLParser(encoding='utf-8'))
    divs = list_html.cssselect('.search_row_new')
    all_a = list_html.cssselect('.query_name')

    detail_pool = Pool(4)
    try:
        # zip() stops at the shorter list, so a row without a matching link
        # can no longer raise IndexError.
        for div, link in zip(divs, all_a):
            try:
                detail_url = link.attrib['href']
                person = div.cssselect('.legalPersonName')[0].text
                zhuceziben = div.xpath('.//span[contains(@title, "人民币")]/text()')[0]
                zhuceshijian = div.xpath('.//span[contains(@title, "-")]/text()')[0]
            except (IndexError, KeyError):
                # Incomplete row: skip it, as is already done for a missing
                # name or capital, instead of aborting the whole page.
                continue

            detail_pool.apply_async(get_detail_page, args=(detail_url, person, zhuceziben, zhuceshijian), callback=parse_detail_page)
    finally:
        # Always release the workers, even if submitting a task failed.
        detail_pool.close()
        detail_pool.join()


def get_detail_page(detail_url, person, zhuceziben, zhuceshijian):
    """Download a company detail page and pass the list-page fields through.

    Args:
        detail_url: URL of the company detail page.
        person: legal-person name taken from the list page.
        zhuceziben: registered-capital text taken from the list page.
        zhuceshijian: registration-date text taken from the list page.

    Returns:
        tuple: ``(html_text, detail_url, person, zhuceziben, zhuceshijian)``.
    """
    # Whole-second part of the current timestamp, used for the Hm_lpvt_* cookie.
    now_seconds = str(time.time()).partition('.')[0]
    cookie = 'TYCID=2b902090793a11e8bbf42fcb3431841d; undefined=2b902090793a11e8bbf42fcb3431841d; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1530015137,1530061830,1530104465,1530519246; ssuid=4009891320; aliyungf_tc=AQAAAP+boVulnQoAg6cPqxTilju98D0f; csrfToken=yg6QXuv2Dch1Abfr-giP-AH4; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758={}; RTYCID=24412db0b3da41c5be4439ba8f942ce8; bannerFlag=true; token=1675836c554a48fe9bcc18cfc45cb4d0; _utm=788b0bb711164fda9a5e6b1964bb5bf9; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252213037677318%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzAzNzY3NzMxOCIsImlhdCI6MTUzMDUzMTAyMSwiZXhwIjoxNTQ2MDgzMDIxfQ.VhZp2799GMlRKWPnleSODWuG2-fC7Prn9LdC0CYIxotinpsOwXgvJxpAfuxJGCmLUEK-90jJvOUOirPeeonrGA'.format(now_seconds)
    request_headers = {
        'User-Agent': ua.random,
        'Host': 'www.tianyancha.com',
        'Cookie': cookie,
    }

    # NOTE(review): no `verify` argument is passed here, so requests' default
    # certificate verification applies to this call.
    page_text = requests.get(detail_url, headers=request_headers, proxies=get_proxy()).text
    return page_text, detail_url, person, zhuceziben, zhuceshijian


def _decode_tyc_num(obfuscated, mapping):
    """Substitute each character of *obfuscated* via *mapping*; unmapped characters are kept."""
    return ''.join(mapping.get(char, char) for char in obfuscated)


def parse_detail_page(detail_tuple):
    """Decode the obfuscated date on a detail page and print the record.

    Args:
        detail_tuple: the value returned by ``get_detail_page``:
            ``(html, detail_url, person, zhuceziben, zhuceshijian)``.

    Nothing is printed when the page has no ``.base0910 .tyc-num`` element or
    that element has no text.
    """
    detail_html = detail_tuple[0]
    detail_url, person, zhuceziben, zhuceshijian = detail_tuple[1], detail_tuple[2], detail_tuple[3], detail_tuple[4]
    detail_obj = etree.HTML(detail_html, parser=etree.HTMLParser(encoding='utf-8'))

    try:
        date_str = detail_obj.cssselect('.base0910 .tyc-num')[0].text
    except (AttributeError, IndexError):
        # No parsed document, or no obfuscated-date element on this page.
        return

    # `.text` is None for an element without text. Stripping also keeps stray
    # whitespace from defeating the leading-'2' check below.
    date_str = (date_str or '').strip()
    if not date_str:
        return

    res_str = _decode_tyc_num(date_str, number_dict1)

    # If the first rule set (number_dict1) fails to produce a value starting
    # with '2', convert with the second rule set instead.
    if res_str[0] != '2':
        res_str = _decode_tyc_num(date_str, number_dict2)

    print(detail_url, person, zhuceziben, zhuceshijian, res_str)


if __name__ == '__main__':
    # Fetch search-result pages 1 through 5 with a single worker process;
    # each page's HTML is passed to parse_list_page as the task callback.
    pool = Pool(1)
    first_page, last_page = 1, 5
    for page_num in range(first_page, last_page + 1):
        pool.apply_async(get_list_page, args=(page_num,), callback=parse_list_page)
    # Stop accepting tasks and block until all of them are done.
    pool.close()
    pool.join()

猜你喜欢

转载自blog.csdn.net/weixin_42312791/article/details/80890676
今日推荐