Scraper: Sina Finance credit-card merchant discount data (2018-11-15)

Copyright notice: this is the author's original post; do not reproduce without permission. https://blog.csdn.net/qq_29622761/article/details/84106968

Target URL

http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php

Techniques used

  1. requests to fetch pages
  2. re regular expressions to extract the merchant detail-page links
  3. lxml.etree.HTML to parse an HTML string into a queryable element tree (see the sketch after this list)
  4. xlrd, xlwt, and xlutils.copy to save and append to an Excel (.xls) file
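
For readers new to lxml, here is a minimal sketch of turning an HTML string into a queryable tree; the snippet and its URL are illustrative, not taken from the target site:

from lxml import etree

html = '<html><body><a href="http://example.com/shop/1.html">shop</a></body></html>'
tree = etree.HTML(html)         # parse the string into an element tree
print(tree.xpath('//a/@href'))  # XPath query -> ['http://example.com/shop/1.html']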

Scraper approach

  1. First collect the full lists of banks and cities from the search form.
  2. Combine the base URL http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php with a bank and a city to form a search URL such as http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php?num=8&bank=37&city=%B1%B1%BE%A9, then append the page number for each results page, e.g. http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php?num=8&bank=37&city=%B1%B1%BE%A9&page=1, and finally extract each merchant's detail-page link from the results and parse those pages (see the URL sketch after this list).
  3. Save the results to Excel.
  4. Done.
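
Step 2 hinges on one detail: the city parameter is the gb2312 URL-encoding of the city name, which is where the %B1%B1%BE%A9 (Beijing) in the sample URLs comes from. A minimal sketch of that URL construction, reusing bank code 37 from the sample above:

import urllib.parse

base = 'http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php'
city = urllib.parse.quote('北京'.encode('gb2312'))  # -> '%B1%B1%BE%A9'
print('{}?num=8&bank=37&city={}&page=1'.format(base, city))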

Highlights

  1. Lets you pick which bank and city to download.
  2. Lets you pick which page to start crawling from.

Scraper code

# -*- coding: utf-8 -*-
import io
import os
import re
import sys
import time
import urllib.parse

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy

# Force UTF-8 console output so Chinese text prints correctly on Windows
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')



def get_all_detail_url(html, pattern):
    """Extract the merchant detail-page URLs from a results page, de-duplicated."""
    href_url = re.findall(pattern, html, re.I)
    url_list = []
    for url in href_url:
        if url not in url_list:  # keep only the first occurrence of each link
            url_list.append(url)
    return url_list



def get_page(url):
    """Fetch a page and return it as an lxml element tree, or None on failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # The site serves gb2312; drop the stray ©/non-breaking-space characters
            html = response.content.decode('gb2312', 'ignore').replace(u'\xa9', u'').replace(u'\xa0', u'')
            return etree.HTML(html)
    except requests.ConnectionError:
        return None

def parse_detail_page(detail_html):
    """Parse one merchant detail page into a list of field values."""
    tb_result = {}
    for row in detail_html.xpath('//tr'):
        # Join all the text in the row into one 'key:value' string
        content = ''.join(text.strip().replace('\r\n', '') for text in row.xpath('.//text()'))
        key, _, value = content.partition(':')  # split on the first colon only
        tb_result[key] = value
    # The dictionary keys must match the Chinese field labels used on the site
    card_type = tb_result['支持卡类']
    shop_address = tb_result['商户地址']
    telephone_num = tb_result['服务电话']
    city_name = tb_result['所属地区']
    activity_deadtime = tb_result['优惠截止']
    shop_name = ''.join(detail_html.xpath('//span[@class="title font14"]/text()'))
    shop_type = ''.join(detail_html.xpath('//div[@class="blk_01_content"]/a[3]/text()'))
    bank_name = ''.join(detail_html.xpath('//div[@class="blk_01_content"]/a[1]/text()'))
    role = ''.join(detail_html.xpath('//div[@class="blk_02 clearfix"]/p[2]/text()'))
    return [shop_name, shop_type, city_name, bank_name, card_type,
            shop_address, telephone_num, activity_deadtime, role]




def save(file_name, data):
    """Append rows to an existing .xls file, or create it with a header row."""
    if os.path.exists(file_name):
        # .xls cannot be appended in place: open the workbook with xlrd,
        # clone it with xlutils.copy, write the new rows, then save it back
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        rn = rb.sheets()[0].nrows  # number of rows already written
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        write_data(sheet, rn, data)
        os.remove(file_name)
        wb.save(file_name)
    else:
        header = ['shop_name', 'shop_type', 'city_name', 'bank_name', 'card_type',
                  'shop_address', 'telephone_num', 'activity_deadtime', 'role', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('新浪财经-信用卡数据')
        # Write the header row, then the data starting from row 1
        for h in range(len(header)):
            sheet.write(0, h, header[h])
        write_data(sheet, 1, data)
        book.save(file_name)





def write_data(sheet, row, lst):
    """Write each row list into the sheet, starting at the given row index."""
    for data_infos in lst:
        for j, data in enumerate(data_infos):
            sheet.write(row, j, data)
        row += 1


def main():
    print('*' * 80)
    print('\t\t\t\tSina Finance credit-card merchant discount downloader')
    print('Author: 谢华东  2018.11.12')
    print('--------------')
    base_url = 'http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php'
    html = get_page(base_url)
    # The two <select> drop-downs on the search form list every bank and city
    bank_list = html.xpath('/html/body/div/div[3]/form/div/select[1]/option')
    city_list = html.xpath('/html/body/div/div[3]/form/div/select[2]/option')
    bank_dic = {}
    city_table_list = []
    for bank in bank_list:
        bank_num = int(bank.get('value'))
        bank_name = bank.text
        if bank_name == '选择银行':  # the placeholder option stands for "all banks"
            bank_name = '全部银行'
        bank_dic[bank_num] = bank_name
    for city in city_list:
        city_name = city.text
        if city_name != '选择城市':  # skip the placeholder option
            city_table_list.append(city_name)
    bank_title = int(input('Enter a bank code ({}): \n'.format(bank_dic)))
    while bank_title not in bank_dic:
        bank_title = int(input('Invalid bank code, please try again: \n'))

    city_title = input('Enter a city name ({}): \n'.format(city_table_list))
    while city_title not in city_table_list:
        city_title = input('Invalid city name, please try again: \n')

    path = input('Enter a directory to save to (e.g. C:\\Users\\xhdong1\\Desktop\\); leave empty for the current directory:\n')

    file_name = path + bank_dic[bank_title] + '_' + city_title + '.xls'
    print(file_name)


    # Build the search URL; this site expects city names gb2312-encoded.
    # Keep '?num=8' even in the all-banks/all-cities case so that '&page='
    # can be appended uniformly below.
    if bank_dic[bank_title] == '全部银行':
        if city_title == '全国':
            base_url = 'http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php?num=8'
        else:
            city_encode_name = urllib.parse.quote(city_title.encode('gb2312'))
            base_url = 'http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php?num=8&city={c}'.format(c=city_encode_name)
    else:
        if city_title == '全国':
            base_url = 'http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php?num=8&bank={b}'.format(b=bank_title)
        else:
            city_encode_name = urllib.parse.quote(city_title.encode('gb2312'))
            base_url = 'http://money.finance.sina.com.cn/creditcard/view/vMerchantsearch.php?num=8&bank={b}&city={c}'.format(b=bank_title, c=city_encode_name)


    # Read the pager on the first results page to find the total page count
    html = get_page(base_url)
    total_page = 0
    for page in html.xpath('//*[@id="divPages"]/span[last()]//text()'):
        # The pager text looks like '1/12页'; take the number after the slash
        total_page = int(str(page).split('/')[1].replace('页', ''))
    if total_page == 0:
        print('No offers found for {bank} in {city}'.format(city=city_title, bank=bank_dic[bank_title]))
        sys.exit(0)

    start_page = int(input('Enter the page number to start crawling from:\n'))


    for i in range(start_page, total_page + 1):  # pages are 1-indexed, so include the last page
        print('Crawling page {i}/{total}'.format(i=i, total=total_page))
        time.sleep(1)  # be polite to the server
        all_info_list = []
        url = base_url + '&page=' + str(i)
        print(url)
        response = requests.get(url)
        # Merchant detail pages look like http://finance.sina.com.cn/creditcard/shops/x/y/z.html
        pattern = r'<.*?(http://finance\.sina\.com\.cn/creditcard/shops/\d+/\d+/\d+\.html).*?'
        all_url_list = get_all_detail_url(response.content.decode('gb2312', 'ignore'), pattern)
        for detail_url in all_url_list:
            detail_html = get_page(detail_url)
            info_list = parse_detail_page(detail_html)
            info_list.append(detail_url)
            all_info_list.append(info_list)
            print(info_list)
        save(file_name, all_info_list)  # write after every page so progress is not lost




if __name__ == '__main__':
    main()
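
As written, requests.get runs with no timeout, headers, or retries, so one stalled or bot-blocked request can hang or abort the whole crawl. Below is a more defensive variant of get_page; it is a sketch of my own, not part of the original script, and the User-Agent string is only illustrative:

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def get_page_safe(url, retries=3):
    # Same contract as get_page, but with a timeout and simple retries
    for _ in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            if response.status_code == 200:
                html = response.content.decode('gb2312', 'ignore')
                return etree.HTML(html)
        except requests.RequestException:
            time.sleep(2)  # back off briefly before retrying
    return None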



Special thanks

Thanks to myself.
