Preface
This article is based on the approach and code from another blog post. Credit to that author for their attention to detail: they found where the page hides its data, which makes this crawler much simpler to implement.
However, when I ran that code I found the Renrendai site had changed slightly, so I made a few corresponding modifications (mainly adding the Cookie request header, without which some fields come back empty, plus updating a few data fields).
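If you want to verify the Cookie requirement yourself before running the full crawler, here is a minimal sketch: it hits the same list endpoint used below, once without and once with a Cookie header, so you can compare the two responses. The COOKIE placeholder is hypothetical; paste in a value copied from your own logged-in browser session (DevTools -> Network -> request headers).

import requests

URL = 'https://www.renrendai.com/loan/list/loanList?startNum=0&limit=10'
UA = {'User-Agent': 'Mozilla/5.0'}
COOKIE = 'paste-your-own-cookie-here'  # hypothetical placeholder

bare = requests.get(URL, headers=UA)
logged_in = requests.get(URL, headers=dict(UA, Cookie=COOKIE))
# If the site still behaves as described above, the bare response is
# missing fields that the cookie-carrying response includes.
print(len(bare.text), len(logged_in.text))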
The full crawler code follows:
# coding=utf-8
from requests.exceptions import RequestException
import requests
import json
import csv
import re
import os


class Spider(object):
    def __init__(self):
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.renrendai.com',
            'Referer': 'https://www.renrendai.com/loan.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            # Replace with your own cookie (see the note at the end)
            'Cookie': 'rrdid=ccc75785-2c07-4dc4-b020-d846e1400e61; __jsluid_s=4c6ad9d4d4049fd05ad106c261fda012; Qs_lvt_181814=1584388964; Hm_lvt_a00f46563afb7c779eef47b5de48fcde=1584388968; gr_user_id=029f0129-ffae-41ed-8142-696b9cfc4616; grwng_uid=0008e890-1b28-4f43-9501-d22dc42f303a; _ga=GA1.2.1950582141.1584388974; _gid=GA1.2.1843462544.1584388974; renrendaiUsername=15580241130; utmSource=pc_pz_baidu; utm_medium=2075513; utm_campaign=716885827; utm_content=864625; utm_term=831733554_15058797666725; promotion_source=pc_pz_baidu; mediav=%7B%22eid%22%3A%22301358%22%2C%22ep%22%3A%22%22%2C%22vid%22%3A%22%22%2C%22ctn%22%3A%22%22%2C%22vvid%22%3A%22%22%7D; loginMethod=password; IS_MOBLIE_IDPASS=true-false; jforumUserInfo=eiSmTE3oI809bABL60b2VNQ6XE%2FegqCwFJN6FcAwPsE%3D%0A; _gat=1; activeTimestamp=17971412; we_token=LXY5Z0NXSzVmMHBIN1FVUmhFQW5pdTJZUS1SaDBxdFI6MTc5NzE0MTI6MjNjNWM5ZjljZWYwNzQyNWQ2ODA4MmQ0NzI1ZTBjMDRjNmY2N2E4ZQ%3D%3D; we_sid=s%3AkvTAOQE0ZgUL4tKzSTBlhqZYF-E-J2QG.mOqona1ez021fYXhK0kBadT9xkwlp1LtTI%2FdK3xJ2XU; JSESSIONID=2C91F95436A01AF4DA78482A3EA0292A; bf0acacc0a738790_gr_last_sent_sid_with_cs1=027d3626-985a-4c76-8c70-aa3e73a19965; bf0acacc0a738790_gr_last_sent_cs1=17971412; bf0acacc0a738790_gr_cs1=17971412; bf0acacc0a738790_gr_session_id=027d3626-985a-4c76-8c70-aa3e73a19965; bf0acacc0a738790_gr_session_id_027d3626-985a-4c76-8c70-aa3e73a19965=true; Qs_pv_181814=692552440239038100%2C3904391899668128000%2C3858476856754299000%2C3922951718213370000%2C2429641774634917000; Hm_lpvt_a00f46563afb7c779eef47b5de48fcde=1584393613'
        }
        self.count = 0  # number of records scraped successfully

    # Fetch the loan list ("散标", scattered loans)
    def get_sanbiao(self):
        # 1000 records in total: 10 requests, 100 records each
        for page in range(10):
            url = 'https://www.renrendai.com/loan/list/loanList?startNum={}&limit=100'.format(page)
            try:
                response = requests.get(url, headers=self.headers)
                if response.status_code == 200:
                    self.parse_sanbiao(response.text)
            except RequestException as e:
                print(e)
    # Parse the loan list and visit each loan's detail page
    def parse_sanbiao(self, data):
        data = json.loads(data)
        for item in data['data']['list']:
            url = 'https://www.renrendai.com/loan-{}.html'.format(item['loanId'])
            self.get_detailinfo(url)
    # Fetch a loan's detail page
    def get_detailinfo(self, url):
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                self.count += 1
                print('Scraped record {}'.format(self.count))
                self.parse_detailinfo(response.text)
            else:
                print('failure: {}'.format(url))
        except RequestException as e:
            print(e)
    # Parse the detail page
    def parse_detailinfo(self, data):
        # gbk cannot encode u'\xa9' (the © sign), so drop it outright;
        # \u0022 and \u005C are the JS-escaped forms of '"' and '\'.
        data = data.replace(u'\xa9', u'').replace('\\u0022', '"').replace('\\u005C', '\\')
        data = re.compile("var info = '({.*?})'", re.S).findall(data)
        data = json.loads(data[0])
        result = {}
        # Header section
        result['loanId'] = data['loan']['loanId']  # loan ID
        result['borrowType'] = data['loan']['borrowType']  # loan type
        result['amount'] = data['loan']['amount']  # total amount
        result['interest'] = data['loan']['interest']  # annual interest rate
        result['months'] = data['loan']['months']  # repayment term
        result['creditLevel'] = data['borrower']['creditLevel']  # risk level
        result['repayType'] = 'quarterly repayment' if int(data['loan']['repayType']) else 'monthly repayment'  # repayment schedule
        result['loanType'] = 'equal principal and interest' if data['loan']['loanType'] == 'DEBX' else 'interest first, principal last'  # repayment method
        result['repaySource'] = data['repaySource']  # repayment source
        # Borrower information
        result['realName'] = data['borrower']['realName']  # name
        result['gender'] = data['borrower']['gender']  # gender
        result['age'] = 2019 - int(data['borrower']['birthDay'][:4])  # age
        result['marriage'] = 'married' if data['borrower']['marriage'] else 'unmarried'  # marital status
        result['graduation'] = data['borrower']['graduation']  # education
        result['salary'] = data['borrower']['salary']  # income
        result['houseLoan'] = 'yes' if data['borrower']['houseLoan'] else 'no'  # mortgage
        result['carLoan'] = 'yes' if data['borrower']['carLoan'] else 'no'  # car loan
        result['officeDomain'] = data['borrower']['officeDomain']  # industry
        result['hasOthDebt'] = data['hasOthDebt']  # other debt
        # Credit information
        result['totalCount'] = data['userLoanRecord']['totalCount']  # loan applications
        result['successCount'] = data['userLoanRecord']['successCount']  # successful loans
        result['alreadyPayCount'] = data['userLoanRecord']['alreadyPayCount']  # loans paid off
        result['availableCredits'] = data['borrower']['availableCredits']  # credit limit
        result['borrowAmount'] = data['userLoanRecord']['borrowAmount']  # total borrowed
        result['notPayTotalAmount'] = data['userLoanRecord']['notPayPrincipal'] + data['userLoanRecord']['notPayInterest']  # outstanding principal + interest
        result['overdueTotalAmount'] = data['userLoanRecord']['overdueTotalAmount']  # overdue amount
        result['overdueCount'] = data['userLoanRecord']['overdueCount']  # overdue count
        result['failedCount'] = data['userLoanRecord']['failedCount']  # seriously overdue
        # dict preserves insertion order (Python 3.7+), matching the header row
        self.save_excel(list(result.values()))
    # Append one row to the CSV (Excel-friendly output)
    def save_excel(self, data):
        # newline='' prevents blank rows on Windows; the with-block
        # closes the file after each row
        with open('人人贷.csv', 'a', newline='') as out:
            writer = csv.writer(out, dialect='excel')
            writer.writerow(data)
    def run(self):
        # Start fresh: remove any previous output, write the header row, then crawl
        if os.path.exists('./人人贷.csv'):
            os.remove('./人人贷.csv')
        self.save_excel('LoanId LoanType TotalAmount AnnualRate Term RiskLevel RepaySchedule RepayMethod RepaySource'
                        ' Name Gender Age Marriage Education Income HouseLoan CarLoan Industry OtherDebt'
                        ' Applications SuccessfulLoans PaidOffCount CreditLimit TotalBorrowed OutstandingAmount OverdueAmount OverdueCount SeriousOverdue'.split())
        self.get_sanbiao()


if __name__ == '__main__':
    spider = Spider()
    spider.run()
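To make the extraction step in parse_detailinfo easier to follow in isolation: each detail page embeds its data as a JavaScript string literal, var info = '{...}', with quotes escaped as \u0022. Below is a standalone sketch of the same unescape / regex / json.loads sequence, run against a made-up fragment rather than a real page.

import json
import re

# Made-up fragment imitating the embedded data on a detail page.
html = "var info = '{\\u0022loan\\u0022:{\\u0022loanId\\u0022:12345}}';"

# Same three steps as parse_detailinfo above.
cleaned = html.replace('\\u0022', '"').replace('\\u005C', '\\')
info = json.loads(re.compile("var info = '({.*?})'", re.S).findall(cleaned)[0])
print(info['loan']['loanId'])  # -> 12345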
Note: when you use this, remember to replace the Cookie with your own. Also, since the site's structure can change at any time, this crawler can stop working at any time too; you will need to debug it yourself against the live site.
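One way to avoid pasting the cookie into the source each time is to read it from the environment at startup. A minimal sketch, assuming the Spider class above is importable and you have exported a variable named RRD_COOKIE (both names are my own choice, not part of the original code):

import os

cookie = os.environ.get('RRD_COOKIE')  # hypothetical variable name
if not cookie:
    raise SystemExit('export RRD_COOKIE with your browser cookie first')

spider = Spider()
spider.headers['Cookie'] = cookie  # override the hard-coded value
spider.run()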
I'm also sharing a copy of the data I scraped at the time; follow the WeChat public account BatFor to get it.