I spent a whole afternoon on this scraper and finally got the garbled digits decoded, only to find that every number it outputs is wrong. Could any of the experts here help me out? Waiting online. Below is my code:
import requests
from lxml import etree
import time
import json
import random
import base64
from fontTools.ttLib import TTFont
import re
from io import BytesIO
name = input('请输入城市(拼音):')  # prompt: enter the city name in pinyin
urlt = 'https://' + name + '.zu.anjuke.com'
res = requests.get(urlt)
# The obfuscation font is shipped base64-encoded inside the page source
bs64_str = re.findall(r"charset=utf-8;base64,(.*?)'\)", res.text)[0]
def get_page_show_ret(string):
    # Map each obfuscated character to a real digit via the font's cmap
    font = TTFont(BytesIO(base64.decodebytes(bs64_str.encode())))
    c = font.getBestCmap()
    ret_list = []
    for char in string:
        decode_num = ord(char)
        if decode_num in c:
            # Glyph names end in a two-digit index; the real digit is that index - 1
            num = int(c[decode_num][-2:]) - 1
            ret_list.append(num)
        else:
            # Not an obfuscated digit (e.g. '㎡' or '室'); keep it as-is
            ret_list.append(char)
    return ''.join(str(num) for num in ret_list)
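
# My guess at what's going wrong, in case it helps anyone answer: bs64_str is
# taken once from the homepage, but if every detail page embeds its own font,
# decoding with the homepage cmap would produce digits that look plausible but
# are shuffled. A sketch of what I mean (decode_with_page_font is my own name,
# and I'm assuming the same base64 pattern appears in detail-page HTML):
def decode_with_page_font(string, page_html):
    m = re.search(r"charset=utf-8;base64,(.*?)'\)", page_html)
    if not m:
        return string  # no embedded font found; leave the text untouched
    cmap = TTFont(BytesIO(base64.b64decode(m.group(1)))).getBestCmap()
    return ''.join(
        str(int(cmap[ord(ch)][-2:]) - 1) if ord(ch) in cmap else ch
        for ch in string
    )
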
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
def parse(url_):
    # Fetch a page and return an lxml tree
    response = requests.get(url_, headers=headers)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)
def parse_detail(list_url):
    # Walk a listing page and follow every item's detail link
    selector = parse(list_url)
    time.sleep(random.randint(0, 2))
    all_list = selector.xpath('//*[@class="zu-itemmod"]')
    for sel in all_list:
        url_a = sel.xpath('div[1]/h3/a/@href')[0]
        parse_id_detail(url_a)
def parse_id_detail(url_a):
    alls = requests.get(url_a, headers=headers)
    alls.encoding = 'utf-8'
    selector = etree.HTML(alls.text)
    time.sleep(random.randint(0, 2))
    item = {}
    # Fields whose digits are obfuscated go through get_page_show_ret
    price = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[1]/span)')
    price = get_page_show_ret(price)
    house_type = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[2]/span[2])')
    house_type = get_page_show_ret(house_type)  # renamed from `type` to avoid shadowing the builtin
    mianji = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[3]/span[2])')
    mianji = get_page_show_ret(mianji)
    chaoxiang = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[4]/span[2])')
    height = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[5]/span[2])')
    zhuangxiu = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[6]/span[2])')
    leixing = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[7]/span[2])')
    place = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[8]/a)')
    yaoqiu = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[9]/span[2])')
    try:
        bianma = selector.xpath('//*[@class="right-info"]/span/text()')[0]
    except IndexError:
        bianma = ''
    try:
        times = selector.xpath('//*[@class="right-info"]/b/text()')[0]
    except IndexError:
        times = ''
    times = get_page_show_ret(times)
    item['price'] = price
    item['type'] = house_type
    item['mianji'] = mianji
    item['chaoxiang'] = chaoxiang
    item['height'] = height
    item['zhuangxiu'] = zhuangxiu
    item['leixing'] = leixing
    item['place'] = place
    item['yaoqiu'] = yaoqiu
    item['bianma'] = bianma
    item['times'] = times
    # Append each record; note the file name is hard-coded regardless of the city entered
    with open('nanjing.json', 'a', encoding='utf-8') as f:
        json.dump(item, f, ensure_ascii=False, indent=4)
    print(price, house_type, mianji, chaoxiang, height, zhuangxiu, leixing, place, yaoqiu, bianma, times)
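
# Side note on the output: appending separate json.dump objects means
# nanjing.json as a whole is not valid JSON. One object per line (JSON Lines)
# would be easier to load back later -- just a sketch:
#   with open('nanjing.json', 'a', encoding='utf-8') as f:
#       f.write(json.dumps(item, ensure_ascii=False) + '\n')
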
url_lists = 'https://' + name + '.zu.anjuke.com/fangyuan/p'
all_url = [url_lists + str(i) for i in range(1, 30)]  # listing pages 1-29
for url in all_url:
    parse_detail(url)
The dates it scrapes are completely off as well. Still waiting online, any help appreciated!
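In case it matters, here is how I could test my guess that each detail page carries its own font. Both URLs are made-up placeholders, not listings I have verified:

# Hypothetical sanity check: fetch two detail pages and compare the base64
# font strings embedded in each. If they differ, decoding every page with the
# homepage font cannot work. Both URLs below are placeholders.
import re
import requests

headers = {'user-agent': 'Mozilla/5.0'}
pattern = r"charset=utf-8;base64,(.*?)'\)"
url_one = 'https://nanjing.zu.anjuke.com/fangyuan/0000000001'  # placeholder
url_two = 'https://nanjing.zu.anjuke.com/fangyuan/0000000002'  # placeholder
font_one = re.search(pattern, requests.get(url_one, headers=headers).text)
font_two = re.search(pattern, requests.get(url_two, headers=headers).text)
if font_one and font_two:
    print('fonts identical across pages:', font_one.group(1) == font_two.group(1))
else:
    print('could not find an embedded font on one of the pages')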