前言
目标网址——起点小说:https://book.qidian.com/info/1010734492
一、什么是字体反爬?
字体反爬虫:网页对关键内容使用自定义字体进行显示,使爬虫无法从页面源码中直接获取到关键信息。
二、如何解决?
1.获取字体文件
从页面源码入手,可以看到关键数字并不是明文,而是形如 &#十进制编码; 的字符实体,脱离自定义字体后会显示成 𘡒𘡓𘡓𘡏𘡒𘡓 这样的乱码
字体的源文件地址是:https://qidian.gtimg.com/qd_anti_spider/PCiagJxf.woff
2.获取字体的映射关系
-
Python导入fontTools模块:
pip install fontTools
from fontTools.ttLib import TTFont
-
下载字体文件
# Extract the font URL from the page source. The pattern is a raw string so
# that ``\(`` reaches the regex engine instead of being an invalid str escape.
font_class = re.findall(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\), url", page_info)[0]
# The file's base name (without extension) is reused later as the span class.
font_name = font_class.split('/')[-1].split('.')[0]
print(font_class, font_name)
# Download the woff font
with open('my.woff', 'wb') as f:
    f.write(requests.get(url=font_class).content)
-
获取映射关系
def get_info():
    """Return a dict mapping obfuscated code points to real characters.

    Dumps ``my.woff`` to ``font.xml`` and pairs every ``code="..."`` with the
    ``name="..."`` found on the same line. Keys are the code points as decimal
    strings; values are the digit or ``'.'`` named by the glyph.

    Raises:
        KeyError: if a matched glyph name is not in the lookup table.
    """
    font = TTFont('my.woff')
    font.saveXML('font.xml')
    with open('font.xml', 'rb') as f:
        txt = f.read().decode('utf8')
    # Glyph name -> character it draws
    y_dict = {
        'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
        'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
        'period': '.',
    }
    # Collect (code, glyph name) pairs from the XML dump
    data2 = re.compile('code="(.*?)".*name="(.*?)"').findall(txt)
    x_dict = dict()
    for x in data2:
        # Codes are parsed as base 16; keys are stored as decimal strings.
        base_code = str(int(x[0], 16))
        x_dict[base_code] = y_dict[x[1]]
    return x_dict
-
匹配页面中被混淆的字符实体(十进制编码),并按映射关系还原为真实字符
# Find the obfuscated entity runs inside the spans styled with the custom font.
# font_name is escaped so it is always matched literally.
confuse_data = re.findall('</style><span class="' + re.escape(font_name) + '">(.*?);</span></em><cite>', page_info)
print(confuse_data)
de_confuse = []
# Translate every entity of every run through the font mapping
for data in confuse_data:
    # Strip the '&#' prefixes, leaving ';'-separated code strings.
    deal_data = data.replace('&#', '').split(';')
    x = []
    for info in deal_data:
        x.append(font_dict[info])
    de_confuse.append(''.join(x))
三、源码
'''
起点中文网 https://book.qidian.com/info/1010734492
'''
import re
from fontTools.ttLib import TTFont
import requests
def get_info():
    """Build the lookup table from obfuscated code points to real characters.

    Converts ``my.woff`` to ``font.xml`` with fontTools, then scans the XML
    text for ``code="..."`` / ``name="..."`` attribute pairs that appear on
    the same line.

    Returns:
        dict: each code point (as a decimal string) mapped to the digit or
        ``'.'`` that its glyph name stands for.

    Raises:
        KeyError: if a matched glyph name is not in the lookup table.
    """
    TTFont('my.woff').saveXML('font.xml')
    with open('font.xml', 'rb') as xml_file:
        xml_text = xml_file.read().decode('utf8')

    # Glyph name -> character it draws.
    glyph_to_char = {
        'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
        'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
        'period': '.',
    }

    code_name_pairs = re.compile('code="(.*?)".*name="(.*?)"').findall(xml_text)
    # Codes are parsed as base 16 and stored as decimal strings; when a code
    # is matched more than once, the last match wins.
    return {
        str(int(hex_code, 16)): glyph_to_char[glyph_name]
        for hex_code, glyph_name in code_name_pairs
    }
if __name__ == '__main__':
    # Script entry point: fetch the book page, download the custom font it
    # references, and decode the obfuscated counters shown on the page.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    # A timeout keeps the script from hanging forever on a stalled connection;
    # raise_for_status() stops early instead of parsing an error page.
    response = requests.get('https://book.qidian.com/info/1010734492', headers=headers, timeout=10)
    response.raise_for_status()
    page_info = response.text

    # Extract the font URL. The pattern is a raw string so that ``\(`` reaches
    # the regex engine instead of being an invalid string escape sequence.
    font_urls = re.findall(r"format\('eot'\); src: url\('(.*?)'\) format\('woff'\), url", page_info)
    if not font_urls:
        raise SystemExit('woff font URL not found in the page source')
    font_class = font_urls[0]
    # The file's base name (without extension) is used below as the span class.
    font_name = font_class.split('/')[-1].split('.')[0]
    print(font_class, font_name)

    # Download the woff font to the path get_info() reads.
    font_response = requests.get(url=font_class, timeout=10)
    font_response.raise_for_status()
    with open('my.woff', 'wb') as f:
        f.write(font_response.content)

    # Code point (decimal string) -> real character.
    font_dict = get_info()
    print(font_dict)

    # Find the obfuscated entity runs; font_name is escaped so that it is
    # always matched literally inside the pattern.
    confuse_data = re.findall('</style><span class="' + re.escape(font_name) + '">(.*?);</span></em><cite>', page_info)
    print(confuse_data)

    # Strip the '&#' prefixes, split on ';', and translate each code.
    de_confuse = [
        ''.join(font_dict[code] for code in data.replace('&#', '').split(';'))
        for data in confuse_data
    ]
    title_list = ['总字数(万)','会员周点击数','总推荐数(万)','周推荐数']
    print(dict(zip(title_list, de_confuse)))