前言
在这里我就不再一一介绍每个步骤的具体操作了,因为在爬取老版今日头条数据的时候都已经讲得非常清楚了,所以在这里我只会讲述重点部分是怎么实现的。如果想要看具体步骤,请先去看我关于今日头条的文章,里面非常详细地介绍了怎么找到加密的js代码和api接口。
Python3爬取今日头条文章视频数据,完美解决as、cp、_signature的加密方法
QQ群聊
855262907
分析土巴兔装修网
先进行一次登录,然后在开发者工具里面查找带参数的请求链接,搜索关键词username或者password,看看有没有结果。
我们搜索发现,val和password这两个参数被加密了。val猜测应该就是账号,要不然账号就没有被传输到服务器上了。既然找到了被加密的参数,就继续搜索,看看是在哪个地方进行加密的。
第一个js文件名以login命名,明显更符合登陆时用的js文件,第二个是一段js代码:jq('#rsa_userNum').val(rsaString(password)); jq('#rsa_userName').val(rsaString(username))
很明显是jQuery的语法,并且使用了RSA算法进行加密。
直接下断点,然后再登录一下看看,可以看到账号密码就是在这里进行加密的。那我们跟进去,找到关键的public_key,详细的原理可以百度RSA算法。
从这里可以知道,先进行RSA算法加密,然后再进行一次urlencode编码。
跟进去之后发现public_key
,这个是RSA算法
的公钥
,得到后就简单多了,直接上代码吧。
Python代码:
import requests
from lxml import etree
import rsa
import base64
from PIL import Image
import pytesseract
class To8To():
    """Client that logs in to to8to.com with RSA-encrypted credentials.

    Constructing an instance encrypts the given username and password with
    the site's public key and immediately performs the login request. When
    the response asks for a captcha, the captcha image is downloaded through
    the same session, recognised with Tesseract and the login is retried
    once with the recognised text.
    """

    LOGIN_URL = "https://www.to8to.com/new_login.php"
    # Seconds to wait for each HTTP request so a stalled server cannot hang
    # the script forever.
    REQUEST_TIMEOUT = 10
    # Local file the downloaded captcha image is written to.
    CAPTCHA_FILE = 'yzm.png'
    # Response fragments that trigger the captcha retry.
    # NOTE(review): these must match the server text exactly (including
    # punctuation width) - confirm against a real response.
    CAPTCHA_MARKERS = ("用户名不存在", "登录失败次数过多,请30分钟后再试!")
    # Public key used by the site's login script (PEM, SubjectPublicKeyInfo).
    PUBLIC_KEY_PEM = """-----BEGIN PUBLIC KEY-----
MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDhNhuAr4UjFv+cj99PbAQWWx9H
X+3jSRThJqJdXkWUMFMTRay8EYRtPFIiwiOUU4gCh4ePMxiuZJWUBHe1waOkXEFc
Kg17luhVqECsO+EOLhxa3yHoXA5HcSKlG85hNV3G4uQCr+C8SOE0vCGTnMdnEGmU
nG1AGGe44YKy6XR4VwIDAQAB
-----END PUBLIC KEY-----"""

    def __init__(self, username, password):
        """Encrypt the credentials and log in.

        Args:
            username: Plain-text account name.
            password: Plain-text password.
        """
        # One session for every request so cookies set by the login page are
        # also sent when the captcha image is fetched.
        self.session = requests.Session()
        self.username = self.RSAString(username)
        self.password = self.RSAString(password)
        self.login()

    def RSAString(self, text):
        """Return *text* RSA-encrypted, base64-encoded and percent-encoded.

        Args:
            text: Plain-text string to encrypt.

        Returns:
            The percent-encoded base64 ciphertext as a ``str``.
        """
        public_key = rsa.PublicKey.load_pkcs1_openssl_pem(
            self.PUBLIC_KEY_PEM.encode('utf-8')
        )
        encrypted = rsa.encrypt(text.encode('utf-8'), public_key)
        # NOTE(review): the value is percent-encoded here and form-encoded
        # again by requests when posted; this is kept as in the original
        # flow - confirm the server expects the double encoding.
        return requests.utils.quote(base64.b64encode(encrypted).decode())

    def login(self):
        """Post the encrypted credentials, retrying once with a captcha.

        The response body of every login attempt is printed. If the first
        response contains one of ``CAPTCHA_MARKERS`` and the page carries a
        captcha image, the image is recognised and the login is posted again
        with the recognised text in the ``yzm`` field.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            # "br" is not advertised: requests can only decode brotli when an
            # optional brotli package is installed, and an undecoded body
            # would break the text checks below.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # Content-Length is left to requests, which computes it from the
            # actual encoded body.
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'www.to8to.com',
            'Origin': 'https://www.to8to.com',
            'Referer': 'https://www.to8to.com/new_login.php',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        }
        data = {
            'referer': 'https://www.to8to.com/new_login.php',
            'val': self.username,
            'password': self.password,
        }
        response = self.session.post(
            url=self.LOGIN_URL,
            headers=headers,
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        print(response.text)

        if not any(marker in response.text for marker in self.CAPTCHA_MARKERS):
            return

        html = etree.HTML(response.text)
        sources = html.xpath('//img[@id="passport"]/@src') if html is not None else []
        if not sources:
            # No captcha image on the page: nothing to recognise, so there is
            # no point in retrying (the response was already printed above).
            return

        # urljoin resolves protocol-relative ("//host/path"), relative and
        # absolute src values against the login URL.
        yzm_url = requests.compat.urljoin(self.LOGIN_URL, sources[0])
        data['yzm'] = self.img_to_text(yzm_url)
        response = self.session.post(
            url=self.LOGIN_URL,
            headers=headers,
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        )
        print(response.text)

    @staticmethod
    def _binarization_table(threshold=127):
        """Return a 256-entry lookup table: 0 below *threshold*, 1 otherwise."""
        return [0 if level < threshold else 1 for level in range(256)]

    def img_to_text(self, url):
        """Download the captcha image at *url* and return the recognised text.

        The image is saved to ``CAPTCHA_FILE``, converted to greyscale,
        binarised and passed to Tesseract.

        Args:
            url: Absolute URL of the captcha image.

        Returns:
            The OCR result with surrounding whitespace removed.

        Raises:
            requests.HTTPError: If the image request returns an error status.
        """
        headers = {
            # The HTTP header is spelled "Referer". The HTTP/2 pseudo-headers
            # (method/path/scheme) copied from the browser dev tools are not
            # real request headers and are therefore not sent.
            'referer': 'https://www.to8to.com/new_login.php',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        }
        # Fetch through the login session so the request carries the same
        # cookies as the login attempt that will submit the captcha.
        response = self.session.get(
            url=url,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        with open(self.CAPTCHA_FILE, 'wb') as f:
            f.write(response.content)

        # convert() returns an independent image, so the file handle can be
        # released as soon as the greyscale copy exists.
        with Image.open(self.CAPTCHA_FILE) as captcha:
            greyscale = captcha.convert('L')
        # point() returns a new image; the result must be kept, otherwise the
        # un-binarised greyscale image is what gets recognised.
        binary = greyscale.point(self._binarization_table(), '1')
        # Tesseract output ends with whitespace/newline characters that must
        # not be submitted as part of the captcha.
        return pytesseract.image_to_string(binary).strip()
if __name__ == "__main__":
    # Script entry point: replace the two placeholders with a real account
    # name and password. Constructing the client performs the login.
    To8To(username='账号', password='密码')
验证码的识别率不是很高,所以不一定能识别成功,你们可以进行修改,改成手动输入或者对接OCR接口。