# 豆瓣的模拟登录比较简单,唯一麻烦的是需要手动输入验证码。
# 注意:当登录页没有验证码时,下面的代码无法运行;不过此时登录反而更简单,
# form 表单中无需再提交 id 和 yzm(即 captcha-id 和 captcha-solution)这两个字段。
import requests
from lxml import etree
import os,time,urllib
import ssl
import re
# Globally disable HTTPS certificate verification by replacing the ssl
# module's default-context factory with the unverified one.
# NOTE(review): this is insecure and applies process-wide; kept because the
# rest of the script was written against it.
ssl._create_default_https_context = ssl._create_unverified_context

# Douban login page.
login_url = 'https://accounts.douban.com/login'

# A page that (per the original author) only a logged-in user can access;
# fetched at the end of the script to check the login.
user_page = 'https://www.douban.com/people/185450052/'

# Request headers: a desktop Chrome user agent string.
headers = {
    'User-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/60.0.3112.113 Safari/537.36'
    ),
}
# Fetch the login page and extract the captcha image URL and the captcha id
# that the login form has to submit along with the user's answer.
# A timeout keeps the script from hanging forever on a stalled connection.
response_1 = requests.get(url=login_url, headers=headers, timeout=30)
html = etree.HTML(response_1.text)

# xpath() returns a list, which is empty when the page has no captcha image.
# Fail with a clear message instead of a bare IndexError in that case.
captcha_srcs = html.xpath('//img[@id="captcha_image"]/@src')
if not captcha_srcs:
    raise SystemExit(
        'No captcha image found on the login page; '
        'this script only handles the login flow with a captcha.'
    )
captched_url = str(captcha_srcs[0])
print(captched_url)
captched_url_2 = captched_url

# The captcha id is the value of the "id" query parameter of the image URL.
# Matching up to the next "&" (or end of string) also works when "id" is the
# last parameter.
re_result = re.search(r'[?&]id=([^&]+)', captched_url)
if not re_result:
    # Without this check the name ``id`` would still refer to the builtin
    # function and that would be posted as the captcha id.
    raise SystemExit('Could not parse the captcha id from: ' + captched_url)
# NOTE: this name shadows the builtin id(); it is kept because the login form
# built further down in this file reads it under this name.
id = re_result.group(1)
print(id)
def download_picture(pic_url, save_dir='/Users/mac/Desktop/Python-Scrapy/ying/picture/'):
    """Download the image at *pic_url* into *save_dir* and return its path.

    The file is named after the current local time (``yymmddHHMMSS.jpg``).

    Args:
        pic_url: URL of the image to fetch.
        save_dir: Directory to store the image in. It is created, including
            any missing parent directories, when it does not exist yet.

    Returns:
        The full path of the file that was written.
    """
    # Imported here so that the ``urllib.request`` submodule is guaranteed to
    # be loaded; a plain ``import urllib`` does not import it.
    import urllib.request

    # Make sure the target directory exists.
    if os.path.exists(save_dir):
        print('already exist!')
    else:
        # makedirs (unlike mkdir) also creates missing parent directories.
        os.makedirs(save_dir, exist_ok=True)

    # Context managers close the response and the file even on errors.
    with urllib.request.urlopen(pic_url) as pic_data_url:
        pic_data = pic_data_url.read()

    # %H, %M and %S are hour, minute and second. The previous format used
    # '%h', '%m' and '%s', i.e. a non-standard directive, the month a second
    # time, and another non-standard directive.
    localtime = time.strftime('%y%m%d%H%M%S', time.localtime())
    filename = os.path.join(save_dir, localtime + '.jpg')
    with open(filename, 'wb') as f:
        f.write(pic_data)

    print('file' + ' ' + str(localtime) + '.jpg')
    print('Finish!')
    return filename
# Save the captcha image to disk so the user can open it and read the code.
download_picture(captched_url_2)

# input() already returns a str, so no further conversion is needed.
yzm = input('请输入验证码:')

# Form fields submitted to the login endpoint.
# NOTE(review): credentials are hard-coded as defaults; they can be overridden
# with the DOUBAN_EMAIL / DOUBAN_PASSWORD environment variables so that real
# credentials do not have to live in the source file.
post_data = {
    'source': 'movie',
    'redir': 'https://www.douban.com',
    'form_email': os.environ.get('DOUBAN_EMAIL', '[email protected]'),
    'form_password': os.environ.get('DOUBAN_PASSWORD', 'mimamima123'),
    'login': '登录',
    'captcha-solution': yzm,
    'captcha-id': id,
}

session = requests.Session()
# Send the same browser-like headers on every request of the session; before,
# they were only sent with the initial GET of the login page.
session.headers.update(headers)
# Carry over the cookies set when the login page (and its captcha) was fetched,
# as a browser would; that request was made outside of this session.
session.cookies.update(response_1.cookies)

response = session.post(login_url, data=post_data, timeout=30)
print(session.cookies.get_dict())

# Fetch a page that requires being logged in and print its HTML.
response_2 = session.get(url=user_page, timeout=30)
print(response_2.text)