Python爬虫模拟登陆知乎

在爬取一些网页的时候，往往有些网页的内容是需要登陆才可以获取的，这个时候我们就需要用到模拟登陆。
一开始搞模拟登陆的时候，我尝试过使用selenium的模拟点击，但是感觉那样太麻烦：一是每个网页都需要写特定的规则；二是耗时太长，效率太低；三是还要处理验证码。
所以还是安安心心地用post请求吧。
# coding=UTF-8
import re
import requests
from bs4 import BeautifulSoup
import time
import lxml
from PIL import Image
import json
import time
import cookielib
from mycptcha import APIClient
# import http.cookiejar
class Zhihu(object):
    """Log in to zhihu.com with plain HTTP requests and keep the session.

    The instance owns a ``requests.Session`` whose cookies are stored in an
    ``LWPCookieJar`` backed by the file ``cookie`` in the working directory,
    so the cookies saved by one run can be loaded again by the next one.
    """

    def __init__(self):
        """Set up request headers and the session, then try to load saved cookies."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
            "Host": "www.zhihu.com",
            "Referer": "https://www.zhihu.com/",
        }

        self.session = requests.Session()
        self.session.cookies = cookielib.LWPCookieJar("cookie")
        try:
            self.session.cookies.load(ignore_discard=True)
        except IOError:
            # The cookie file could not be read (e.g. nothing saved yet);
            # carry on with an empty jar.
            print('Cookie未加载！')

    def get_xsrf(self):
        """Fetch the home page and return the XSRF token embedded in it.

        The token is later sent back as the ``_xsrf`` form field on login.

        Returns:
            The ``value`` attribute of the chosen ``<input>`` element.

        Raises:
            ValueError: if the page contains no ``<input>`` element at all.
        """
        html = self.session.get('https://www.zhihu.com', headers=self.headers).text
        soup = BeautifulSoup(html, 'lxml')
        # Prefer an input explicitly named "_xsrf" (assumed to match the form
        # key used in login -- TODO confirm against the live page); otherwise
        # fall back to the first <input>, which is what the code used before.
        field = soup.find('input', attrs={'name': '_xsrf'})
        if field is None:
            field = soup.find('input')
        if field is None:
            raise ValueError('no <input> element found on the zhihu home page')
        return field.get('value')

    def get_captcha(self):
        """Download a login captcha image and return the recognised text.

        The image is written to ``cptcha.gif`` in the working directory and
        the answer is taken from ``APIClient.result()``.
        NOTE(review): presumably APIClient reads cptcha.gif itself -- confirm
        in the mycptcha module.
        """
        cli = APIClient()
        # Millisecond timestamp passed as the "r" query parameter.
        t = str(int(time.time() * 1000))
        # Use https like every other request made through this session.
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
        response = self.session.get(captcha_url, headers=self.headers)
        with open('cptcha.gif', 'wb') as f:
            f.write(response.content)

        print('正在为您自动识别并输入验证码...')
        return cli.result()

    def login(self, username, password):
        """Log in with a phone number or an e-mail address.

        Args:
            username: an 11-digit phone number, or otherwise an e-mail address.
            password: the account password.

        Returns:
            False when the last server reply reports ``r == 1`` (the status
            this code treats as a failed login), True otherwise.

        The session cookies are written to the cookie file in every case.
        """
        # An 11-digit user name is sent as phone_num, anything else as email.
        if re.match(r'\d{11}$', username):
            url = 'https://www.zhihu.com/login/phone_num'
            account_field = 'phone_num'
        else:
            url = 'https://www.zhihu.com/login/email'
            account_field = 'email'
        data = {
            '_xsrf': self.get_xsrf(),
            'password': password,
            'remember_me': 'true',
            account_field: username,
        }

        response = self.session.post(url, data=data, headers=self.headers)
        result = json.loads(response.text)
        # Status 1 means the attempt was rejected; retry once with a captcha.
        if result['r'] == 1:
            data['captcha'] = self.get_captcha()
            response = self.session.post(url, data=data, headers=self.headers)
            result = json.loads(response.text)
            # Report the outcome of the second attempt.
            print(result['msg'])

        self.session.cookies.save(ignore_discard=True, ignore_expires=True)
        return result.get('r') != 1

    def is_login(self):
        """Return True when the current session is already logged in.

        The profile settings page is requested with redirects disabled:
        a logged-out request is redirected, and following the redirect
        would also end in a 200 response.
        """
        url = "https://www.zhihu.com/settings/profile"
        login_code = self.session.get(url, headers=self.headers, allow_redirects=False)
        return login_code.status_code == 200

    def get_session(self):
        """Return the underlying ``requests.Session`` object."""
        return self.session

if __name__=='__main__':
       # Script entry point: reuse a saved login when possible, otherwise ask
       # for credentials, then fetch and print the profile settings page.
       import getpass  # local import: only the interactive path needs it

       zhihu=Zhihu()
       if zhihu.is_login():
              print('已经登陆过的')
       else:
              username=raw_input('请输入用户名：')
              # Read the password without echoing it to the terminal.
              password=getpass.getpass('请输入密码：')
              zhihu.login(username,password)
       # Both paths continue with the same session object.
       session=zhihu.get_session()
       url = "https://www.zhihu.com/settings/profile"
       info=session.get(url,headers=zhihu.headers)
       print(info.text)
Python爬虫模拟登陆知乎

猜你喜欢