方法一:使用 Selenium
在middlewares.py中
import time
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.http import HtmlResponse
import requests


class LoginMiddle(object):
    """Downloader middleware that logs the ``myzhihu`` spider in via Selenium.

    The sign-up URL is opened in a real Chrome window, the login form is
    filled in and submitted, and the browser's cookies are stored on the
    spider. Every other URL requested by that spider is fetched with a
    ``requests`` session carrying those cookies. Requests from any other
    spider are left untouched.
    """

    # NOTE(review): credentials are hard-coded in source; they should be
    # loaded from settings or the environment instead.
    LOGIN_USERNAME = "188775729"
    LOGIN_PASSWORD = "changeme_123"

    def process_request(self, request, spider):
        """Short-circuit the download for the ``myzhihu`` spider.

        :param request: the outgoing request.
        :param spider: the spider that issued the request; ``broswer`` and
            ``cookies`` attributes are written to / read from it.
        :return: an ``HtmlResponse`` for the ``myzhihu`` spider, otherwise
            ``None`` (nothing is returned for other spiders).
        """
        # Only the spider named 'myzhihu' is handled here.
        if spider.name != 'myzhihu':
            return None
        # The start URL contains 'signup'; that request triggers the login.
        if 'signup' in request.url:
            return self._login_with_browser(request, spider)
        return self._fetch_with_session(request, spider)

    def _login_with_browser(self, request, spider):
        """Drive Chrome through the login form and keep the cookies."""
        browser = webdriver.Chrome()
        # Attribute name (including its spelling) is what the spider declares.
        spider.broswer = browser
        browser.get(request.url)
        time.sleep(1)
        # Click the switch element inside the sign container.
        browser.find_element(
            By.XPATH, '//div[@class="SignContainer-switch"]/span').click()
        time.sleep(1)
        # Locate the input boxes and type the credentials into them.
        username = browser.find_element(By.NAME, 'username')
        password = browser.find_element(By.NAME, 'password')
        username.send_keys(self.LOGIN_USERNAME)
        password.send_keys(self.LOGIN_PASSWORD)
        # NOTE(review): presumably this pause leaves time for a manual
        # captcha step before submitting -- confirm.
        time.sleep(5)
        browser.find_element(
            By.XPATH,
            '//button[@class="Button SignFlow-submitButton '
            'Button--primary Button--blue"]').click()
        time.sleep(2)
        # Remember the cookies held by the browser after submitting.
        spider.cookies = browser.get_cookies()
        return HtmlResponse(url=browser.current_url,
                            body=browser.page_source,
                            encoding="utf-8")

    def _fetch_with_session(self, request, spider):
        """Fetch ``request.url`` with ``requests`` using the stored cookies."""
        session = requests.session()
        # spider.cookies is a list of dicts from Selenium; it is still None
        # if no login request has been processed yet.
        for cookie in spider.cookies or ():
            session.cookies.set(cookie['name'], cookie['value'])
        # Drop the default requests headers.
        session.headers.clear()
        # NOTE(review): verify=False disables TLS certificate verification.
        page = session.get(request.url, verify=False).content.decode()
        return HtmlResponse(url=request.url, body=page, encoding='utf-8')
在myzhihu.py中
import scrapy


class MyzhihuSpider(scrapy.Spider):
    """Spider whose start URL is the Zhihu sign-up page.

    ``broswer`` and ``cookies`` start out as ``None``; they are placeholders
    that the login downloader middleware fills in with the Selenium driver
    and the cookies captured after login.
    """

    name = 'myzhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/signup?next=%2F']

    def __init__(self, *args, **kwargs):
        """Initialise the placeholders and forward all arguments to Scrapy.

        Accepting ``*args``/``**kwargs`` keeps spider arguments
        (``scrapy crawl myzhihu -a key=value``) working.
        """
        # The attribute spelling 'broswer' is kept because the middleware
        # assigns to this exact name.
        self.broswer = None
        self.cookies = None
        super(MyzhihuSpider, self).__init__(*args, **kwargs)

    def closed(self, reason):
        """Quit the browser, if one was started, when the spider closes.

        :param reason: the close reason string supplied by Scrapy (unused).
        """
        if self.broswer is not None:
            self.broswer.quit()
            self.broswer = None

    def parse(self, response):
        """Print the decoded page body followed by a marker string."""
        print(response.body.decode('utf-8', 'ignore'), "*****************")
方法二:
在myzhihu.py中
class MyzhihuSpider(scrapy.Spider):
    """Spider that requests the Zhihu home page with pre-captured cookies.

    ``start_requests`` is overridden, so ``start_urls`` is not used for the
    initial request.
    """

    name = 'myzhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/signup?next=%2F']
    # Cookies as a name -> value dict. Names must not carry the leading
    # space left over from splitting a raw Cookie header on ';'.
    # NOTE(review): these look like live session tokens committed to source;
    # they should be loaded from configuration instead.
    cookies = {
        'tgw_l7_route': '4902c7c12bebebe28366186aba4ffcde',
        '_xsrf': '0813131a-5167-4b4f-a038-d95fdf5f9147',
        'd_c0': '"ADBm3dyntw2PTnp2Oa_uhUlVkwmXdBlEn54=|1528443225"',
        'q_c1': '7d89624a3c254aa6a0b98bbdaf1e7304|1528443225000|1528443225000',
        '_zap': '1df9c3c5-24fe-44e5-9dc6-117a0fa7b0c3',
        'capsion_ticket': '"2|1:0|10:1528443398|14:capsion_ticket|44:Y2UxMTJjNDY2OTU5NDA1OTk1YWI3MzI4NmIxY2UxMmI=|7a0a3d835889610efdc1cf3c5981eec74fa608a24bf0026013a96eb7100ef1a3"',
        'z_c0': '"2|1:0|10:1528443425|4:z_c0|92:Mi4xVC1LdkJBQUFBQUFBTUdiZDNLZTNEU1lBQUFCZ0FsVk5JWUFIWEFBOS1fNWhkV25fSVRXVnB3YTR2RjlRU0hqbWh3|a4e98e9fec089e37c36528c82d5824da1b2c4eb2fb5432e8deef34bba6319ef7"',
    }

    def start_requests(self):
        """Yield a single request for the home page carrying ``cookies``."""
        url = "https://www.zhihu.com/"
        # No form data is submitted, so a plain Request is sufficient.
        yield scrapy.Request(url, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        """Print the decoded page body."""
        print(response.body.decode('utf-8', 'ignore'))
cookie转为字典方法:
def changeCookie(cookie):
    """Convert a raw ``Cookie`` header string into a dict.

    The string is split on ``;`` and each segment is split on its first
    ``=``, so values that themselves contain ``=`` are kept whole. Names and
    values are stripped of surrounding whitespace. Segments that are empty
    or have no ``=`` (for example the one produced by a trailing ``;``) are
    skipped instead of raising ``IndexError``.

    :param cookie: cookie string such as ``"a=1; b=2"``.
    :return: dict mapping cookie names to values; a later duplicate name
        overwrites an earlier one.
    """
    cookieDict = {}
    for segment in cookie.split(";"):
        # partition splits once on the first '=', leaving sep empty when
        # the segment has no '=' at all.
        name, sep, value = segment.partition("=")
        name = name.strip()
        if not sep or not name:
            continue
        cookieDict[name] = value.strip()
    return cookieDict


if __name__ == '__main__':
    cookie = "RK=rmHT7e2LQD; pgv_pvi=9920637952; pac_uid=1_1687458949; eas_sid=H1z5Y0H4U3M3f8E004v1F3X6I7; LW_uid=Y1W590G4O333K87114z6Z4L5J4; LW_sid=Y1R53084d3L3t8U2O292E7c2p8; pgv_pvid_new=1687458949_103e8ac4183; tvfe_boss_uuid=5eed6635b6c9405a; mobileUV=1_15ecd77b5fe_99bfa; o_cookie=1687458949; qz_screen=1366x768; QZ_FE_WEBP_SUPPORT=1; pgv_pvid=2040220332; ptui_loginuin=1687458949; ptcz=1fd2762d2f45e34542eb5fcd75ac829861fc1c6ac443d23fccfd1ce0edd2fc9a; pt2gguin=o1687458949; __Q_w_s__QZN_TodoMsgCnt=1; g_ut=2; luin=o1687458949; lskey=00010000b3b68e991995d80b79f9c52a99c16eaf49281ced7df1d646c4bdd1d0bbd873720e1ce3bfe7780b86; pgv_info=ssid=s3993669940; pgv_si=s683402240; _qpsvr_localtk=0.20081414178281976; ptisp=ctc; uin=o1687458949; skey=@V88NAVwlq; p_uin=o1687458949; pt4_token=EInWhEKDya22RamMyviig8ozN6-NGB4bcYZ*WhjWhZY_; p_skey=BXvbOSDmGtLOCVr1BWlKf-ga4OylhjjAGXqkSSdDgG8_; Loading=Yes; 1687458949_todaycount=0; 1687458949_totalcount=0; cpu_performance_v8=54; x-stgw-ssl-info=99b84715396d1517bd1ed06c301aeca9|0.112|1528425001.578|1|r|I|TLSv1.2|ECDHE-RSA-AES128-GCM-SHA256|13500|N|0"
    cookies = changeCookie(cookie)
    print(cookies)