scrapy爬取需要登录的网站(知乎)

法一:使用selenium

在middlewares.py中



import time
from scrapy import signals
from selenium import webdriver
from scrapy.http import HtmlResponse
import requests

class LoginMiddle(object):
    """Downloader middleware that performs a Selenium-driven login to zhihu.com.

    First request (URL contains 'signup'): drive a Chrome browser through the
    login form, store the resulting cookies on the spider, and return the
    logged-in page. Later requests: replay the stored cookies with a plain
    ``requests`` session and return the fetched page, bypassing Scrapy's
    downloader entirely (returning an HtmlResponse short-circuits the download).
    """

    def process_request(self, request, spider):
        # Only intercept requests from the 'myzhihu' spider.
        if spider.name == 'myzhihu':
            if request.url.find('signup') != -1:
                # 'signup' in the URL means we are not logged in yet:
                # perform the interactive login with Selenium.
                spider.broswer = webdriver.Chrome()
                spider.broswer.get(request.url)  # open the start_url login page
                time.sleep(1)
                # Switch the sign-up form to the password-login tab.
                spider.broswer.find_element_by_xpath('//div[@class="SignContainer-switch"]/span').click()
                time.sleep(1)
                # Locate the credential input boxes.
                username = spider.broswer.find_element_by_name('username')
                password = spider.broswer.find_element_by_name('password')
                # Fill in the credentials.
                username.send_keys("188775729")
                password.send_keys("changeme_123")
                time.sleep(5)
                spider.broswer.find_element_by_xpath('//button[@class="Button SignFlow-submitButton Button--primary Button--blue"]').click()
                time.sleep(2)
                # Persist the post-login cookies on the spider for later requests.
                spider.cookies = spider.broswer.get_cookies()
                # Returning an HtmlResponse skips the normal download.
                return HtmlResponse(url=spider.broswer.current_url,  # URL after login redirect
                                    body=spider.broswer.page_source,  # source of the current page
                                    encoding="utf-8")
            else:
                # Already logged in: replay the saved cookies via requests.
                session = requests.session()
                session.headers.clear()  # drop requests' default headers
                # BUG FIX: set each cookie as (name, value) — the original
                # passed a single tuple-index lookup, which raised TypeError.
                for cookie in spider.cookies:
                    session.cookies.set(cookie['name'], cookie['value'])
                # BUG FIX: fetch once, AFTER all cookies are installed — the
                # original fetched and returned inside the loop on the first
                # cookie. Also fixed the 'urf-8' encoding typo.
                newpage = session.get(request.url, verify=False).content.decode()
                return HtmlResponse(url=request.url, body=newpage, encoding='utf-8')




myzhihu.py中

import scrapy


class MyzhihuSpider(scrapy.Spider):
    """Zhihu spider whose login is delegated to the Selenium middleware.

    The middleware fills in ``broswer`` (the Selenium driver) and ``cookies``
    (the post-login cookie list); both start out as None here.
    """

    name = 'myzhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/signup?next=%2F']

    def __init__(self):
        # Placeholders the login middleware populates later.
        self.broswer = None
        self.cookies = None
        super(MyzhihuSpider, self).__init__()

    def parse(self, response):
        # Dump the decoded page so the login result can be inspected.
        page_text = response.body.decode('utf-8', 'ignore')
        print(page_text, "*****************")

法二:直接携带已保存的cookie发起请求

在myzhihu.py中

class MyzhihuSpider(scrapy.Spider):
    """Zhihu spider that logs in by attaching previously captured cookies.

    The ``cookies`` dict was produced by splitting a raw Cookie header; the
    original paste left a leading space in several names (' _xsrf', ' d_c0',
    ...), so the wrong cookie names were sent — the keys are stripped here.
    """

    name = 'myzhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/signup?next=%2F']
    # BUG FIX: cookie names must not carry leading whitespace.
    cookies = {'tgw_l7_route': '4902c7c12bebebe28366186aba4ffcde', '_xsrf': '0813131a-5167-4b4f-a038-d95fdf5f9147', 'd_c0': '"ADBm3dyntw2PTnp2Oa_uhUlVkwmXdBlEn54=|1528443225"', 'q_c1': '7d89624a3c254aa6a0b98bbdaf1e7304|1528443225000|1528443225000', '_zap': '1df9c3c5-24fe-44e5-9dc6-117a0fa7b0c3', 'capsion_ticket': '"2|1:0|10:1528443398|14:capsion_ticket|44:Y2UxMTJjNDY2OTU5NDA1OTk1YWI3MzI4NmIxY2UxMmI=|7a0a3d835889610efdc1cf3c5981eec74fa608a24bf0026013a96eb7100ef1a3"', 'z_c0': '"2|1:0|10:1528443425|4:z_c0|92:Mi4xVC1LdkJBQUFBQUFBTUdiZDNLZTNEU1lBQUFCZ0FsVk5JWUFIWEFBOS1fNWhkV25fSVRXVnB3YTR2RjlRU0hqbWh3|a4e98e9fec089e37c36528c82d5824da1b2c4eb2fb5432e8deef34bba6319ef7"'}

    def start_requests(self):
        # A plain GET with the saved cookies is all that is needed;
        # FormRequest without formdata was just a Request in disguise.
        url = "https://www.zhihu.com/"
        yield scrapy.Request(url, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        # Dump the decoded page so the login result can be inspected.
        print(response.body.decode('utf-8', 'ignore'))

cookie转为字典方法:

def changeCookie(cookie):
    """Convert a raw Cookie header string into a name->value dict.

    :param cookie: raw header text, e.g. ``"a=1; b=2"``
    :return: dict mapping cookie names to values, both stripped of
             surrounding whitespace

    Fixes over the original: empty segments (e.g. a trailing ``';'``) no
    longer raise IndexError, each segment is split only once, and names
    are stripped so ``"a=1; b=2"`` yields the key ``'b'`` not ``' b'``.
    """
    cookie_dict = {}
    for item in cookie.split(";"):
        item = item.strip()
        if not item:
            continue  # tolerate trailing ';' and doubled separators
        # Split on the FIRST '=' only: values may themselves contain '='.
        name, _, value = item.partition("=")
        cookie_dict[name.strip()] = value.strip()
    return cookie_dict


if __name__ == '__main__':
    # Demo: convert a raw browser cookie header into a dict and show it.
    raw_cookie = "RK=rmHT7e2LQD; pgv_pvi=9920637952; pac_uid=1_1687458949; eas_sid=H1z5Y0H4U3M3f8E004v1F3X6I7; LW_uid=Y1W590G4O333K87114z6Z4L5J4; LW_sid=Y1R53084d3L3t8U2O292E7c2p8; pgv_pvid_new=1687458949_103e8ac4183; tvfe_boss_uuid=5eed6635b6c9405a; mobileUV=1_15ecd77b5fe_99bfa; o_cookie=1687458949; qz_screen=1366x768; QZ_FE_WEBP_SUPPORT=1; pgv_pvid=2040220332; ptui_loginuin=1687458949; ptcz=1fd2762d2f45e34542eb5fcd75ac829861fc1c6ac443d23fccfd1ce0edd2fc9a; pt2gguin=o1687458949; __Q_w_s__QZN_TodoMsgCnt=1; g_ut=2; luin=o1687458949; lskey=00010000b3b68e991995d80b79f9c52a99c16eaf49281ced7df1d646c4bdd1d0bbd873720e1ce3bfe7780b86; pgv_info=ssid=s3993669940; pgv_si=s683402240; _qpsvr_localtk=0.20081414178281976; ptisp=ctc; uin=o1687458949; skey=@V88NAVwlq; p_uin=o1687458949; pt4_token=EInWhEKDya22RamMyviig8ozN6-NGB4bcYZ*WhjWhZY_; p_skey=BXvbOSDmGtLOCVr1BWlKf-ga4OylhjjAGXqkSSdDgG8_; Loading=Yes; 1687458949_todaycount=0; 1687458949_totalcount=0; cpu_performance_v8=54; x-stgw-ssl-info=99b84715396d1517bd1ed06c301aeca9|0.112|1528425001.578|1|r|I|TLSv1.2|ECDHE-RSA-AES128-GCM-SHA256|13500|N|0"
    print(changeCookie(raw_cookie))

发布了18 篇原创文章 · 获赞 7 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_39965716/article/details/80623605