1.在头文件中带上登录后的cookies值
在spider文件中
name = '爬虫名字'
allowed_domains = [' ']
start_urls = ( )
cookies = { }
……(一些获取 HTML 的函数)
最后调用回调函数
def start_requests(self):
    """Issue the initial requests with the saved login cookies attached.

    Iterates over ``self.start_urls`` and yields one GET request per URL,
    carrying ``self.cookies`` so each page is fetched as a logged-in user.
    Responses are handled by ``self.parse_page``.
    """
    for url in self.start_urls:
        # Example profile URL: "http://www.renren.com/410043129/profile"
        # A plain GET is all that is needed here. The original yielded
        # scrapy.FormRequest without ``formdata`` — which degenerates to a
        # GET anyway — so scrapy.Request states the intent directly.
        yield scrapy.Request(url, cookies=self.cookies, callback=self.parse_page)
2.start_requests(self),从登录界面开始爬取 带上账号和密码登录,回调函数处理的是主页面
def start_requests(self):
    """Start the crawl by POSTing the credentials to the login endpoint.

    Scrapy calls this instead of fetching ``start_urls``; the account and
    password are submitted as form data and the post-login page is handed
    to ``self.parse_page``.
    """
    url = 'http://www.renren.com/PLogin.do'
    yield scrapy.FormRequest(
        url=url,
        formdata={"email": "[email protected]", "password": "alarmchime"},
        callback=self.parse_page,
    )

def parse_page(self, response):
    """Dump the fetched page body to a local file for inspection."""
    # BUG FIX: ``response.body`` is bytes, so the file must be opened in
    # binary mode ("wb"); the original text-mode "w" raises TypeError on
    # Python 3.
    with open("mao2.html", "wb") as filename:
        filename.write(response.body)

# 3. 向登录界面发起一个请求获得隐藏字段
#    (3. First send a request to the login page to obtain the hidden token field)
start_urls = ( "http://www.renren.com/PLogin.do", ) def parse(self, response): (通过xpath获得隐藏字段) _xsrf = response.xpath("//_xsrf").extract()[0] yield scrapy.FormRequest.from_response( response, formdata = {"email" : "[email protected]", "password" : "alarmchime"}, "_xsrf" = _xsrf}, callback = self.parse_page )