一 Post 请求
在爬虫文件中重写父类的start_requests(self)方法
- 父类方法源码(Request):
def start_requests(self):
    """Default implementation inherited from scrapy.Spider.

    Issues a plain GET request for every URL in ``start_urls`` and routes
    each response to ``self.parse``.
    """
    for start_url in self.start_urls:
        yield scrapy.Request(url=start_url, callback=self.parse)
- 重写该方法(FormRequest(url=url,callback=self.parse,formdata=data))
def start_requests(self):
    """Override the parent hook so the spider starts with POST requests.

    ``FormRequest`` sends an HTTP POST whose body is built from ``formdata``;
    values must be strings.
    """
    form_data = {
        'kw': 'xml',
    }
    for start_url in self.start_urls:
        # POST request carrying the form parameters
        yield scrapy.FormRequest(url=start_url, callback=self.parse, formdata=form_data)
二 多页面手动爬取数据
import scrapy
from QiubaiPagePro.items import QiubaipageproItem


class QiubaiSpider(scrapy.Spider):
    """Spider that scrapes joke text/author pairs from qiushibaike.com.

    Pagination is driven manually: ``url`` is a printf-style template for
    page URLs (e.g. https://www.qiushibaike.com/text/page/13/) and ``page``
    tracks the current page number.
    """

    name = 'qiubai'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    # Template for building per-page request URLs by hand.
    url = 'https://www.qiushibaike.com/text/page/%d/'
    page = 1

    def parse(self, response):
        """Extract author and content from every post on one listing page."""
        post_divs = response.xpath('//div[@id="content-left"]/div')
        for post in post_divs:
            author = post.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content_parts = post.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content_parts)

            # Populate one item per post and hand it to the item pipeline.
            item = QiubaipageproItem()
            item['author'] = author
            item['content'] = content.strip()
            yield item
- 1 构造请求url的格式
# Printf-style template from which each page's request URL is built by hand.
url = 'https://www.qiushibaike.com/text/page/%d/'
page = 1  # current page counter for the manual pagination
- 2 手动发送请求
if self.page<=12: self.page += 1 url=format(self.url%self.page) #手动发送请求 yield scrapy.Request(url=url,callback=self.parse)