爬取斗鱼图片

创建项目
scrapy startproject douyu

编写items.py

1 import scrapy
2 
class DouyuItem(scrapy.Item):
    """Fields collected for one entry of the Douyu room list."""

    # Display name of the streamer; also used as the saved image's file name.
    nickname = scrapy.Field()
    # URL of the image to download.
    imagelink = scrapy.Field()
    # Local location of the downloaded image, filled in by the pipeline.
    imagePath = scrapy.Field()

创建基础类的爬虫

scrapy genspider douyutupian capi.douyucdn.cn

通过手机抓包得到斗鱼 App 的 API 接口,该接口返回 JSON 格式的数据,房间列表位于其中的 data 字段。

douyutupian.py(注意:下面代码中爬虫的 name 是 "douyumeinv",运行时应使用 scrapy crawl douyumeinv)

 1 import scrapy
 2 from douyu.items import DouyuItem
 3 import json
 4 
 5 
 6 class DouyumeinvSpider(scrapy.Spider):
 7     name = "douyumeinv"
 8     allowed_domains = ["capi.douyucdn.cn"]
 9 
10     offset = 0
11     url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
12 
13     start_urls = [url + str(offset)]
14 
15     def parse(self, response):
16         # 把json格式的数据转换为python格式,data段是列表
17         data = json.loads(response.text)["data"]
18         for each in data:
19             item = DouyuItem()
20             item["nickname"] = each["nickname"]
21             item["imagelink"] = each["vertical_src"]
22 
23             yield item
24 
25         self.offset += 20
26         yield scrapy.Request(self.url + str(self.offset), callback = self.parse)

管道文件
pipelines.py

 1 import scrapy
 2 from scrapy.utils.project import get_project_settings
 3 from scrapy.pipelines.images import ImagesPipeline
 4 import os
 5 
 6 class ImagesPipeline(ImagesPipeline):
 7     #def process_item(self, item, spider):
 8     #    return item
 9     # 获取settings文件里设置的变量值
10     IMAGES_STORE = get_project_settings().get("IMAGES_STORE")
11 
12     def get_media_requests(self, item, info):
13         image_url = item["imagelink"]
14         yield scrapy.Request(image_url)
15 
16     def item_completed(self, result, item, info):
17         image_path = [x["path"] for ok, x in result if ok]
18 
19         os.rename(self.IMAGES_STORE + "/" + image_path[0], self.IMAGES_STORE + "/" + item["nickname"] + ".jpg")
20 
21         item["imagePath"] = self.IMAGES_STORE + "/" + item["nickname"]
22 
23         return item
settings.py
 1 BOT_NAME = 'douyu'
 2 
 3 SPIDER_MODULES = ['douyu.spiders']
 4 NEWSPIDER_MODULE = 'douyu.spiders'
 5 
 6 DEFAULT_REQUEST_HEADERS = {
 7     "User-Agent" : "DYZB/1 CFNetwork/808.2.16 Darwin/16.3.0"
 8 }
 9 
10 ITEM_PIPELINES = {
11     'douyu.pipelines.ImagesPipeline': 300,
12 }
13 
14 IMAGES_STORE = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"

猜你喜欢

转载自www.cnblogs.com/wanglinjie/p/9240373.html