开发环境:
- Python:3.5
- Scrapy:1.5.1
- scrapy-djangoitem:1.1.1
- Django:2.1.4
以虎嗅网人工智能板块下《神经网络生成极慢视频,从此不再错过任何细节》一文的缩略图为例。图片点这里
通过重写file_path方法,可以将文件后缀更改为gif。
def file_path(self, request, response=None, info=None):
    """Return the storage path for one downloaded article image.

    The path has the form ``huxiu_article/<article_id>/image<index>.gif``;
    the ``.gif`` suffix is forced regardless of the source URL.

    Args:
        request: The download request. ``request.meta`` must carry the
            ``item`` and ``index`` entries.
        response: Unused; accepted for pipeline interface compatibility.
        info: Unused; accepted for pipeline interface compatibility.

    Returns:
        The relative path the image is stored under.

    Raises:
        KeyError: If ``item``/``index`` are missing from ``request.meta``
            or the item has no ``article_id`` entry.
    """
    item = request.meta['item']
    index = request.meta['index']
    # The original snippet used ``article_id`` without ever defining it,
    # which raises NameError at runtime.
    # NOTE(review): assumes the item exposes the id under the 'article_id'
    # key -- confirm against the item definition.
    article_id = item['article_id']
    return u'huxiu_article/{0}/image{1}.gif'.format(article_id, index)
但是这样保存下来的图片依然是静态的。如下图所示:
通过观察scrapy/pipelines/images.py文件中的ImagesPipeline类,发现如下代码,其保存图片时默认使用的格式为jpeg。
def image_downloaded(self, response, request, info):
    """Persist every image produced for a response and return a checksum.

    Iterates over the ``(path, image, buf)`` triples yielded by
    ``self.get_images`` and hands each buffer to ``self.store.persist_file``.

    Returns:
        The ``md5sum`` of the first yielded buffer, or ``None`` when
        ``get_images`` yields nothing.
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            # Only the first yielded buffer is hashed; rewind it first.
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        self.store.persist_file(
            path, buf, info,
            meta={'width': width, 'height': height},
            # The content type is hard-coded to JPEG for every image.
            headers={'Content-Type': 'image/jpeg'})
    return checksum
所以如果我们想要下载GIF图片,则需要在继承ImagesPipeline类后,对image_downloaded方法进行重写。
def check_gif(self, image):
    """Return True when *image* should be stored as the raw download.

    The check is ``image.format is None``. Per the surrounding article,
    JPEG images keep their format while GIF images arrive here with
    ``format`` set to ``None``.
    NOTE(review): other images that lose their format would also match --
    confirm this is acceptable for the crawled site.

    Args:
        image: The image object yielded by ``get_images``.

    Returns:
        bool: ``True`` if ``image.format`` is ``None``, else ``False``.
    """
    # Return an explicit bool instead of falling through to None.
    return image.format is None


def persist_gif(self, key, data, info):
    """Write *data* unmodified to the store location for *key*.

    Args:
        key: Relative storage path of the image.
        data: Raw bytes to write (the untouched response body).
        info: Passed through to the store's ``_mkdir`` helper.
    """
    # Relies on underscore-prefixed helpers of the store object.
    # NOTE(review): presumably only a file-system backed store provides
    # them -- confirm before using another storage backend.
    absolute_path = self.store._get_filesystem_path(key)
    self.store._mkdir(os.path.dirname(absolute_path), info)
    # Context manager guarantees the handle is flushed and closed.
    with open(absolute_path, 'wb') as f:  # 'b': write binary data
        f.write(data)


def image_downloaded(self, response, request, info):
    """Persist every image for a response, keeping GIFs as raw bytes.

    For each ``(path, image, buf)`` triple from ``self.get_images``:
    images accepted by ``check_gif`` are written from ``response.body``
    via ``persist_gif``; all others go through ``self.store.persist_file``
    with a JPEG content type.

    Returns:
        The ``md5sum`` of the first yielded buffer, or ``None`` when
        ``get_images`` yields nothing.
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            # Only the first yielded buffer is hashed; rewind it first.
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        if self.check_gif(image):
            # Save the original bytes so the animation frames survive.
            self.persist_gif(path, response.body, info)
        else:
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
    return checksum
这里需要注意的是check_gif方法,如果图片是jpg/jpeg格式的,那么在debug模式下可以看到format这里是正常的。
但是当图片是GIF格式时,format就为None了,而不是我以为的'GIF'。这是因为get_images会先把非RGB模式的图片转换成RGB,而Pillow中由转换方法生成的新图片对象的format属性为None。
再次运行程序,可以看到这次不再是静态的图片了。
完整代码如下:
import os

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum


class ImagePipeline(ImagesPipeline):
    """Image pipeline that stores GIF downloads as their raw bytes.

    Images accepted by ``check_gif`` are written from the response body
    instead of the buffer yielded by ``get_images``.
    """

    def file_path(self, request, response=None, info=None):
        """Return the storage path for the image fetched by *request*."""
        # Define the file name format here.
        # NOTE: this is a placeholder -- it returns the same path for every
        # request, so build a unique name (e.g. from request.meta['index'])
        # before real use.
        filename = 'my file'
        return filename

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        Each request carries the item and the URL's position in ``meta``
        under the ``item`` and ``index`` keys.
        """
        for index, img_url in enumerate(item['image_urls']):
            yield Request(img_url, meta={'item': item, 'index': index})

    def check_gif(self, image):
        """Return True when *image* should be stored as the raw download.

        NOTE(review): the check is ``image.format is None``; any image
        without a format matches, not only GIFs -- confirm this is
        acceptable for the crawled site.
        """
        # Return an explicit bool instead of falling through to None.
        return image.format is None

    def persist_gif(self, key, data, info):
        """Write *data* unmodified to the store location for *key*."""
        # Relies on underscore-prefixed helpers of the store object.
        # NOTE(review): presumably only a file-system backed store provides
        # them -- confirm before using another storage backend.
        absolute_path = self.store._get_filesystem_path(key)
        self.store._mkdir(os.path.dirname(absolute_path), info)
        # Context manager guarantees the handle is flushed and closed.
        with open(absolute_path, 'wb') as f:  # 'b': write binary data
            f.write(data)

    def image_downloaded(self, response, request, info):
        """Persist every image for a response, keeping GIFs as raw bytes.

        Returns:
            The ``md5sum`` of the first buffer yielded by ``get_images``,
            or ``None`` when nothing is yielded.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                # Only the first yielded buffer is hashed; rewind it first.
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if self.check_gif(image):
                # Save the original bytes so the animation frames survive.
                self.persist_gif(path, response.body, info)
            else:
                self.store.persist_file(
                    path, buf, info,
                    meta={'width': width, 'height': height},
                    headers={'Content-Type': 'image/jpeg'})
        return checksum