Downloader Middleware

Downloader Middleware

源码解析:

 1 # 文件:E:\Miniconda\Lib\site-packages\scrapy\core\downloader\middleware.py
 2 """
 3 Downloader Middleware manager
 4 
 5 See documentation in docs/topics/downloader-middleware.rst
 6 """
 7 import six
 8 
 9 from twisted.internet import defer
10 
11 from scrapy.http import Request, Response
12 from scrapy.middleware import MiddlewareManager
13 from scrapy.utils.defer import mustbe_deferred
14 from scrapy.utils.conf import build_component_list
15 
16 
class DownloaderMiddlewareManager(MiddlewareManager):
    """Middleware manager that drives a request through the downloader hooks.

    Each registered middleware may implement ``process_request``,
    ``process_response`` and/or ``process_exception``; ``download`` chains
    those hooks around the actual download function.
    """

    component_name = 'downloader middleware'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        """Build the middleware list from the ``DOWNLOADER_MIDDLEWARES`` setting.

        The setting is read with ``settings.getwithbase`` and passed to
        ``build_component_list``.  Custom middlewares are typically declared
        in ``settings.py`` or in a spider's ``custom_settings``, e.g.::

            'DOWNLOADER_MIDDLEWARES': {
                'mySpider.middlewares.ProxiesMiddleware': 400,
                # SeleniumMiddleware
                'mySpider.middlewares.SeleniumMiddleware': 543,
                'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            },
        """
        return build_component_list(
            settings.getwithbase('DOWNLOADER_MIDDLEWARES'))

    def _add_middleware(self, mw):
        """Register the hook methods implemented by middleware ``mw``.

        ``process_request`` hooks are appended, so they run in registration
        order.  ``process_response`` and ``process_exception`` hooks are
        inserted at the front, so they run in reverse registration order.
        """
        if hasattr(mw, 'process_request'):
            self.methods['process_request'].append(mw.process_request)
        if hasattr(mw, 'process_response'):
            self.methods['process_response'].insert(0, mw.process_response)
        if hasattr(mw, 'process_exception'):
            self.methods['process_exception'].insert(0, mw.process_exception)

    def download(self, download_func, request, spider):
        """Run ``request`` through the whole download pipeline.

        The pipeline is: every ``process_request`` hook, then
        ``download_func`` (unless a hook short-circuits), then
        ``process_exception`` hooks on failure, then ``process_response``
        hooks on the resulting response.

        :param download_func: callable invoked as
            ``download_func(request=..., spider=...)`` when no
            ``process_request`` hook produced a result.
        :param request: the request to download.
        :param spider: the spider passed to every middleware hook.
        :returns: the Deferred created by ``mustbe_deferred`` with the
            exception and response stages chained onto it.
        :raises AssertionError: (delivered through the Deferred) when a hook
            returns a value of an unsupported type.  The checks are explicit
            ``raise`` statements rather than ``assert`` so that they are not
            stripped when Python runs with ``-O``.
        """

        def middleware_name(method):
            # Class name of the middleware instance the bound hook belongs to;
            # only evaluated when building an error message.
            return six.get_method_self(method).__class__.__name__

        @defer.inlineCallbacks
        def process_request(request):
            # Pass the request through each middleware's process_request
            # hook, in the order they were added to the list.
            for method in self.methods['process_request']:
                response = yield method(request=request, spider=spider)
                if response is not None and \
                        not isinstance(response, (Response, Request)):
                    raise AssertionError(
                        'Middleware %s.process_request must return None, Response or Request, got %s' %
                        (middleware_name(method), response.__class__.__name__))
                # Key point: as soon as one hook produces a result, it is
                # returned immediately; the remaining process_request hooks
                # and download_func are skipped.  Middlewares that only tweak
                # the request (user-agent, proxy, ...) return nothing and so
                # do not short-circuit.
                if response:
                    defer.returnValue(response)
            # No hook produced a result: hand the (possibly modified) request
            # to download_func and return whatever it yields.
            defer.returnValue((yield download_func(request=request, spider=spider)))

        @defer.inlineCallbacks
        def process_response(response):
            if response is None:
                raise AssertionError('Received None in process_response')
            # A Request coming out of the earlier stages is passed straight
            # through; process_response hooks only see responses.
            if isinstance(response, Request):
                defer.returnValue(response)

            for method in self.methods['process_response']:
                response = yield method(request=request, response=response,
                                        spider=spider)
                if not isinstance(response, (Response, Request)):
                    raise AssertionError(
                        'Middleware %s.process_response must return Response or Request, got %s' %
                        (middleware_name(method), type(response)))
                # A hook that returns a Request stops the chain.
                if isinstance(response, Request):
                    defer.returnValue(response)
            defer.returnValue(response)

        @defer.inlineCallbacks
        def process_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_exception']:
                response = yield method(request=request, exception=exception,
                                        spider=spider)
                if response is not None and \
                        not isinstance(response, (Response, Request)):
                    raise AssertionError(
                        'Middleware %s.process_exception must return None, Response or Request, got %s' %
                        (middleware_name(method), type(response)))
                # The first hook that returns a result handles the failure.
                if response:
                    defer.returnValue(response)
            # Nobody handled it: propagate the original failure.
            defer.returnValue(_failure)

        deferred = mustbe_deferred(process_request, request)
        # The errback is attached before the callback, so a result recovered
        # by process_exception still goes through process_response.
        deferred.addErrback(process_exception)
        deferred.addCallback(process_response)
        return deferred

猜你喜欢

转载自www.cnblogs.com/guozepingboke/p/10774181.html