errback 是在处理请求的过程中引发任何异常时将被调用的函数，这也包括因 404 等 HTTP 错误而失败的页面。
它接收一个 Twisted Failure 实例作为第一个参数。
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
class ErrbackSpider(scrapy.Spider):
    """Example spider demonstrating request error handling via an errback.

    Each start URL is requested with ``parse_httpbin`` as the success
    callback and ``errback_httpbin`` as the errback, so every outcome
    (success, HTTP error, DNS failure, timeout) ends up in a log line.
    """

    name = "errback_example"
    start_urls = [
        "http://www.httpbin.org/",  # HTTP 200 expected
        "http://www.httpbin.org/status/404",  # Not found error
        "http://www.httpbin.org/status/500",  # server issue
        "http://www.httpbin.org:12345/",  # non-responding host, timeout expected
        "http://www.httphttpbinbin.org/",  # DNS error expected
    ]

    def start_requests(self):
        """Yield one request per start URL with callback and errback attached."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse_httpbin,
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def parse_httpbin(self, response):
        """Log the URL of a response that was downloaded successfully."""
        # Lazy %-style arguments, consistent with the errback's log calls.
        self.logger.info('Got successful response from %s', response.url)
        # do something useful here...

    def errback_httpbin(self, failure):
        """Log a failed request, with extra detail for known error types.

        Args:
            failure: the Twisted ``Failure`` describing what went wrong.
        """
        # log all failures
        self.logger.error(repr(failure))

        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
再举一个简单实用的例子（部分代码，其中 request_errback 需要定义为 Spider 类的方法）：
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import TimeoutError, TCPTimedOutError, ConnectionRefusedError
from twisted.web._newclient import ResponseFailed, ResponseNeverReceived
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.response import response_status_message # 获取错误代码信息
def request_errback(self, failure):
    """Log a one-line description of why a request failed.

    Written as a spider method: it relies on ``self.logger``.

    Args:
        failure: the Twisted ``Failure`` raised while processing the request;
            its ``request`` attribute is read to report the URL.
    """
    request = failure.request

    if failure.check(HttpError):
        # Non-2xx response rejected by the HttpError spider middleware.
        response = failure.value.response
        self.logger.error(
            'errback <%s> %s , response status:%s',
            request.url,
            failure.value,
            response_status_message(response.status),
        )
    elif failure.check(ResponseNeverReceived):
        # Must be tested before ResponseFailed: ResponseNeverReceived is a
        # subclass of it, so the broader check would otherwise shadow this one.
        self.logger.error('errback <%s> ResponseNeverReceived', request.url)
    elif failure.check(ResponseFailed):
        self.logger.error('errback <%s> ResponseFailed', request.url)
    elif failure.check(ConnectionRefusedError):
        self.logger.error('errback <%s> ConnectionRefusedError', request.url)
    elif failure.check(TCPTimedOutError, TimeoutError):
        self.logger.error('errback <%s> TimeoutError', request.url)
    else:
        self.logger.error('errback <%s> OtherError', request.url)