版权声明:派森带你学python,欢迎加群:923414804与群主一起学习 https://blog.csdn.net/weixin_44369414/article/details/85916500
在scrapy爬虫项目中经常遇到这样的问题:爬取数据时报错无法及时处理,导致数据爬取不完整,只能事后查看log才能发现报错。
首先写一个简单的邮件发送模块
#邮件服务封装
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
class EmailHandler(object):
    """Send plain-text alert e-mails through the QQ or 163 SMTP servers.

    The SMTP connection is opened and authenticated once in ``__init__``;
    the instance can then be reused for multiple ``send_mail`` calls.
    """

    def __init__(self, user, password, type=0):
        """
        :param user: str, sender address (SMTP login name)
        :param password: str, authorization code issued by QQ/163 (not the
            mailbox password)
        :param type: int, 0 for QQ mail, 1 for 163 mail
        :raises ValueError: if ``type`` is neither 0 nor 1 (the original
            silently left ``self.server`` unset, causing a confusing
            AttributeError on the first ``send_mail`` call)
        """
        # Both providers require SMTP over SSL on port 465.
        # Bug fix: the original used port 25 for 163, which is the
        # plaintext-SMTP port and cannot complete an SMTP_SSL handshake.
        servers = {
            0: ('smtp.qq.com', 465),
            1: ('smtp.163.com', 465),
        }
        if type not in servers:
            raise ValueError('type must be 0 (QQ) or 1 (163), got %r' % (type,))
        self.user = user
        self.password = password
        host, port = servers[type]
        self.server = smtplib.SMTP_SSL(host, port)
        self.server.login(self.user, self.password)

    def send_mail(self, To, subject, content):
        """
        :param To: str, recipient address
        :param subject: str, mail subject line
        :param content: str, plain-text mail body
        :return: bool, True on success, False on any failure (the error is
            swallowed and reported on stdout so the caller never crashes)
        """
        try:
            msg = MIMEText(content, 'plain', 'utf-8')
            msg['From'] = formataddr(('spider邮件报警系统', self.user))
            msg['To'] = formataddr(('', To))
            msg['Subject'] = subject
            self.server.sendmail(self.user, To, msg.as_string())
            print("【%s】邮件发送成功" % subject)
            return True
        except Exception:
            print("【%s】邮件发送失败,请检查信息" % subject)
            return False
使用前需要指定以下几个参数:
# Mailbox settings consumed by the alert hooks below.
MAIL_CONFIG = dict(
    user='xxxxx',              # mailbox account (sender)
    password='xxxx',           # SMTP authorization code
    to_add='xxx',              # recipient address
    mail_title='scrapy_标题',  # alert mail subject
)
本项目中主要使用 pydispatch 模块,以绑定信号的方式发送邮件(代码片段):
from pydispatch import dispatcher
# Sentinel object used as the dispatcher signal identifying spider errors.
err_spider = object()
# NOTE(review): the two defs below are methods of a Scrapy spider class whose
# `class` statement is not shown in this fragment; `xxx` is a placeholder for
# that class name, and `mail_conf` presumably refers to the MAIL_CONFIG dict
# defined earlier -- confirm against the real project.
def __init__(self):
# Counter limiting alert mails to at most one per spider run.
self.mail_count = 0
dispatcher.connect(self.send_mail, signal=err_spider)
super(xxx, self).__init__()
def send_mail(self, error):
"Send an alert e-mail the first time the spider reports an error."
if self.mail_count < 1:
mailmanager = EmailHandler(mail_conf.get('user', ''), mail_conf.get('password', ''))
mailmanager.send_mail(mail_conf.get('to_add', ''), mail_conf.get('mail_title', ''), 'spider出现错误请及时查看\r%s' % error)
self.mail_count += 1
准备工作已经完成,接下来就是在 scrapy 爬取数据出现问题时,调用这个模块向指定邮箱发送邮件(代码片段):
def parse(self, response):
    """Parse one listing page.

    Yields one detail-page Request per article (the parsed fields travel in
    ``meta['item']``) plus, at most once, a Request for the next listing
    page. On any unexpected error the full traceback is dispatched on the
    ``err_spider`` signal so an alert e-mail is sent.

    :param response: scrapy Response for the listing page
    """
    # Every entry lives under #listbox30; the last <div> holds pagination.
    data_lists = response.xpath('//div[@id="listbox30"]/div')
    try:
        for data in data_lists[:-1]:
            item = WangdaitianyanItem()
            item['title'] = data.xpath('div[1]/div/div[1]/a/@title').extract_first()                   # title
            log.msg('[info] 正在爬取【%s】' % (item['title']), level=log.INFO)
            item['img'] = data.xpath('div[2]/div/a/img/@data-src').extract_first()                     # cover image
            item['introduction'] = data.xpath('div[1]/div/div[2]/text()').extract_first()              # summary
            item['source'] = data.xpath('div[1]/div/div[3]/div[1]/span[1]/a/text()').extract_first()   # source
            item['release_time'] = data.xpath('div[1]/div/div[3]/div[1]/span[3]/text()').extract_first()  # publish time
            item['read_count'] = data.xpath('div[1]/div/div[3]/div[2]/span[2]/text()').extract_first()    # read count
            item['comment_count'] = data.xpath('div[1]/div/div[3]/div[2]/span[5]/text()').extract_first() # comment count
            # Detail-page hrefs are protocol-relative, e.g.
            # //news.p2peye.com/article-513444-1.html
            url = data.xpath('div[1]/div/div[1]/a/@href').extract_first()
            yield scrapy.Request(url='http:%s' % url, callback=self.details_page, meta={'item': item})
        # Pagination -- bug fix: the original issued this request from inside
        # the item loop, once per article; hoisted so it runs once per page.
        if data_lists:
            next_page = data_lists[-1].xpath('div/a[contains(@title,"下一页")]/@href').extract_first()
            # extract_first() returns None (no exception) when the "next
            # page" link is absent, so the original try/except never
            # triggered; test the value explicitly instead.
            if next_page:
                yield scrapy.Request(url='https://news.p2peye.com%s' % next_page, callback=self.parse)
    except Exception:
        # Report the failure by e-mail via the err_spider signal.
        dispatcher.send(signal=err_spider, error=traceback.format_exc())
这样,当爬虫出现问题时,错误信息就会以邮件的形式发送到指定邮箱,便于及时处理。