The code in the spider file:
import scrapy
from ..items import BolespiderItem


class BoleSpider(scrapy.Spider):
    name = 'bole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        Extract the URL of every article on the list page.
        :param response:
        :return:
        """
        div_list = response.css("div#archive>div[class='post floated-thumb']")
        for div in div_list:
            href = div.css("div:nth-child(2)>p:nth-child(1)>a[class='archive-title']::attr(href)").extract_first("")
            # print(href)
            yield scrapy.Request(url=href, callback=self.parse_content_info, dont_filter=True, meta={"content_url": href})
        # Extract the URL of the next list page
        next_href = response.css("div[class='navigation margin-20']>a[class='next page-numbers']::attr(href)").extract_first("")
        if next_href:
            yield scrapy.Request(url=next_href, callback=self.parse, dont_filter=True)
        else:
            print("No more pages")
    def parse_content_info(self, response):
        """
        Extract the detailed information of a single article.
        :param response:
        :return:
        """
        # Article URL
        content_url = response.meta["content_url"]
        title = response.css(".grid-8>div:nth-child(1)>div:nth-child(1)>h1::text").extract_first("")
        time = response.css(".grid-8>div:nth-child(1)>div:nth-child(2)>p::text").extract_first("").replace("\r\n", "").strip()
        # Category
        content_type = response.css(".grid-8>div:nth-child(1)>div:nth-child(2)>p>a:nth-child(1)::text").extract_first("")
        # Number of comments
        comment_num = response.css(".grid-8>div:nth-child(1)>div:nth-child(2)>p>a[href='#article-comment']::text").extract_first("无评论数")
        # Original source
        source_text_name = response.css("div.copyright-area>a:nth-child(1)::text").extract_first("")
        source_text_url = response.css("div.copyright-area>a:nth-child(1)::attr(href)").extract_first("")
        # Translation source
        translation_name = response.css("div.copyright-area>a:nth-child(2)::text").extract_first("无译文")
        translation_url = response.css("div.copyright-area>a:nth-child(2)::attr(href)").extract_first("无网址")
        # Number of likes
        give_like_num = response.css(".post-adds>span:nth-child(1)>h10::text").extract_first("")
        # Number of bookmarks
        collect_num = response.css(".post-adds>span:nth-child(2)::text").extract_first("")
        # print("===")
        item = BolespiderItem()
        item["content_url"] = content_url
        item["title"] = title
        item["time"] = time
        item["content_type"] = content_type
        item["comment_num"] = comment_num
        item["source_text_name"] = source_text_name
        item["source_text_url"] = source_text_url
        item["translation_name"] = translation_name
        item["translation_url"] = translation_url
        item["give_like_num"] = give_like_num
        item["collect_num"] = collect_num
        yield item
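The original post does not show how the spider is launched. Assuming the project is named BoLeSpider (as the settings further down suggest), it can be started from the project root with the standard `scrapy crawl bole` command, or through a small run script like the sketch below (the file name run.py is my own choice, not part of the original):

# Hypothetical run.py placed in the project root; equivalent to `scrapy crawl bole`
from scrapy import cmdline

cmdline.execute("scrapy crawl bole".split())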
The code in items.py:
import scrapy


class BolespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    content_url = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    content_type = scrapy.Field()
    comment_num = scrapy.Field()
    source_text_name = scrapy.Field()
    source_text_url = scrapy.Field()
    translation_name = scrapy.Field()
    translation_url = scrapy.Field()
    give_like_num = scrapy.Field()
    collect_num = scrapy.Field()
The code in settings.py:
# Around line 67 of the default settings.py: enable both pipelines
ITEM_PIPELINES = {
    'BoLeSpider.pipelines.HandleDataPipeline': 300,
    'BoLeSpider.pipelines.MySQLTwistedPipeline': 301,
}
# Added at the bottom of the file: the MySQL connection settings
MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = "123456"
MYSQL_CHARSET = "utf8"
MYSQL_DBNAME = "jobbole"
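The MySQLTwistedPipeline below assumes that the jobbole database and a bole table with matching columns already exist; the original post does not include that setup. A minimal sketch of what it might look like follows, with column names taken from the item fields and column types being my own guesses:

# Hypothetical one-off setup script (not part of the original post).
# Connection parameters mirror the settings above; adjust types as needed.
import MySQLdb

conn = MySQLdb.connect(host="localhost", port=3306, user="root",
                       passwd="123456", charset="utf8")
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jobbole DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole.bole (
        content_url VARCHAR(64) PRIMARY KEY,  -- md5 hash of the article URL
        title VARCHAR(255),
        time VARCHAR(32),
        content_type VARCHAR(64),
        comment_num VARCHAR(32),
        source_text_name VARCHAR(255),
        source_text_url VARCHAR(255),
        translation_name VARCHAR(255),
        translation_url VARCHAR(255),
        give_like_num VARCHAR(32),
        collect_num VARCHAR(32)
    )
""")
conn.commit()
cursor.close()
conn.close()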
The code in pipelines.py:
# There are two ways to clean the data: clean it in the spider right after it is scraped, or clean it in the pipeline. Both approaches are shown here.
from twisted.enterprise import adbapi
from MySQLdb.cursors import DictCursor
import hashlib


# A pipeline that cleans the scraped data
class HandleDataPipeline(object):
    # Hash the article URL so it can be stored as a fixed-length unique key
    def md5_by_url(self, url):
        url = url.encode("utf-8")
        m = hashlib.md5()
        m.update(url)
        return m.hexdigest()

    def process_item(self, item, spider):
        # Take the URL out of the item and run it through md5_by_url
        content_url = item["content_url"]
        content_url = self.md5_by_url(content_url)
        time = item["time"]
        collect_num = item["collect_num"]
        time = time.replace("\r", "").replace("\n", "").strip()
        collect_num = collect_num.replace("收藏", "").strip()
        item["time"] = time
        item["collect_num"] = collect_num
        item["content_url"] = content_url
        return item
# A pipeline that writes the items to MySQL asynchronously
# Scrapy parses pages asynchronously and very quickly, while MySQL's execute() and
# commit() are synchronous, so database writes are comparatively slow. With a large
# volume of items the inserts can fall behind, block, and eventually stall the
# database or lose data, which is why the writes need to be asynchronous.
class MySQLTwistedPipeline(object):
    def __init__(self, dbpool):
        # Keep a reference to the connection pool
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        args = dict(
            host=crawler.settings.get("MYSQL_HOST"),
            port=crawler.settings.get("MYSQL_PORT"),
            user=crawler.settings.get("MYSQL_USER"),
            db=crawler.settings.get("MYSQL_DBNAME"),
            passwd=crawler.settings.get("MYSQL_PASSWD"),
            charset=crawler.settings.get("MYSQL_CHARSET"),
            cursorclass=DictCursor,
        )
        # Create a connection pool
        # First argument: the name of the DB-API driver used to connect to MySQL
        # The pool manages a set of threads, each with its own database connection
        dbpool = adbapi.ConnectionPool("MySQLdb", **args)
        return cls(dbpool)

    def insert_sql(self, cursor, item):
        sql = """insert into bole(content_url, title, time, content_type, comment_num,
            source_text_name, source_text_url, translation_name, translation_url,
            give_like_num, collect_num)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        cursor.execute(sql, (
            item["content_url"], item["title"], item["time"], item["content_type"],
            item["comment_num"], item["source_text_name"], item["source_text_url"],
            item["translation_name"], item["translation_url"], item["give_like_num"],
            item["collect_num"],
        ))

    def process_item(self, item, spider):
        """
        runInteraction() hands insert_sql to one of the threads in dbpool, so the
        insert runs asynchronously instead of blocking the crawl.
        :param item:
        :param spider:
        :return:
        """
        result = self.dbpool.runInteraction(self.insert_sql, item)
        # If the insert fails, the callable passed to addErrback() is invoked
        result.addErrback(self.error_info)
        return item

    def error_info(self, failure):
        print("Failed to insert the data, reason:", failure)
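To confirm that the asynchronous inserts actually landed in MySQL, the bole table can be queried directly after a crawl. A minimal check script (not part of the original post; connection parameters taken from the settings above) might look like this:

# Hypothetical check script to count the stored articles
import MySQLdb

conn = MySQLdb.connect(host="localhost", port=3306, user="root",
                       passwd="123456", db="jobbole", charset="utf8")
cursor = conn.cursor()
cursor.execute("select count(*) from bole")
print("rows stored:", cursor.fetchone()[0])
cursor.close()
conn.close()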