Crawling Jobbole (伯乐在线) with Scrapy and writing the data to a MySQL database with asynchronous (multithreaded) writes

Copyright notice: reposting without the author's permission is prohibited. Source: https://blog.csdn.net/cp_123321/article/details/84960656

The code in the spider file:

import scrapy
from ..items import BolespiderItem

class BoleSpider(scrapy.Spider):
    name = 'bole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        Extract the URL of every article on the list page.
        :param response:
        :return:
        """
        div_list = response.css("div#archive>div[class='post floated-thumb']")
        for div in div_list:
            href = div.css("div:nth-child(2)>p:nth-child(1)>a[class='archive-title']::attr(href)").extract_first("")
            # print(href)
            yield scrapy.Request(url=href, callback=self.parse_content_info, dont_filter=True, meta={"content_url": href})
        # URL of the next list page
        next_href = response.css("div[class='navigation margin-20']>a[class='next page-numbers']::attr(href)").extract_first("")
        if next_href:
            yield scrapy.Request(url=next_href, callback=self.parse, dont_filter=True)
        else:
            print("No more pages")

    def parse_content_info(self, response):
        """
        Extract the details of a single article.
        :param response:
        :return:
        """
        # article URL
        content_url = response.meta["content_url"]
        title = response.css(".grid-8>div:nth-child(1)>div:nth-child(1)>h1::text").extract_first("")
        time = response.css(".grid-8>div:nth-child(1)>div:nth-child(2)>p::text").extract_first("").replace("\r\n", "").strip()
        # category
        content_type = response.css(".grid-8>div:nth-child(1)>div:nth-child(2)>p>a:nth-child(1)::text").extract_first("")
        # number of comments
        comment_num = response.css(".grid-8>div:nth-child(1)>div:nth-child(2)>p>a[href='#article-comment']::text").extract_first("no comments")

        # original source
        source_text_name = response.css("div.copyright-area>a:nth-child(1)::text").extract_first("")
        source_text_url = response.css("div.copyright-area>a:nth-child(1)::attr(href)").extract_first("")
        # translation source
        translation_name = response.css("div.copyright-area>a:nth-child(2)::text").extract_first("no translation")
        translation_url = response.css("div.copyright-area>a:nth-child(2)::attr(href)").extract_first("no URL")
        # number of likes
        give_like_num = response.css(".post-adds>span:nth-child(1)>h10::text").extract_first("")
        # number of bookmarks
        collect_num = response.css(".post-adds>span:nth-child(2)::text").extract_first("")
        # print("===")

        item = BolespiderItem()
        item["content_url"] = content_url
        item["title"] = title
        item["time"] = time
        item["content_type"] = content_type
        item["comment_num"] = comment_num
        item["source_text_name"] = source_text_name
        item["source_text_url"] = source_text_url
        item["translation_name"] = translation_name
        item["translation_url"] = translation_url
        item["give_like_num"] = give_like_num
        item["collect_num"] = collect_num
        item["content_url"] = content_url
        yield item
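
A quick way to check the CSS selectors above before running the full crawl is Scrapy's interactive shell. A minimal sketch (assuming the list page is still being served in this layout):

scrapy shell "http://blog.jobbole.com/all-posts/"
>>> posts = response.css("div#archive>div[class='post floated-thumb']")
>>> len(posts)   # number of articles found on the list page
>>> posts[0].css("div:nth-child(2)>p:nth-child(1)>a[class='archive-title']::attr(href)").extract_first("")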

The code in items.py:

import scrapy


class BolespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    content_url = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    content_type = scrapy.Field()
    comment_num = scrapy.Field()
    source_text_name = scrapy.Field()
    source_text_url = scrapy.Field()
    translation_name = scrapy.Field()
    translation_url = scrapy.Field()
    give_like_num = scrapy.Field()
    collect_num = scrapy.Field()

The code in settings.py:

# Enable the item pipelines (around line 67 of the default settings.py).
# The lower number runs first, so HandleDataPipeline cleans each item before MySQLTwistedPipeline writes it.
ITEM_PIPELINES = {
   'BoLeSpider.pipelines.HandleDataPipeline': 300,
   'BoLeSpider.pipelines.MySQLTwistedPipeline': 301,
}



# Settings added at the bottom of the file
MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = "123456"
MYSQL_CHARSET = "utf8"
MYSQL_DBNAME = "jobbole"

The code in pipelines.py:

# There are two places to clean the data: in the spider right after scraping, or in a pipeline. Both approaches are used here.
from twisted.enterprise import adbapi
from MySQLdb.cursors import DictCursor
import hashlib
# A pipeline that cleans the scraped data
class HandleDataPipeline(object):
    # hash a URL with MD5, giving a fixed-length key for the article
    def md5_by_url(self, url):
        url = url.encode("utf-8")
        m = hashlib.md5()
        m.update(url)
        return m.hexdigest()
    def process_item(self, item, spider):
        # take the URL out of the item and replace it with its md5 digest
        content_url = item["content_url"]
        content_url = self.md5_by_url(content_url)

        time = item["time"]
        collect_num = item["collect_num"]
        time = time.replace("\r","").replace("\n","").strip()
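        # strip the literal "收藏" label that precedes the bookmark count on the page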
        collect_num = collect_num.replace("收藏","").strip()
        item["time"] = time
        item["collect_num"] = collect_num
        item["content_url"] = content_url
        return item

# A pipeline that writes to MySQL asynchronously.
# Scrapy parses pages asynchronously and very quickly, while MySQL's execute() and commit()
# run synchronously, so writing to the database is comparatively slow. With a large volume of
# items the synchronous writes cannot keep up, inserts back up, and eventually the database
# stalls or data is lost. The database writes therefore need to be asynchronous as well.
class MySQLTwistedPipeline(object):
    def __init__(self,dbpool):
        # keep a reference to the connection pool
        self.dbpool = dbpool

    @classmethod
    def from_crawler(cls, crawler):
        args = dict(
            host=crawler.settings.get("MYSQL_HOST"),
            port=crawler.settings.get("MYSQL_PORT"),
            user=crawler.settings.get("MYSQL_USER"),
            db=crawler.settings.get("MYSQL_DBNAME"),
            passwd=crawler.settings.get("MYSQL_PASSWD"),
            charset=crawler.settings.get("MYSQL_CHARSET"),
            cursorclass=DictCursor,
        )
        # Create a connection pool. The first argument is the name of the DB-API driver module;
        # the pool manages several database connections (and their cursors) at once.
        dbpool = adbapi.ConnectionPool("MySQLdb", **args)
        return cls(dbpool)

    def insert_sql(self, cursor, item):
        insert_sql = "insert into bole(content_url,title,time,content_type,comment_num,source_text_name,source_text_url,translation_name,translation_url,give_like_num,collect_num) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # pass the values separately so the driver escapes quotes and special characters
        cursor.execute(insert_sql, (item["content_url"], item["title"], item["time"], item["content_type"], item["comment_num"], item["source_text_name"], item["source_text_url"], item["translation_name"], item["translation_url"], item["give_like_num"], item["collect_num"]))

    def process_item(self, item, spider):
        """
        runInteraction() hands insert_sql over to one of the pool's threads, so the insert
        runs asynchronously instead of blocking the crawl.
        :param item:
        :param spider:
        :return:
        """
        result = self.dbpool.runInteraction(self.insert_sql, item)
        # if the insert fails, the callable passed to addErrback() is invoked
        result.addErrback(self.error_info)
        return item

    def error_info(self,failure):
        print("Insert failed:", failure)
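
The pipeline above assumes the jobbole database and a bole table already exist. The original post does not show the table definition, so the following is only a minimal sketch of a schema matching the columns used in insert_sql; the column types and lengths, and using the md5 digest as the primary key, are assumptions.

import MySQLdb

conn = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", charset="utf8")
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jobbole DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole.bole (
        content_url      VARCHAR(32) PRIMARY KEY,  -- md5 digest produced by HandleDataPipeline
        title            VARCHAR(255),
        time             VARCHAR(64),
        content_type     VARCHAR(64),
        comment_num      VARCHAR(64),
        source_text_name VARCHAR(255),
        source_text_url  VARCHAR(512),
        translation_name VARCHAR(255),
        translation_url  VARCHAR(512),
        give_like_num    VARCHAR(32),
        collect_num      VARCHAR(32)
    )
""")
conn.commit()
conn.close()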

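With the database ready, the crawl is started from the project directory in the usual way (the spider name comes from the code above):

scrapy crawl bole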