Python Scrapy example: crawling a full-length e-book from Biquge

For introductory examples, see parts one, two, and three of my Scrapy study notes; this post is a more advanced case. (There are plenty of crawler examples online, but about 80% of them no longer work, mostly because the original link is dead or the target page has changed. If this example stops working, please leave a comment and I will update it as soon as possible.)

# items.py
import scrapy


class YieldtestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # url = scrapy.Field()  # uncomment if you also want to store the chapter URL
    content = scrapy.Field()  # full text of one chapter
# pipelines.py

import pymysql

class YieldtestPipeline(object):
    def __init__(self):
        # open one MySQL connection for the lifetime of the pipeline
        self.connection = pymysql.connect(host="localhost", user="root", password="123456",
                                          database="test", charset="utf8", port=3306)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        # parameterized insert; pass the value as a one-element tuple
        sql = 'INSERT INTO article (content) VALUES (%s);'
        self.cursor.execute(sql, (item['content'],))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        # called by Scrapy when the spider finishes; more reliable than __del__
        self.cursor.close()
        self.connection.close()

# After writing pipelines.py, enable the pipeline in settings.py
ITEM_PIPELINES = {
   'yieldtest.pipelines.YieldtestPipeline': 300,
}
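
The pipeline assumes a table named article already exists in the test database. The post never shows the schema, so the one-off script below is only a sketch under that assumption; MEDIUMTEXT is chosen because a long chapter can exceed TEXT's 64 KB limit.

import pymysql

# One-off helper: create the target table if it doesn't exist yet.
# The column type is an assumption; the original post never shows the schema.
connection = pymysql.connect(host="localhost", user="root", password="123456",
                             database="test", charset="utf8", port=3306)
try:
    with connection.cursor() as cursor:
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS article ("
            "  id INT AUTO_INCREMENT PRIMARY KEY,"
            "  content MEDIUMTEXT"
            ") DEFAULT CHARSET=utf8;"
        )
    connection.commit()
finally:
    connection.close()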

Create a new file artiscrawl.py under the spiders directory.

The code is as follows:

import scrapy
from yieldtest.items import YieldtestItem
import time

class ArticleCrawl(scrapy.Spider):
    name = 'articlecrawl'
    allowed_domains = ["biquge.info"]
    start_urls = ['http://www.biquge.info/32_32050/index.html']
    def parse(self, response):
        # each <a> under the chapter list links to one chapter page
        for href in response.xpath('//div[@id="list"]/dl/dd/a'):
            url = response.urljoin(href.xpath('@href').extract()[0])  # urljoin needs a string, not a list
            yield scrapy.Request(url, callback=self.parse_dir_contents)
        #     item = YieldtestItem()
        #     item['url'] = [url]
        #     yield item
        # Alternative with CSS selectors:
        # for href in response.css('div[id="list"]>dl>dd>a::attr("href")'):
        #     print(href.extract(), type(href.extract()))  # each extracted value is a string, not a list

    def parse_dir_contents(self, response):
        # join the chapter's text nodes and strip non-breaking spaces
        aa = response.xpath('//div[@id="content"]/text()').extract()
        content = ''.join(aa).replace('\xa0', '')
        item = YieldtestItem()
        item['content'] = content
        time.sleep(1)  # note: this blocks Scrapy's reactor; DOWNLOAD_DELAY (set below) is the non-blocking way
        yield item
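
Before running the full crawl, you can check the XPath expressions interactively with scrapy shell (the output depends on the live page, so treat this as an illustrative session):

scrapy shell http://www.biquge.info/32_32050/index.html
>>> response.xpath('//div[@id="list"]/dl/dd/a/@href').extract()[:3]
>>> response.urljoin(response.xpath('//div[@id="list"]/dl/dd/a/@href').extract()[0])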

Finally, set a one-second download delay between requests so they don't come back with 503 errors (if one second isn't enough, try two).

# set in settings.py
DOWNLOAD_DELAY = 1
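
A few related Scrapy settings can also help with throttling; the values below are illustrative additions, not from the original post:

# settings.py — optional throttling knobs (illustrative values)
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True   # jitter each delay between 0.5x and 1.5x of DOWNLOAD_DELAY
AUTOTHROTTLE_ENABLED = True       # let Scrapy adapt the delay to server latency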

Run scrapy crawl articlecrawl from the command line (cmd) to start crawling.
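
Once the spider finishes, a quick way to confirm that chapters actually landed in MySQL is to count the rows; a minimal sketch reusing the pipeline's credentials:

import pymysql

# Spot-check the crawl: count stored chapters and peek at the first one.
connection = pymysql.connect(host="localhost", user="root", password="123456",
                             database="test", charset="utf8", port=3306)
with connection.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM article;")
    print("chapters stored:", cursor.fetchone()[0])
    cursor.execute("SELECT content FROM article LIMIT 1;")
    print(cursor.fetchone()[0][:100])  # first 100 characters of one chapter
connection.close()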

Final crawl results: [screenshot omitted]


Source: blog.csdn.net/mostermoonsky/article/details/104104920