python scrapy实例:爬取笔趣阁长篇电子书
入门案例请看本人的scrapy学习一、二、三,本篇为进阶案例(网上有好多爬虫案例,但是80%都是无效的,大部分原因是原链接失效或者原网页发生改变,如果该实例失效,请读者留言告知,笔者定将第一时间更新)
#item.py代码
import scrapy
class YieldtestItem(scrapy.Item):
    """Container for one scraped chapter; Scrapy items behave like dicts."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    content = scrapy.Field()  # full text of a single chapter page
#piplines.py代码
import pymysql
class YieldtestPipeline(object):
    """Item pipeline that persists each scraped chapter into a MySQL table.

    Scrapy instantiates this once per spider run and calls process_item for
    every item the spider yields.
    """

    def __init__(self):
        # Connection parameters are hard-coded for the tutorial; a real
        # project would read them from settings.py via from_crawler.
        self.connection = pymysql.connect(host="localhost", user="root", password="123456", database="test", charset="utf8", port=3306)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        """Insert one item's content and commit; returns the item unchanged."""
        sql = 'insert into article (content) VALUES(%s); '
        # pymysql documents query parameters as a sequence — pass a 1-tuple,
        # not a bare string, so a value containing '%' can't be misparsed.
        self.cursor.execute(sql, (item['content'],))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook when the spider finishes — deterministic
        # cleanup, unlike __del__ whose timing is up to the garbage collector.
        self._close()

    def __del__(self):
        # Safety net in case close_spider never ran (e.g. crash at startup).
        self._close()

    def _close(self):
        # Guarded with getattr: if __init__ failed before setting these
        # attributes (MySQL down), tearing down must not raise AttributeError.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
            self.cursor = None
        connection = getattr(self, 'connection', None)
        if connection is not None:
            connection.close()
            self.connection = None
#写完piplines.py后需要在settings.py里面设置
# Register the pipeline so Scrapy routes every yielded item through it.
# The number (0-1000) is the execution order when several pipelines exist;
# lower runs first. 300 is the conventional middle-of-the-road value.
ITEM_PIPELINES = {
    'yieldtest.pipelines.YieldtestPipeline': 300,
}
在spiders目录下新建artiscrawl.py
代码如下
import scrapy
from yieldtest.items import YieldtestItem
import time
class ArticleCrawl(scrapy.Spider):
    """Crawl every chapter of one novel on biquge.info and yield its text."""

    name = 'articlecrawl'
    allowed_domains = ["biquge.info"]
    start_urls = ['http://www.biquge.info/32_32050/index.html']

    def parse(self, response):
        """Parse the index page: schedule one request per chapter link."""
        # Chapter links live under div#list/dl/dd; extract the href strings
        # directly instead of extracting selector nodes and re-querying them.
        for href in response.xpath('//div[@id="list"]/dl/dd/a/@href').extract():
            # urljoin resolves the relative chapter path against the index URL
            # (Request requires an absolute URL string).
            yield scrapy.Request(response.urljoin(href), callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Parse one chapter page and yield its cleaned text as an item."""
        fragments = response.xpath('//div[@id="content"]/text()').extract()
        # The page source is littered with \xa0 (non-breaking spaces); join
        # the text nodes and strip them out.
        content = ''.join(fragments).replace('\xa0', '')
        item = YieldtestItem()
        item['content'] = content
        # NOTE: the original called time.sleep(1) here. That blocks Scrapy's
        # single-threaded Twisted reactor and stalls *all* in-flight requests,
        # not just this one. Per-request throttling belongs in settings.py
        # (DOWNLOAD_DELAY = 1), which this project already configures.
        yield item
最后设置每次循环爬取延迟一秒防止出现request请求返回503(一秒不行就两秒)
#在settings.py里面设置
DOWNLOAD_DELAY = 1
在cmd命令里输入 scrapy crawl articlecrawl
即可爬取
最后爬取效果展示