scrapy爬取某网站小说

今天用scrapy试着爬一下bqg的小说

先是检查网页,确定不是动态加载数据,文章标题和内容url直接就可以获取到

0.创建一个新的项目
scrapy startproject try_2
cd try_2
scrapy genspider santi www.xxx.com

1 spider部分

import scrapy
from try_2.items import TrySItem2
import re


class SantiSpider(scrapy.Spider):
    """Crawl a novel from 52bqg: parse the table of contents, then fetch
    each chapter page and yield one item per chapter."""

    name = 'santi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.52bqg.com/book_88879/']

    def parse(self, response):
        """Parse the chapter list and schedule one request per chapter."""
        list_ = response.xpath('//*[@id="list"]/dl/dd')
        for dd in list_:
            # Create a FRESH item for every chapter. A single shared item
            # (created once outside the loop) would be overwritten by later
            # iterations before the chapter responses arrive, because Scrapy
            # processes requests asynchronously.
            items = TrySItem2()
            items['title'] = dd.xpath('./a/text()').get()
            # Relative chapter link, e.g. "123456.html"
            d_url = dd.xpath('./a/@href').get()
            # Build the absolute chapter URL
            d_url = 'https://www.52bqg.com/book_88879/{}'.format(str(d_url))

            yield scrapy.Request(url=d_url, callback=self.parse_content,
                                 meta={'items': items})
            break  # Crawl only one chapter while testing; remove to crawl all.

    def parse_content(self, response):
        """Extract the chapter body text and emit the completed item."""
        items = response.meta['items']
        content = response.xpath('//*[@id="content"]//text()').getall()
        items['content'] = ''.join(content)
        yield items

2 items部分

class TrySItem2(scrapy.Item):
    """Container for one scraped chapter: its title and full body text."""

    title = scrapy.Field()    # chapter title, from the TOC link text
    content = scrapy.Field()  # joined text of the chapter body
3. pipelines 部分

class TryS_2_Pipeline(object):
    """Item pipeline that writes every scraped chapter to ./santi.txt.

    The output file is opened once when the spider starts and closed
    when it finishes, so all chapters land in a single text file.
    """

    # Output file handle; assigned in open_spider().
    santi = None

    def open_spider(self, spider):
        """Open the output file once, at spider start-up."""
        self.santi = open('./santi.txt', 'w', encoding='utf-8')

    def process_item(self, items, spider):
        """Append one chapter (title, then content) and pass the item on."""
        self.santi.write(items['title'] + '\n' + items['content'] + '\n')
        # Progress indicator on the console.
        print(items['title'])
        return items

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.santi.close()

4.settings 部分
主要改了这么几个参数

# Spoof a desktop Chrome user agent so the site serves normal pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'

# Do not honour robots.txt when crawling.
ROBOTSTXT_OBEY = False

# Only log errors, keeping console output readable.
LOG_LEVEL = 'ERROR'

# Wait 2 seconds between requests to reduce the risk of being banned.
DOWNLOAD_DELAY = 2

# Enable the pipeline that writes chapters to santi.txt (priority 300).
ITEM_PIPELINES = {
   'try_2.pipelines.TryS_2_Pipeline': 300,
}

5 .最后为了方便运行创建一个start.py文件

from scrapy import cmdline

# Convenience launcher: equivalent to running "scrapy crawl santi" in a shell.
cmdline.execute(['scrapy', 'crawl', 'santi'])
发布了3 篇原创文章 · 获赞 1 · 访问量 38

猜你喜欢

转载自blog.csdn.net/medusee/article/details/105678515
今日推荐