Crawling zongheng.com (纵横中文网) with Scrapy

Tasks

  1. Use Scrapy to crawl the monthly-ticket ranking list on zongheng.com (纵横中文网)
  2. Save each novel's name, author, chapter names, and chapter content to a database
  3. The expected result is shown in the screenshots below

novel table (screenshot)

chapter table (screenshot)

Database design

CREATE DATABASE IF NOT EXISTS `zongheng`;

USE `zongheng`;


/*Table structure for table `novel` */

DROP TABLE IF EXISTS `novel`;

CREATE TABLE `novel` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `novelName` VARCHAR(255) COLLATE utf8_bin NOT NULL UNIQUE,
  `author` VARCHAR(255) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;


/*Table structure for table `chapter` */

DROP TABLE IF EXISTS `chapter`;

CREATE TABLE `chapter` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `chapterName` VARCHAR(255) COLLATE utf8_bin NOT NULL,
  `content` TEXT COLLATE utf8_bin NULL,
  `novelId` INT(11) NOT NULL,
  PRIMARY KEY (`id`),
  KEY `FK_chapter` (`novelId`),
  CONSTRAINT `FK_chapter` FOREIGN KEY (`novelId`) REFERENCES `novel` (`id`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
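
If you prefer to create the schema from Python instead of a MySQL client, a minimal sketch with pymysql could look like the following. It assumes the statements above are saved in a file named zongheng.sql (a name chosen here for illustration) and that MySQL is reachable with the same root/123456 credentials used in pipelines.py below.

import pymysql

# Sketch only: execute the schema file from Python.
# Splitting on ';' is good enough here because the dump contains no semicolons
# inside strings or comments.
connection = pymysql.connect(host='localhost', user='root', password='123456', charset='utf8mb4')
try:
    with open('zongheng.sql', encoding='utf8') as f:
        statements = [s.strip() for s in f.read().split(';') if s.strip()]
    with connection.cursor() as cursor:
        for statement in statements:
            cursor.execute(statement)   # CREATE DATABASE / USE / DROP / CREATE TABLE
    connection.commit()
finally:
    connection.close()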

File structure

Project file structure (screenshot)
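
The original screenshot is not reproduced here; based on the files shown below, the layout produced by scrapy startproject (plus the manually added spider.py and run.py; the exact placement of run.py is an assumption) is roughly:

zongheng/
├── scrapy.cfg
├── run.py
└── zongheng/
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── spider.py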

Implementation

Create the project

scrapy startproject zongheng

spider.py

import scrapy
from zongheng.items import ZonghengItem
import copy

class ZongHengSpider(scrapy.Spider):
    name = "zongheng"

    start_urls = ["http://www.zongheng.com/"]

    def parse(self, response):
        books_info = response.xpath('//ul[@id="monthTicketRankList"]//a')

        for book in books_info:
            item = ZonghengItem()
            href = book.xpath('./@href').extract_first()          # link to the book page
            book_name = book.xpath('./text()').extract_first()    # novel name
            # Turn http://book.zongheng.com/book/1013348.html
            # into http://book.zongheng.com/showchapter/1013348.html (the full chapter list)
            href = href[:25] + href[25:].replace("book", "showchapter")
            print(href, book_name)
            item['book'] = book_name
            yield scrapy.Request(url=href, callback=self.parse_dir, meta={'item': item})
            
    def parse_dir(self, response):
        '''Novel chapter list'''
        item = response.meta['item']
        chapter_info = response.xpath('//li[@class=" col-4"]/a')
        author = response.xpath('//div[@class="book-meta"]//a/text()').extract_first()
        print('author:', author)
        item['author'] = author

        for chapter in chapter_info:
            href = chapter.xpath('./@href').extract_first()                  # chapter-content URL
            item['chapter'] = chapter.xpath('./text()').extract_first()      # chapter name

            # Pass a deep copy: meta only stores a reference, so without deepcopy
            # every pending request would end up sharing the same mutated item.
            yield scrapy.Request(url=href, callback=self.parse_content,
                                 meta={'item': copy.deepcopy(item)})

    def parse_content(self, response):
        '''Chapter content'''
        item = response.meta['item']
        # Text of the <p> tags holding the chapter body
        content = response.xpath('//div[@class="content"]/p/text()').extract()
        # Join the paragraphs into a single string
        item['content'] = '\n'.join(content)
        yield item
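
As an aside, on Scrapy 1.7+ extra data can be passed to callbacks through cb_kwargs instead of meta; each request still needs its own copy of the item, since the same object is mutated inside the loop. A rough sketch (not from the original article) of the same hand-off:

# Inside parse_dir, replacing the meta-based request:
yield scrapy.Request(url=href, callback=self.parse_content,
                     cb_kwargs={'item': copy.deepcopy(item)})

# parse_content then receives the item as a keyword argument:
def parse_content(self, response, item):
    content = response.xpath('//div[@class="content"]/p/text()').extract()
    item['content'] = '\n'.join(content)
    yield item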

items.py


import scrapy


class ZonghengItem(scrapy.Item):
    book = scrapy.Field()
    author = scrapy.Field()
    chapter = scrapy.Field()
    content = scrapy.Field()

pipelines.py

import pymysql

class ZonghengPipeline(object):
    def process_item(self, item, spider):
        # Insert the novel (ignored if it already exists), then resolve its id.
        # Items from different novels arrive interleaved, so the id must be looked
        # up per item rather than remembered from the previous insert.
        with self.connection.cursor() as cursor:
            sql = "INSERT IGNORE INTO novel(novelName, author) VALUES(%s, %s)"
            try:
                result = cursor.execute(sql, (item['book'], item['author']))
                if result != 0:                          # a new novel row was inserted
                    novel_id = self.connection.insert_id()
                    self.connection.commit()
                else:                                    # the novel already exists
                    cursor.execute("SELECT id FROM novel WHERE novelName = %s", (item['book'],))
                    novel_id = cursor.fetchone()['id']
            except Exception as e:
                print(e)
                return item
        # Insert the chapter, linked to its novel via novelId
        with self.connection.cursor() as cursor:
            sql = "INSERT INTO chapter(chapterName, content, novelId) VALUES(%s, %s, %s)"
            try:
                cursor.execute(sql, (item['chapter'], item['content'], novel_id))
                self.connection.commit()
            except Exception as e:
                print(e)
        return item
    def open_spider(self,spider):
        print("spider start")
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='zongheng',
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)
        
    def close_spider(self, spider):
        print("spider end")
        self.connection.close()
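
To sanity-check the pipeline after a crawl, one quick option (not part of the original article) is to count chapters per novel through the foreign key:

import pymysql

# Sketch: verify that chapters were linked to the right novels.
connection = pymysql.connect(host='localhost', user='root', password='123456',
                             db='zongheng', charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    cursor.execute("""
        SELECT n.novelName, n.author, COUNT(c.id) AS chapters
        FROM novel n LEFT JOIN chapter c ON c.novelId = n.id
        GROUP BY n.id
    """)
    for row in cursor.fetchall():
        print(row['novelName'], row['author'], row['chapters'])
connection.close()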

run.py

from scrapy import cmdline

cmdline.execute("scrapy crawl zongheng -s LOG_ENABLED=False".split())
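
Alternatively, the spider can be started programmatically with CrawlerProcess, which picks up settings.py via get_project_settings; a rough equivalent of run.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Sketch: run the spider without shelling out to the scrapy CLI.
process = CrawlerProcess(get_project_settings())
process.crawl("zongheng")        # spider name defined in spider.py
process.start()                  # blocks until the crawl finishes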

settings.py

# Uncomment ITEM_PIPELINES and point it at this project's pipeline
ITEM_PIPELINES = {
    'zongheng.pipelines.ZonghengPipeline': 300,
}
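
Depending on how hard you want to hit the site, a couple of other settings are commonly adjusted as well; the values below are only an example, not part of the original article:

# Optional: slow the crawl down a little to be gentler with the site
DOWNLOAD_DELAY = 0.5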

Download

https://github.com/CarveStone/scrapy-crawl-zhongheng

Reposted from blog.csdn.net/weixin_44018458/article/details/109226790