Task
- Use Scrapy to crawl the monthly ticket ranking on zongheng.com
- Save the novel name, author, chapter names, and chapter content to a database
- The result looks like the figures below
novel table
chapter table
Database design
CREATE DATABASE IF NOT EXISTS `zongheng`;
USE `zongheng`;
/*Table structure for table `novel` */
DROP TABLE IF EXISTS `novel`;
CREATE TABLE `novel` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `novelName` VARCHAR(255) COLLATE utf8_bin NOT NULL UNIQUE,
  `author` VARCHAR(255) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
/*Data for the table `novel` */
/*Table structure for table `chapter` */
DROP TABLE IF EXISTS `chapter`;
CREATE TABLE `chapter` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `chapterName` VARCHAR(255) COLLATE utf8_bin NOT NULL,
  `content` TEXT COLLATE utf8_bin NULL,
  `novelId` INT(11) NOT NULL,
  PRIMARY KEY (`id`),
  KEY `FK_chapter` (`novelId`),
  CONSTRAINT `FK_chapter` FOREIGN KEY (`novelId`) REFERENCES `novel` (`id`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
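To check the schema after a crawl, a small pymysql script can join the two tables through the novelId foreign key and count the chapters per novel. This is a minimal sketch, assuming the same local MySQL credentials used in pipelines.py below; adjust them for your own setup.

import pymysql

connection = pymysql.connect(host='localhost', user='root', password='123456',
                             db='zongheng', charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # One row per novel, with the number of chapters linked via novelId
        cursor.execute("""
            SELECT n.novelName, n.author, COUNT(c.id) AS chapters
            FROM novel n
            LEFT JOIN chapter c ON c.novelId = n.id
            GROUP BY n.id
        """)
        for row in cursor.fetchall():
            print(row['novelName'], row['author'], row['chapters'])
finally:
    connection.close()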
File structure
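A freshly generated project (scrapy startproject zongheng), plus the spider.py and run.py added in this post, roughly follows the layout below; the exact generated files can differ slightly between Scrapy versions.

zongheng/
├── run.py
├── scrapy.cfg
└── zongheng/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── spider.py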
Code implementation
Create the project
scrapy startproject zongheng
spider.py
import scrapy
from zongheng.items import ZonghengItem
import copy


class ZonghengSpider(scrapy.Spider):
    name = "zongheng"
    start_urls = ["http://www.zongheng.com/"]

    def parse(self, response):
        # Each <a> in the monthly ticket ranking list links to a novel's detail page
        books_info = response.xpath('//ul[@id="monthTicketRankList"]//a')
        for book in books_info:
            item = ZonghengItem()
            href = book.xpath('./@href').extract_first()         # detail page url
            book_name = book.xpath('./text()').extract_first()   # novel name
            # Turn http://book.zongheng.com/book/1013348.html into
            # http://book.zongheng.com/showchapter/1013348.html (the full table of contents).
            # Replacing only "/book/" leaves the "book." in the host name untouched.
            href = href.replace("/book/", "/showchapter/")
            print(href, book_name)
            item['book'] = book_name
            yield scrapy.Request(url=href, callback=self.parse_dir, meta={'item': item})

    def parse_dir(self, response):
        '''Novel table of contents'''
        item = response.meta['item']
        chapter_info = response.xpath('//li[@class=" col-4"]/a')
        author = response.xpath('//div[@class="book-meta"]//a/text()').extract_first()
        print('author:', author)
        item['author'] = author
        for chapter in chapter_info:
            href = chapter.xpath('./@href').extract_first()              # chapter content url
            item['chapter'] = chapter.xpath('./text()').extract_first()  # chapter name
            # Pass a deep copy of the item: all Requests built in this loop would otherwise
            # share the same item object, so every callback would see only the last chapter.
            yield scrapy.Request(url=href, callback=self.parse_content,
                                 meta={'item': copy.deepcopy(item)})

    def parse_content(self, response):
        '''Chapter content'''
        item = response.meta['item']
        content = response.xpath('//div[@class="content"]/p/text()').extract()  # chapter <p> paragraphs
        item['content'] = '\n'.join(content)
        yield item
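The deepcopy in parse_dir matters because every Request built in that loop would otherwise carry a reference to the same item object, and by the time the content callbacks run they would all see whatever chapter was assigned last. A standalone sketch of the difference:

import copy

item = {'book': 'demo'}
shared, copied = [], []
for chapter in ['ch1', 'ch2', 'ch3']:
    item['chapter'] = chapter
    shared.append(item)                  # same object appended every time
    copied.append(copy.deepcopy(item))   # independent snapshot per chapter

print([i['chapter'] for i in shared])  # ['ch3', 'ch3', 'ch3']
print([i['chapter'] for i in copied])  # ['ch1', 'ch2', 'ch3']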
items.py
import scrapy


class ZonghengItem(scrapy.Item):
    book = scrapy.Field()     # novel name
    author = scrapy.Field()   # author name
    chapter = scrapy.Field()  # chapter name
    content = scrapy.Field()  # chapter text
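ZonghengItem behaves like a dict restricted to the declared fields, which is why the spider can assign item['book'], item['chapter'] and so on, while a misspelled key fails immediately:

item = ZonghengItem()
item['book'] = 'demo novel'
item['author'] = 'demo author'
print(dict(item))       # {'book': 'demo novel', 'author': 'demo author'}
item['title'] = 'oops'  # KeyError: ZonghengItem does not support field: title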
pipelines.py
import pymysql


class ZonghengPipeline(object):
    def process_item(self, item, spider):
        # Insert the novel (ignoring duplicates), then look up its id by name.
        # Items from different novels arrive interleaved, so relying on the id
        # of the last successful insert would attach chapters to the wrong novel.
        with self.connection.cursor() as cursor:
            sql = "INSERT IGNORE INTO novel(novelName, author) VALUES (%s, %s)"
            try:
                cursor.execute(sql, (item['book'], item['author']))
                self.connection.commit()
                cursor.execute("SELECT id FROM novel WHERE novelName = %s", (item['book'],))
                novel_id = cursor.fetchone()['id']  # DictCursor returns rows as dicts
            except Exception as e:
                print(e)
                return item
        # Insert the chapter, linked to the novel through the novelId foreign key
        with self.connection.cursor() as cursor:
            sql = "INSERT INTO chapter(chapterName, content, novelId) VALUES (%s, %s, %s)"
            try:
                cursor.execute(sql, (item['chapter'], item['content'], novel_id))
                self.connection.commit()
            except Exception as e:
                print(e)
        return item

    def open_spider(self, spider):
        print("spider start")
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='zongheng',
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)

    def close_spider(self, spider):
        print("spider end")
        self.connection.close()
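Hard-coding the MySQL credentials is fine for a demo; a common alternative is to read them from settings.py through from_crawler. A minimal sketch (the MYSQL_* setting names are made up for illustration and would need to be added to settings.py):

import pymysql

class ZonghengPipeline(object):
    def __init__(self, host, user, password, db):
        self.host, self.user, self.password, self.db = host, user, password, db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection details from the project settings, with fallbacks
        s = crawler.settings
        return cls(s.get('MYSQL_HOST', 'localhost'),
                   s.get('MYSQL_USER', 'root'),
                   s.get('MYSQL_PASSWORD', ''),
                   s.get('MYSQL_DB', 'zongheng'))

    def open_spider(self, spider):
        self.connection = pymysql.connect(host=self.host, user=self.user,
                                          password=self.password, db=self.db,
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)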
run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl zongheng -s LOG_ENABLED=False".split())
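The same crawl can also be started from a script without going through the scrapy command line, using CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('zongheng')  # spider name from the name attribute
process.start()            # blocks until the crawl finishes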
settings.py
# Uncomment the item pipeline setting and point it at this project's pipeline class
ITEM_PIPELINES = {
    'zongheng.pipelines.ZonghengPipeline': 300,
}
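A few other settings are worth considering for a crawl like this; the values below are suggestions, not requirements:

ROBOTSTXT_OBEY = True                # respect the site's robots.txt
DOWNLOAD_DELAY = 1                   # wait one second between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4   # keep the request rate modest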