JSON
Method 1: run a command in cmd
scrapy crawl novel -o novel.json -s FEED_EXPORT_ENCODING=UTF-8
Here novel is the spider's name (its name attribute).
To have Scrapy save the data as a CSV file instead: scrapy crawl novel -o novel.csv -s FEED_EXPORT_ENCODING=UTF-8
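The -s flag only applies to that one run. If every export should be UTF-8, the same option can be set once in settings.py instead (FEED_EXPORT_ENCODING is a standard Scrapy setting):

# settings.py
FEED_EXPORT_ENCODING = 'utf-8'

After that, scrapy crawl novel -o novel.json exports UTF-8 without the -s flag.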
Method 2: via pipelines
1. Define your own pipeline
pipelines.py
import json

class JsonWriterPipeline(object):
    def __init__(self):
        # open the output file once, when the pipeline is created
        self.file = open('jobbole.json', 'wb')

    def process_item(self, item, spider):
        # write one JSON object per line
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line.encode('utf-8'))
        return item
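Note that json.dumps escapes non-ASCII characters by default, so Chinese text ends up as \uXXXX sequences in jobbole.json. If you want the file to stay human-readable, a small variation (not in the original pipeline) is to pass ensure_ascii=False:

line = json.dumps(dict(item), ensure_ascii=False) + "\n"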
2. Enable your pipeline in settings
settings.py
ITEM_PIPELINES = {
    # 'Jobbole.pipelines.JobbolePipeline': 300,
    # save the data as JSON
    'Jobbole.pipelines.JsonWriterPipeline': 1,
}
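The MongoDB, MySQL, and Excel pipelines below all read the same seven fields from the item. For reference, a hypothetical items.py matching those field names (only the field names come from the pipeline code; the class name JobboleItem and the comments are assumptions):

import scrapy

class JobboleItem(scrapy.Item):    # hypothetical item class
    artitle_item = scrapy.Field()  # article title
    release_time = scrapy.Field()  # publish time
    fenlei = scrapy.Field()        # category
    dianzan = scrapy.Field()       # like count
    num = scrapy.Field()           # a count field (exact meaning depends on the spider)
    comment = scrapy.Field()       # comment count
    content = scrapy.Field()       # article body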
MongoDB
1. Define your own pipeline in pipelines.py
import pymongo

# save items to MongoDB
class MongoPipeline(object):
    def __init__(self, client, db):
        self.client = pymongo.MongoClient(client)
        self.db = self.client[db]

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls(
            client=crawler.settings.get('MONGOCLIENT', 'localhost'),
            db=crawler.settings.get('DB', 'jobbole')
        )
        return obj

    def process_item(self, item, spider):
        # upsert keyed on the article title, so re-crawled items are updated
        self.db['jobbole'].update_one(
            {'artitle_item': item['artitle_item']},
            {'$set': dict(item)},
            upsert=True)
        return item
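This pipeline never closes the MongoDB connection (the JSON pipeline above has the same issue with its file handle). Scrapy also calls close_spider(spider) on a pipeline when the crawl ends, so a cleanup method can be added to the class, e.g. a minimal sketch:

    def close_spider(self, spider):
        # release the MongoClient and its connection pool
        self.client.close()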
2. Enable your pipeline in settings.py
ITEM_PIPELINES = {
    # save to MongoDB
    'Jobbole.pipelines.MongoPipeline': 1,
}
MONGOCLIENT = 'localhost'
DB = 'jobbole'
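To confirm that items actually landed in MongoDB, a quick check from a Python shell (assuming the defaults above, where both the database and the collection are named jobbole; count_documents requires a reasonably recent pymongo):

import pymongo

client = pymongo.MongoClient('localhost')
print(client['jobbole']['jobbole'].count_documents({}))  # number of saved items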
MySQL
1. Define your own pipeline in pipelines.py
import pymysql

class DBPipeline(object):
    def __init__(self, host, port, db, user, passwd, charset):
        self.db = pymysql.connect(host=host, port=port, db=db, user=user,
                                  passwd=passwd, charset=charset)
        self.cursor = self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # connect to the database using the project settings
        obj = cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            port=3306,
            db=crawler.settings.get('MYSQL_DBNAME', 'jobbole'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            passwd=crawler.settings.get('MYSQL_PASSWD', '123456'),
            charset='utf8')
        return obj

    def process_item(self, item, spider):
        # insert one row per item
        try:
            self.cursor.execute(
                """INSERT INTO jobbole
                   (artitle_item, release_time, fenlei, dianzan, num, comment, content)
                   VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                (item['artitle_item'], item['release_time'], item['fenlei'],
                 item['dianzan'], item['num'], item['comment'], item['content']))
            # commit the SQL statement
            self.db.commit()
            return item
        except Exception as e:
            print(e)
            self.db.rollback()
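The INSERT statement assumes a jobbole table with those seven columns already exists. A hypothetical one-off script to create it (the column types are guesses; adjust them to your data):

import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root',
                     passwd='123456', db='jobbole', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole (
        artitle_item VARCHAR(255),
        release_time VARCHAR(64),
        fenlei VARCHAR(64),
        dianzan VARCHAR(32),
        num VARCHAR(32),
        comment VARCHAR(32),
        content TEXT
    ) DEFAULT CHARSET = utf8
""")
db.close()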
2. Enable your pipeline in settings
ITEM_PIPELINES = {
    # save to MySQL
    'Jobbole.pipelines.DBPipeline': 1,
}
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'jobbole'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
Excel
1. Define your own pipeline in pipelines.py
from openpyxl import Workbook

# save items to an Excel workbook
class ExcelPipeline(object):
    wb = Workbook()  # create the workbook
    ws = wb.active   # activate the worksheet
    # header row: title, publish time, category, likes, num, comments, content
    # ('num' is a placeholder title: the original header listed six titles for seven fields)
    ws.append(['文章标题', '发布时间', '分类', '点赞次数', 'num', '评论次数', '内容'])

    def process_item(self, item, spider):
        # collect the item's fields into one row
        line = [item['artitle_item'], item['release_time'], item['fenlei'],
                item['dianzan'], item['num'], item['comment'], item['content']]
        self.ws.append(line)          # append the row to the worksheet
        self.wb.save('jobbole.xlsx')  # save the xlsx file
        return item
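Calling wb.save() inside process_item rewrites jobbole.xlsx for every single item, which gets slow on large crawls. A common variation (a sketch, added to the same class, with the wb.save call removed from process_item) is to save once when the spider closes:

    def close_spider(self, spider):
        # write the workbook to disk once, after the last item
        self.wb.save('jobbole.xlsx')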
2. Enable your pipeline in settings.py
ITEM_PIPELINES = {
    # save to Excel
    'Jobbole.pipelines.ExcelPipeline': 1,
}