1. Define your own pipeline in pipelines.py
import pymongo

class MongoPipeline(object):
    def __init__(self, client, db):
        self.client = pymongo.MongoClient(client)
        self.db = self.client[db]

    # from_crawler() reads the relevant configuration from settings.py,
    # so the values can be stored on the instance and used later.
    @classmethod
    def from_crawler(cls, crawler):
        # Build an instance of this class, passing in the two settings
        # (with fallbacks in case they are missing from settings.py).
        obj = cls(
            client=crawler.settings.get('MONGOCLIENT', 'localhost'),
            db=crawler.settings.get('DB', 'test')
        )
        return obj

    def process_item(self, item, spider):
        # Upsert keyed on URL: update the document if it exists, insert it otherwise.
        self.db['novel'].update_one({'url': item['url']}, {'$set': dict(item)}, upsert=True)
        return item
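process_item() assumes every item exposes a 'url' field, since that is the upsert key. A minimal sketch of a matching items.py (the class name NovelItem and the field title are assumptions for illustration, not taken from the original):

import scrapy

class NovelItem(scrapy.Item):
    # 'url' is required by MongoPipeline.process_item, which upserts on it.
    url = scrapy.Field()
    # Any other fields are written as-is by the $set update.
    title = scrapy.Field()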
2. Enable your pipeline in settings.py
ITEM_PIPELINES = {
    # Priority is any integer from 0 to 1000; lower values run earlier.
    'NovelSpider.pipelines.MongoPipeline': 301,
}
# Read by MongoPipeline.from_crawler() above.
MONGOCLIENT = 'localhost'
DB = 'novel'
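After a crawl, you can check the upserted documents directly with pymongo. A minimal sketch, assuming MongoDB is running on localhost and using the DB and collection names from above (the example URL is hypothetical):

import pymongo

client = pymongo.MongoClient('localhost')
# The pipeline writes to the 'novel' collection of the DB named in settings.py.
doc = client['novel']['novel'].find_one({'url': 'http://example.com/chapter-1'})
print(doc)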