Settings
# -*- coding: utf-8 -*-
# Scrapy settings for yangguang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# scrapy_redis settings for incremental (resumable) crawling
# Dedup class used to filter duplicate request objects
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler: use the redis-backed scheduler queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Persist the queue and fingerprints in redis; if False, redis is cleared when the crawl closes
SCHEDULER_PERSIST = True
# Connection URL used to reach redis
REDIS_URL = "redis://127.0.0.1:6379"
# Pipeline provided by scrapy_redis that saves scraped items into redis.
ITEM_PIPELINES = {
    # ... project pipelines ...
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# The redis connection can also be configured as:
# REDIS_HOST = "192.168.207.124"
# REDIS_PORT = 6379
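# --- Example (not part of settings.py): a minimal scrapy_redis spider sketch ---
# The spider name and redis_key below are assumptions for illustration, not the
# project's real spider. Start URLs are pushed into the redis list named by
# redis_key (e.g. `lpush demo:start_urls http://example.com`); the Scheduler above
# keeps the pending request queue in redis (by default under "<spider>:requests")
# and RFPDupeFilter stores request fingerprints there, so a stopped crawl can resume.
from scrapy_redis.spiders import RedisSpider

class DemoRedisSpider(RedisSpider):
    name = "demo"
    redis_key = "demo:start_urls"  # redis list that seeds the crawl

    def parse(self, response):
        # Yielded requests go through the redis-backed scheduler/dupefilter;
        # yielded items are serialized into redis by RedisPipeline.
        yield {"url": response.url, "title": response.css("title::text").get()}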
-----------------------------------------------------------------------------------------------
# Project name
BOT_NAME = 'yangguang'
# Where spider modules live
SPIDER_MODULES = ['yangguang.spiders']
# Where newly generated spiders are created
NEWSPIDER_MODULE = 'yangguang.spiders'
# Log how cookies are passed between requests and responses
COOKIES_DEBUG = True
# Minimum log level to report
LOG_LEVEL = "WARNING"
# File to write the log to (output is no longer shown in the terminal)
# LOG_FILE="./log.log"
# User-Agent sent with requests (the browser's identity string)
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'
# Obey robots.txt (the parts of the site allowed to be crawled)
ROBOTSTXT_OBEY = True
# MongoDB host (the local machine)
MONGO_HOST = "localhost"
# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Delay for requests to the same website (default: 0 seconds)
#DOWNLOAD_DELAY = 3
# Maximum concurrent requests per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum concurrent requests per IP
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable the Telnet console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers (note: User-Agent should not be set here; use the USER_AGENT setting above)
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangDownloaderMiddleware': 543,
#}
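# --- Example (not part of settings.py): a downloader middleware sketch ---
# A hedged sketch of what a class registered in DOWNLOADER_MIDDLEWARES could look
# like; the class name and User-Agent strings are assumptions, not the project's
# actual YangguangDownloaderMiddleware.
import random

class RandomUserAgentMiddleware:
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36",
    ]

    def process_request(self, request, spider):
        # Called for every outgoing request; pick a User-Agent at random.
        request.headers["User-Agent"] = random.choice(self.USER_AGENTS)
        return None  # returning None lets the request continue through the chain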
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES must be enabled for yielded items to reach pipelines.py and be stored in MongoDB (300 is the priority value; lower values run earlier)
ITEM_PIPELINES = {
'yangguang.pipelines.YangguangPipeline': 300,
}
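# --- Example (not part of settings.py): a MongoDB pipeline sketch ---
# A minimal sketch of what a pipeline like YangguangPipeline might do with the
# MONGO_HOST setting above; pymongo plus the database and collection names are
# assumptions, and the real pipeline may differ.
from pymongo import MongoClient

class MongoPipelineSketch:
    def open_spider(self, spider):
        # Read the custom MONGO_HOST setting defined in this file.
        host = spider.settings.get("MONGO_HOST", "localhost")
        self.client = MongoClient(host, 27017)
        self.collection = self.client["yangguang"]["items"]  # assumed db/collection names

    def process_item(self, item, spider):
        # Store every yielded item as a plain dict, then pass it along.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()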
# Enable and configure the AutoThrottle extension (disabled by default) to keep requests from coming so fast that they overwhelm the server
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'