Scrapy settings (settings.py) configuration

Settings reference

# -*- coding: utf-8 -*-

 

# Scrapy settings for yangguang project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# --- scrapy_redis incremental-crawl settings ---

# Deduplicate requests with scrapy_redis's Redis-backed fingerprint filter
# instead of Scrapy's default in-memory dupefilter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the Redis-backed scheduler so the request queue lives in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Keep the request queue and fingerprint set in Redis after the spider
# closes; with False, the Redis keys are flushed on shutdown (no resume).
SCHEDULER_PERSIST = True

# Connection URL for the Redis server used by the components above.
REDIS_URL = "redis://127.0.0.1:6379"

# Pipeline shipped with scrapy_redis that stores scraped items in Redis.
ITEM_PIPELINES = {
    # ... your project's own pipelines go here ...
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Equivalent alternative to REDIS_URL (note: plain ASCII quotes required):
# REDIS_HOST = "192.168.207.124"
# REDIS_PORT = 6379

-----------------------------------------------------------------------------------------------

 

# Project name.
BOT_NAME = 'yangguang'

# Packages where Scrapy searches for spiders.
SPIDER_MODULES = ['yangguang.spiders']

# Package where `scrapy genspider` places newly generated spiders.
NEWSPIDER_MODULE = 'yangguang.spiders'

# Log the cookies exchanged in every request/response (debugging aid).
COOKIES_DEBUG = True

# Only emit log messages at WARNING level or above.
LOG_LEVEL = "WARNING"

# Redirect log output to a file instead of the terminal.
# LOG_FILE = "./log.log"

# User-Agent header sent with requests (identifies the client browser).
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'

# Obey robots.txt rules (restricts crawling to what the site permits).
ROBOTSTXT_OBEY = True

# MongoDB host used by the project's pipeline.
# Fixed: was "local_host", which is not a resolvable hostname.
MONGO_HOST = "localhost"

 

# 配置Scrapy执行的最大并发请求(默认值:16)

#CONCURRENT_REQUESTS = 32

 

#配置对同一网站要求延迟(默认值:0秒) 

#DOWNLOAD_DELAY = 3

 

# 每个域名请求并发数

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

# 每个ip请求并发数

#CONCURRENT_REQUESTS_PER_IP = 16

 

# 禁用cookie(默认启用)

#COOKIES_ENABLED = False

 

# 禁用Telnet控制台(默认启用)

#TELNETCONSOLE_ENABLED = False

 

# 覆盖默认请求头 (注:User-Agent不能写到这里)

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

 

# 启用或禁用spider中间件

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'yangguang.middlewares.YangguangSpiderMiddleware': 543,

#}

 

# 启用或禁用downloader中间件

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'yangguang.middlewares.YangguangDownloaderMiddleware': 543,

#}

 

# 启用或禁用扩展

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

 

# Project item pipelines.
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enabling ITEM_PIPELINES routes every yielded item through pipelines.py
# (e.g. to store it in MongoDB). The number is the priority: smaller
# values run earlier (valid range 0-1000).
ITEM_PIPELINES = {'yangguang.pipelines.YangguangPipeline': 300}

 

# 启用并配置自动节流阀扩展(默认禁用) 防止请求过快,将服务器抓崩。

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

 

# 启用和配置HTTP缓存(默认禁用)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Source: https://blog.csdn.net/zlc1990628/article/details/84328372