Settings
# -*- coding: utf-8 -*-
# Scrapy settings for yangguang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# scrapy_redis settings for incremental (resumable) crawling
# Dedup class used to filter duplicate request objects
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler: use the redis-backed scheduler queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Persist the queue and fingerprints in redis; if False, redis is cleared when the crawl closes
SCHEDULER_PERSIST = True
# Connection URL used to reach redis
REDIS_URL = "redis://127.0.0.1:6379"
# Pipeline provided by scrapy_redis that saves scraped items into redis.
ITEM_PIPELINES = {
    # ... project pipelines ...
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# The redis connection can also be configured as:
# REDIS_HOST = "192.168.207.124"
# REDIS_PORT = 6379
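# --- Example (not part of settings.py): a minimal scrapy_redis spider sketch ---
# The spider name and redis_key below are assumptions for illustration, not the
# project's real spider. Start URLs are pushed into the redis list named by
# redis_key (e.g. `lpush demo:start_urls http://example.com`); the Scheduler above
# keeps the pending request queue in redis (by default under "<spider>:requests")
# and RFPDupeFilter stores request fingerprints there, so a stopped crawl can resume.
from scrapy_redis.spiders import RedisSpider

class DemoRedisSpider(RedisSpider):
    name = "demo"
    redis_key = "demo:start_urls"  # redis list that seeds the crawl

    def parse(self, response):
        # Yielded requests go through the redis-backed scheduler/dupefilter;
        # yielded items are serialized into redis by RedisPipeline.
        yield {"url": response.url, "title": response.css("title::text").get()}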
-----------------------------------------------------------------------------------------------
# Project name
BOT_NAME = 'yangguang'
# Where spider modules live
SPIDER_MODULES = ['yangguang.spiders']
# Where newly generated spiders are created
NEWSPIDER_MODULE = 'yangguang.spiders'
# Log how cookies are passed between requests and responses
COOKIES_DEBUG = True
# Minimum log level to report
LOG_LEVEL = "WARNING"
# File to write the log to (output is no longer shown in the terminal)
# LOG_FILE="./log.log"
# User-Agent sent with requests (the browser's identity string)
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'
# Obey robots.txt (the parts of the site allowed to be crawled)
ROBOTSTXT_OBEY = True
# MongoDB host (the local machine)
MONGO_HOST = "localhost"
# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Delay for requests to the same website (default: 0 seconds)
#DOWNLOAD_DELAY = 3
# Maximum concurrent requests per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum concurrent requests per IP
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable the Telnet console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers (note: User-Agent should not be set here; use the USER_AGENT setting above)
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangDownloaderMiddleware': 543,
#}
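# --- Example (not part of settings.py): a downloader middleware sketch ---
# A hedged sketch of what a class registered in DOWNLOADER_MIDDLEWARES could look
# like; the class name and User-Agent strings are assumptions, not the project's
# actual YangguangDownloaderMiddleware.
import random

class RandomUserAgentMiddleware:
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36",
    ]

    def process_request(self, request, spider):
        # Called for every outgoing request; pick a User-Agent at random.
        request.headers["User-Agent"] = random.choice(self.USER_AGENTS)
        return None  # returning None lets the request continue through the chain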
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES must be enabled for yielded items to reach pipelines.py and be stored in MongoDB (300 is the priority value; lower values run earlier)
ITEM_PIPELINES = {
'yangguang.pipelines.YangguangPipeline': 300,
}
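# --- Example (not part of settings.py): a MongoDB pipeline sketch ---
# A minimal sketch of what a pipeline like YangguangPipeline might do with the
# MONGO_HOST setting above; pymongo plus the database and collection names are
# assumptions, and the real pipeline may differ.
from pymongo import MongoClient

class MongoPipelineSketch:
    def open_spider(self, spider):
        # Read the custom MONGO_HOST setting defined in this file.
        host = spider.settings.get("MONGO_HOST", "localhost")
        self.client = MongoClient(host, 27017)
        self.collection = self.client["yangguang"]["items"]  # assumed db/collection names

    def process_item(self, item, spider):
        # Store every yielded item as a plain dict, then pass it along.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()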
# Enable and configure the AutoThrottle extension (disabled by default) to keep requests from coming so fast that they overwhelm the server
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'