Deploying Scrapy to a server: a small example (scrapyd + supervisor)

This project is for learning and exchange only; please do not use it commercially.

I. Create the project

# Create a Scrapy spider project
scrapy startproject meinv

cd meinv/

# List the available spider templates, then create a site-wide crawler from the crawl template
scrapy genspider -l
scrapy genspider -t crawl mm 2717.com

II. Project code

./meinv/spiders/mm.py
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from ..items import MeinvItem

class MmSpider(CrawlSpider):
    name = 'mm'
    allowed_domains = ['2717.com']
    start_urls = ['https://www.2717.com/ent/meinvtupian/']

    rules = (
        # Pagination links on the listing pages
        Rule(LinkExtractor(allow=r'list_\d+_\d+\.html'), ),
        # Pagination links within a single album
        Rule(LinkExtractor(allow=r'/ent/meinvtupian/2019/\d+.html'), ),
        # Individual photo pages; these contain the content to extract
        Rule(LinkExtractor(allow=r'\d+_\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        loader = ItemLoader(MeinvItem(), response=response)
        loader.add_xpath('title', '//div[@class="articleV4Body"]/p/a[1]/img/@alt')
        loader.add_xpath('url', '//div[@class="articleV4Body"]/p/a[1]/img/@src')
        yield loader.load_item()
./meinv/items.py
import scrapy, re
from scrapy.loader.processors import TakeFirst, Join, MapCompose

# Strip all whitespace from an extracted string
def delete_blank(data):
    return re.sub(r'\s+', '', data)

class MeinvItem(scrapy.Item):

    title = scrapy.Field(
        input_processor=MapCompose(delete_blank),
        output_processor=Join()
    )

    url = scrapy.Field(
        output_processor=TakeFirst()
    )
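
To see what these processors do, here is a quick standalone check (not part of the project; the sample strings are made up for illustration):

from scrapy.loader.processors import MapCompose, Join, TakeFirst
import re

def delete_blank(data):
    return re.sub(r'\s+', '', data)

# MapCompose applies delete_blank to every extracted value;
# Join() then concatenates the cleaned values with a space.
values = [' 美女 图片 ', '第 1 页']                 # hypothetical extracted values
print(Join()(MapCompose(delete_blank)(values)))     # -> '美女图片 第1页'

# TakeFirst() keeps only the first non-empty value, so item['url'] ends up as a single string
urls = ['https://example.com/a.jpg', 'https://example.com/b.jpg']
print(TakeFirst()(urls))                            # -> 'https://example.com/a.jpg'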
./meinv/pipelines.py
# -*- coding: utf-8 -*-
import pymysql

class MeinvPipeline(object):

    def __init__(self):
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='******',
            database='scrapy_test',
            charset='utf8'
        )
        # Get a cursor that executes SQL and returns each row as a dictionary
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        # Create the table if it does not exist
        self.cursor.execute('create table if not exists mm(id int not null primary key auto_increment, title varchar(255) default null, url varchar(255) default null);')
        self.conn.commit()

    def process_item(self, item, spider):

        if spider.name == 'mm':
            print('*' * 20)
            print(item['title'])
            print(item['url'])  # TakeFirst() already returns a single string, not a list
            print('*' * 20)
            print()
            # Parameterized insert; MySQL fills in the auto_increment id
            self.cursor.execute("insert into mm (title, url) values (%s, %s);", (item['title'], item['url']))
            self.conn.commit()
        return item
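
The pipeline above opens a MySQL connection in __init__ but never closes it. A minimal sketch of a close_spider hook (not in the original project) that releases the connection when the spider finishes:

    def close_spider(self, spider):
        # Called once when the spider closes; free the MySQL resources
        self.cursor.close()
        self.conn.close()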

./meinv/settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
ITEM_PIPELINES = {
    'meinv.pipelines.MeinvPipeline': 300,
}

III. Run the spider

scrapy crawl mm
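
Besides the command line, the spider can also be launched from a short Python script, which is handy for local debugging before deploying. A minimal sketch using Scrapy's CrawlerProcess (the file name run.py is arbitrary):

# run.py, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from meinv.spiders.mm import MmSpider

if __name__ == '__main__':
    # Load the project settings (pipeline, headers, download delay) and start the crawl
    process = CrawlerProcess(get_project_settings())
    process.crawl(MmSpider)
    process.start()  # blocks until the crawl finishes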

IV. Server deployment

1. Install scrapyd

pip install scrapyd

2. Configuration

/data/virtualenvs/testdemo/lib/python3.6/site-packages/scrapyd/default_scrapyd.conf
bind_address = 0.0.0.0  # listen on all interfaces so the service is reachable from outside the machine

3. Open ports 5000 and 6800

firewall-cmd --list-all
firewall-cmd --permanent --add-port=5000/tcp
firewall-cmd --permanent --add-port=6800/tcp
firewall-cmd --reload

4. Start the service and test it

Run the scrapyd command to start the scrapyd service, then open http://192.168.5.149:6800/ in a browser.
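
Besides opening it in a browser, the scrapyd JSON API can be checked directly from Python. A small sketch using requests (assumes pip install requests; host and port as above):

import requests

# daemonstatus.json reports whether scrapyd is up and how many jobs are pending/running/finished
resp = requests.get('http://192.168.5.149:6800/daemonstatus.json')
print(resp.json())  # e.g. {'status': 'ok', 'pending': 0, 'running': 0, 'finished': 0, ...}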

5. Install scrapyd-client

pip install scrapyd-client
Configure ./scrapy.cfg:
[settings]
default = meinv.settings

[deploy:mm]
url = http://192.168.5.149:6800/
project = meinv
scrapyd-deploy -l  # list the configured deploy targets
Possible error:
    scrapyd-deploy:23: ScrapyDeprecationWarning: Module `scrapy.utils.http` is deprecated, Please import from `w3lib.http` instead. from scrapy.utils.http import basic_auth_header
Fix:
    in the scrapyd-deploy script, change from scrapy.utils.http import basic_auth_header to from w3lib.http import basic_auth_header

6. Deploy and run

# Start the scrapyd service
scrapyd

# Upload the Scrapy project to the scrapyd server: scrapyd-deploy <target> -p <project>
scrapyd-deploy mm -p meinv

# Schedule the spider
curl http://192.168.5.149:6800/schedule.json -d project=meinv -d spider=mm
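
The same scheduling call, plus listing and cancelling jobs, can also be made from Python. A minimal sketch against scrapyd's standard endpoints (assumes pip install requests):

import requests

BASE = 'http://192.168.5.149:6800'

# Schedule the mm spider; scrapyd returns a job id
job = requests.post(BASE + '/schedule.json', data={'project': 'meinv', 'spider': 'mm'}).json()
print(job)  # e.g. {'status': 'ok', 'jobid': '...'}

# List pending/running/finished jobs for the project
print(requests.get(BASE + '/listjobs.json', params={'project': 'meinv'}).json())

# Cancel a running job by its job id
requests.post(BASE + '/cancel.json', data={'project': 'meinv', 'job': job['jobid']})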

V. Running the services under supervisor

1. Install

# process daemon for keeping the services running
pip install supervisor
# web UI for managing spiders
pip install spiderkeeper

2. supervisor configuration file

mkdir /etc/supervisor

echo_supervisord_conf > /etc/supervisor/supervisord.conf

vim /etc/supervisor/supervisord.conf

[include]
files = conf.d/*.conf

3. Add the scrapyd program configuration

mkdir /etc/supervisor/conf.d

vim /etc/supervisor/conf.d/scrapyd.conf

[program:scrapyd]
autostart=true
directory=/data/workspace/scrapy_project/meinv
command=/data/virtualenvs/testdemo/bin/scrapyd
user=root
stderr_logfile=/var/log/scrapyd.err.log
stdout_logfile=/var/log/scrapyd.out.log

4. spiderkeeper (web UI for managing spiders)

# Add the spiderkeeper program configuration (spiderkeeper can manage several scrapyd instances; just add more --server options)
vim /etc/supervisor/conf.d/spiderkeeper.conf

[program:spiderkeeper]
directory=/data/workspace/scrapy_project/meinv
command=/data/virtualenvs/testdemo/bin/spiderkeeper --server=http://192.168.5.149:6800
user=root
stderr_logfile=/var/log/spiderkeeper.err.log
stdout_logfile=/var/log/spiderkeeper.out.log

5. supervisor commands

# Start with the specified configuration file
supervisord -c /etc/supervisor/supervisord.conf
            
# Restart
supervisorctl reload

# Shut down
supervisorctl shutdown

6. Manage supervisord via systemctl

vim /lib/systemd/system/supervisord.service


[Unit]
Description=supervisord

[Service]
Type=forking
ExecStart=/usr/local/python36/bin/supervisord -c /etc/supervisor/supervisord.conf
ExecReload=/usr/local/python36/bin/supervisorctl reload
ExecStop=/usr/local/python36/bin/supervisorctl shutdown
KillMode=process
Restart=on-failure
RestartSec=42s

[Install]
WantedBy=multi-user.target



systemctl daemon-reload
systemctl enable supervisord.service
systemctl start supervisord

7. Managing spiders in the spiderkeeper web UI

# Log in
http://192.168.5.149:5000

# In the web UI, create a new project
# Build the spider egg file (requires scrapyd-client, installed earlier)
scrapyd-deploy --build-egg output.egg
# Then upload the egg and run the spider from the web UI
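
The egg can also be uploaded to scrapyd without going through the web UI, using the addversion.json endpoint. A sketch (the version string 1.0 is arbitrary; assumes pip install requests):

import requests

# Upload output.egg as a new version of the meinv project
with open('output.egg', 'rb') as egg:
    resp = requests.post('http://192.168.5.149:6800/addversion.json',
                         data={'project': 'meinv', 'version': '1.0'},
                         files={'egg': egg})
print(resp.json())  # e.g. {'status': 'ok', 'spiders': 1}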

Reposted from blog.csdn.net/qq_36072270/article/details/104496948