If all else fails, grab a bottle of dichlorvos! Crawling a pesticide and fertilizer site with the Scrapy framework + asynchronous MySQL storage

I recently needed a lot of images for some development work. Having studied a bit of Python before, I wondered whether I could put together a crawler myself; a junior classmate also messaged me asking whether I could write him a Python script, so I just got on with it.

P.S. After finishing tonight's code I realized I'm already a bit hazy on Scrapy's asynchronous crawling. Skills can't be left to go stale: it has only been a few days and I've nearly forgotten it. Take it slow, keep a steady mindset, let's go!

# First, inspect the page's network requests to work out the crawler's basic logic

(screenshots of the page request inspection omitted)
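
Before wiring this into Scrapy, it helps to reproduce the request by hand and confirm what the interface returns. A minimal sketch with the `requests` library, assuming the same `dosearch` endpoint, form fields, and headers used by the spider below (the PHPSESSID value is just a placeholder):

```python
import requests

# Reproduce the search POST observed when inspecting the page requests.
# Endpoint, form fields, and headers mirror the spider code below;
# the PHPSESSID cookie value is a placeholder, not a real session.
resp = requests.post(
    'http://cha.191.cn/home/search/dosearch',
    data={'p': '1', 'keyword': '敌敌畏'},
    headers={
        'Cookie': 'PHPSESSID=xxxxxxxxxxxxxxxxxxxxxxxxxx',
        'Referer': 'http://cha.191.cn/home/search',
    },
    timeout=10,
)
print(resp.json())  # expect a JSON body with 'status' and 'data' -> 'list'
```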

  • ↓↓↓ Spider code ↓↓↓ the main crawler logic

# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import scrapy
import json
from nongcha.items import NongchaItem  # import the item fields


class NcSpider(scrapy.Spider):
    name = 'nc'
    allowed_domains = ['cha.191.cn']
    #start_urls = ['http://cha.191.cn/home/search']
    ua = UserAgent()
    user_agent = f'{ua.random}'

    def __init__(self):
        super(NcSpider, self).__init__()
        self.page = 1

    def start_requests(self):
        """Override the parent class's request-generating method."""
        yield scrapy.FormRequest(
            url='http://cha.191.cn/home/search/dosearch',
            formdata={
                'p': f'{self.page}',
                'keyword': '敌敌畏'},
            headers={
                'Cookie': 'PHPSESSID=komdv7aiqci9eq1hkf0u4npc63',
            #     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': 'http://cha.191.cn/home/search'
            }
        )
        # Send a FormRequest carrying the parameters the search interface expects

    def parse(self, response):
        """Parse the returned JSON data."""
        data = json.loads(response.text)
        if data['status'] == '0':  # check whether the request succeeded; note '0' is a string
            infos = data['data']['list']  # the detailed data: a list of (up to) 20 dicts
            if infos:   # only proceed if the list is non-empty
                for info in infos:  # iterate over the list
                    nc_id = info.get('id')  # product id
                    nc_name = info.get('name')  # product name
                    nc_company = info.get('company_name')  # company name
                    nc_registry = info.get('registry')  # registration number
                    nc_img = 'http://cha.191.cn/home/product/detailToImage/productId/' + nc_id  # build the product-detail image URL
                    print(nc_company)
                    item = NongchaItem()  # create an item instance so the data can be downloaded and stored
                    item['nc_id'] = nc_id
                    item['nc_name'] = nc_name
                    item['nc_company'] = nc_company
                    item['nc_registry'] = nc_registry
                    item['nc_img'] = [nc_img]
                    yield item

                self.page += 1  # move on to the next page
                try:
                    yield scrapy.FormRequest(
                        url='http://cha.191.cn/home/search/dosearch',
                        formdata={
                            'p': f'{self.page}',
                            'keyword': '敌敌畏'},
                        headers={
                            'Cookie': 'PHPSESSID=komdv7aiqci9eq1hkf0u4npc63',
                            #     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                            'Referer': 'http://cha.191.cn/home/search'
                        },
                        callback=self.parse
                    )
                    # Request the next page with the same form parameters; the crawl
                    # ends naturally once a page comes back with an empty list.
                except Exception as out_of_page:
                    print(out_of_page)
                    print('All data crawled!')


if __name__ == '__main__':
    from scrapy.cmdline import execute
    execute(['scrapy', 'crawl', 'nc'])
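
For reference, the JSON body that `parse` works with looks roughly like the sketch below. Only the keys the spider actually reads are shown, all values are placeholders, and the surrounding structure is an assumption inferred from how the response is accessed above.

```python
# Rough shape of the dosearch JSON response, inferred from parse() above.
# All values here are placeholders; only the keys the spider reads are shown.
example_response = {
    "status": "0",                # "0" (as a string) means success
    "data": {
        "list": [                 # one dict per product, roughly 20 per page
            {
                "id": "12345",                    # product id (placeholder)
                "name": "敌敌畏乳油",               # product name (placeholder)
                "company_name": "某农药有限公司",    # company name (placeholder)
                "registry": "PD8xxxxxxx",          # registration number (placeholder)
            },
        ]
    },
}
```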

  • ↓↓↓ Pipeline code ↓↓↓ probably the most involved part: image download + Excel export + asynchronous database storage

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline  # Scrapy's built-in image download pipeline
from scrapy import Request
import xlwt
from twisted.enterprise import adbapi
import pymysql
from nongcha.items import NongchaItem

class NongchaPipeline(object):

    def process_item(self, item, spider):
        return item


class CustomImagePipeline(ImagesPipeline):

    """Image download pipeline, extending the built-in ImagesPipeline."""

    def get_media_requests(self, item, info):  # override the method that builds the image download requests
        return [Request(x, meta={'item': item}) for x in item.get(self.images_urls_field, [])]
        # build a request for each image URL, carrying the item along in the request's meta

    def file_path(self, request, response=None, info=None):
        """Return the relative path the image will be saved under."""
        item = request.meta.get('item')  # retrieve the item from the request
        if item:
            nc_company = item['nc_company']  # company name
            nc_id = item['nc_id']  # product id
            path = f'{nc_company}/{nc_id}.jpg'  # build the image path: <company>/<id>.jpg
            item['img_path'] = f'images/{path}'  # record the stored path back into the item
            return path


class ExcelPipeline(object):
    """Pipeline that writes the data to an Excel sheet."""

    def open_spider(self, spider):
        """Called when the spider starts."""

        self.workbook = xlwt.Workbook(encoding='utf-8')  # create the workbook
        self.sheet = self.workbook.add_sheet('ncc_data')  # create the worksheet; the rows below write the header
        self.sheet.write(0, 0, 'nc_id')
        self.sheet.write(0, 1, 'nc_name')
        self.sheet.write(0, 2, 'nc_company')
        self.sheet.write(0, 3, 'nc_registry')
        self.sheet.write(0, 4, 'nc_img')
        self.sheet.write(0, 5, 'img_path')

        self.count = 0  # current row number

    def process_item(self, item, spider):
        """Write one item as a row."""
        self.count += 1  # move down one row per item
        self.sheet.write(self.count, 0, item['nc_id'])
        self.sheet.write(self.count, 1, item['nc_name'])
        self.sheet.write(self.count, 2, item['nc_company'])
        self.sheet.write(self.count, 3, item['nc_registry'])
        self.sheet.write(self.count, 4, item['nc_img'][0])
        self.sheet.write(self.count, 5, item['img_path'])

        self.workbook.save('ncc.data.xls')  # save the workbook after every row
        return item


class AsyncWriteMysql(object):
    """Pipeline that writes items to MySQL asynchronously."""

    def __init__(self):
        """Set up the connection pool."""
        params = dict(
            host='127.0.0.1',  # local host
            port=3306,  # port
            user='root',  # user
            password='123456',  # password
            db='ncc',  # database name
            charset='utf8',  # character set
            use_unicode=True,  # use unicode
            cursorclass=pymysql.cursors.DictCursor  # use a dict-style cursor
        )  # connection parameters as a dict
        self.db_pool = adbapi.ConnectionPool('pymysql', **params)  # create the connection pool: 1) the DB-API module name, 2) the connection parameters

    def process_item(self, item, spider):
        """Hand one item to the pool for insertion."""
        result = self.db_pool.runInteraction(self.insert_item, item)  # the pool runs the insert asynchronously and returns the result once the task completes
        result.addErrback(self.insert_error, item)  # attach an error callback to the result
        return item

    def insert_item(self, cursor, item):
        """Insert one item into the database; the cursor is supplied by the pool."""
        # item = NongchaItem()
        item.save(cursor)  # delegate to the item's own save(), so each item type decides how it is persisted

    def insert_error(self, fail, item):  # error callback
        print(item['nc_id'])  # print the id of the item that failed
        print(fail)  # print the failure details

  • **↓↓↓ Item code ↓↓↓ mainly the data fields plus saving the data into the database**

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NongchaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    nc_id = scrapy.Field()
    nc_name = scrapy.Field()
    nc_company = scrapy.Field()
    nc_registry = scrapy.Field()
    nc_img = scrapy.Field()
    img_path = scrapy.Field()

    def save(self, cursor):
        """Save this item to the database."""
        cursor.execute(
            "INSERT INTO ncc_data (nc_id, nc_name, nc_company, nc_registry, nc_img, img_path) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (self['nc_id'], self['nc_name'], self['nc_company'],
             self['nc_registry'], self['nc_img'][0], self['img_path'])
        )
        # execute the parameterized SQL statement
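
The `save` method above assumes an `ncc_data` table already exists in the `ncc` database. A possible schema, matching the columns in the INSERT and the connection parameters from the pipeline (column types and lengths are my own guess, adjust to your data):

```python
import pymysql

# One-off script to create the ncc_data table the pipeline writes to.
# Column names come from the INSERT statement above; types/lengths are assumptions.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', db='ncc', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS ncc_data (
            nc_id       VARCHAR(32)  NOT NULL,
            nc_name     VARCHAR(255),
            nc_company  VARCHAR(255),
            nc_registry VARCHAR(64),
            nc_img      VARCHAR(255),
            img_path    VARCHAR(255),
            PRIMARY KEY (nc_id)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```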

  • **↓↓↓ Settings code ↓↓↓ most of it stays commented out**

# -*- coding: utf-8 -*-

# Scrapy settings for nongcha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'nongcha'

SPIDER_MODULES = ['nongcha.spiders']
NEWSPIDER_MODULE = 'nongcha.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'nongcha (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

DEFAULT_REQUEST_HEADERS = {
    'Host': 'cha.191.cn',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# # Enable the spider middleware here if needed
# SPIDER_MIDDLEWARES = {
#    'nongcha.middlewares.NongChaMiddleware': 100,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'nongcha.middlewares.NongChaMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'nongcha.pipelines.CustomImagePipeline': 200,
    'nongcha.pipelines.AsyncWriteMysql': 250,
    'nongcha.pipelines.ExcelPipeline': 300,
}
# Item field that holds the image URLs
IMAGES_URLS_FIELD = 'nc_img'
# Directory where images are saved
IMAGES_STORE = 'images'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

↓↓↓ Stored results ↓↓↓

# Dichlorvos image data: basically all the information is there
(screenshot omitted)
# Excel sheet data
(screenshot omitted)
# MySQL database
(screenshot omitted)
# That's it! Feel free to dig into this together; now back to brushing up on TypeScript~


Reposted from blog.csdn.net/bosslay/article/details/89115344