Python Notes (Django, Flask, and Scrapy Signals; scrapy-redis)

I. Signals

  1. Django
    models.py

    from django.db import models
    
    class User(models.Model):
        title = models.CharField(max_length=32)
    

    views.py

    from django.shortcuts import render, HttpResponse
    from app01 import models
    
    def func1(request):
        # models.User.objects.create(title='oldboy')
        return HttpResponse('created successfully')
    
    def func2(request):
        # every .create() fires pre_save and then post_save
        models.User.objects.create(title='little boy')
        return HttpResponse('created successfully')
    
    def func3(request):
        models.User.objects.create(title='teenager')
        return HttpResponse('created successfully')
    
    def func4(request):
        models.User.objects.create(title='young man')
        return HttpResponse('created successfully')
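
    urls.py (a minimal sketch; the routes below are assumptions, any URL pattern that reaches these views works)

    from django.urls import path
    from app01 import views
    
    urlpatterns = [
        path('func1/', views.func1),
        path('func2/', views.func2),
        path('func3/', views.func3),
        path('func4/', views.func4),
    ]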
    

    __init__.py (in the project package, i.e. the directory that also contains settings.py)

    from django.db.models import signals
    
    def before_save1(*args, **kwargs):
        print('pre_save fired (receiver 1) ->', args, kwargs)
    
    def before_save2(*args, **kwargs):
        print('pre_save fired (receiver 2) ->', args, kwargs)
    
    def after_save1(*args, **kwargs):
        print('post_save fired ->', args, kwargs)
    
    # connecting here runs once at startup, so every model save
    # anywhere in the project triggers these receivers
    signals.pre_save.connect(before_save1)
    signals.pre_save.connect(before_save2)
    signals.post_save.connect(after_save1)
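
    The receivers above fire for every model in the project. To narrow a receiver to a single model, Django's @receiver decorator accepts a sender argument; a minimal sketch (the handler name is an assumption, and Django's docs recommend registering such receivers in an AppConfig.ready() hook rather than at import time):

    from django.db.models.signals import pre_save
    from django.dispatch import receiver
    from app01 import models
    
    @receiver(pre_save, sender=models.User)  # fires only when a User is saved
    def user_pre_save(sender, instance, **kwargs):
        # hypothetical handler: 'instance' is the not-yet-saved User object
        print('about to save User ->', instance.title)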
    
  2. Flask
    app.py

    from flask import Flask, render_template
    from flask import signals
    
    app = Flask(__name__)
    
    def x1(arg):
        # 'arg' is the sender, i.e. the Flask app instance
        print('x1')
    def x2(arg):
        print('x2')
    # both receivers fire at the start of every request
    signals.request_started.connect(x1)
    signals.request_started.connect(x2)
    
    
    # @app.before_request
    # def bf():
    #     print('bbbbb')
    #     return render_template("asdfadf")
    
    
    @app.route('/index')
    def func():
        print('view function')
        return "asdfasdf"
    
    
    if __name__ == '__main__':
        # app.__call__
        app.run()
        # with app.app_context():
        #     pass
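
    Flask's built-in signals (request_started, request_finished, got_request_exception, ...) are powered by the blinker library, which can also define custom signals. A minimal sketch, assuming blinker is installed (Flask relies on it for signal support); the signal name and handler are invented for illustration:

    from blinker import Namespace
    
    my_signals = Namespace()
    user_created = my_signals.signal('user-created')  # hypothetical custom signal
    
    def on_user_created(sender, **extra):
        print('user created ->', extra.get('username'))
    
    user_created.connect(on_user_created)
    
    # a view would then fire it with:
    # user_created.send(app, username='alice')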
    
  3. Scrapy

    ext.py (in the same directory as settings.py)

    from scrapy import signals
    
    class MyExtend(object):
        def __init__(self):
            pass
    
        @classmethod
        def from_crawler(cls, crawler):
            self = cls()
    
            # register handlers for the spider_opened / spider_closed signals
            crawler.signals.connect(self.x1, signal=signals.spider_opened)
            crawler.signals.connect(self.x2, signal=signals.spider_closed)
    
            return self
    
        def x1(self, spider):
            print('open')
    
        def x2(self, spider):
            print('close')
    

    Configuration (settings.py):

        EXTENSIONS = {
            'xdb.ext.MyExtend': 666,   # the value sets the extension's load order
        }
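
    The same signals object exposes many more hooks (engine_started, item_scraped, request_scheduled, ...). A minimal sketch, following the same extension pattern, of listening for item_scraped; the class and method names are assumptions, and it would be registered via EXTENSIONS just like MyExtend above:

        from scrapy import signals
        
        class ItemCounter(object):
            @classmethod
            def from_crawler(cls, crawler):
                self = cls()
                # item_scraped fires once for each item the spider yields
                crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
                return self
        
            def item_scraped(self, item, response, spider):
                print('scraped an item from', spider.name)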
    

II. scrapy-redis
Deduplication rules based on scrapy-redis

  1. Fully custom

    from scrapy.dupefilters import BaseDupeFilter
    from scrapy.utils.request import request_fingerprint
    import redis
    
    class DupFilter(BaseDupeFilter):
        def __init__(self):
            self.conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
    
        def request_seen(self, request):
            """
            Check whether the current request has already been visited.
            :param request:
            :return: True if already visited; False if not yet visited
            """
            fid = request_fingerprint(request)
            # sadd returns 1 when the fingerprint is new, 0 when it already exists
            result = self.conn.sadd('visited_urls', fid)
            if result == 1:
                return False
            return True
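
    To make Scrapy use this filter, point DUPEFILTER_CLASS at it in settings.py; the module path below is an assumption about where the class above is saved:

        DUPEFILTER_CLASS = 'xdb.dupefilters.DupFilter'  # hypothetical path to the class above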
    
  2. Subclassing scrapy-redis for customization

     from scrapy_redis.dupefilter import RFPDupeFilter
     from scrapy_redis.connection import get_redis_from_settings
     from scrapy_redis import defaults
     
     class RedisDupeFilter(RFPDupeFilter):
         @classmethod
         def from_settings(cls, settings):
             """Returns an instance from given settings.
     
             This uses by default the key ``dupefilter:<timestamp>``. When using the
             ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
             it needs to pass the spider name in the key.
     
             Parameters
             ----------
             settings : scrapy.settings.Settings
     
             Returns
             -------
             RFPDupeFilter
                 A RFPDupeFilter instance.
             """
             server = get_redis_from_settings(settings)
             # XXX: This creates one-time key. needed to support to use this
             # class as standalone dupefilter with scrapy's default scheduler
             # if scrapy passes spider on open() method this wouldn't be needed
             # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
             key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
             debug = settings.getbool('DUPEFILTER_DEBUG')
             return cls(server, key=key, debug=debug)
    

    Configuration (settings.py):

     # ############### scrapy-redis connection ####################
     
     REDIS_HOST = '140.143.227.206'                      # host
     REDIS_PORT = 8888                                   # port
     REDIS_PARAMS = {'password': 'beta'}                 # redis connection parameters
                                                         # default: {'socket_timeout': 30, 'socket_connect_timeout': 30,
                                                         #           'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
     REDIS_ENCODING = "utf-8"                            # redis encoding (default: 'utf-8')
     # REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
     
     # ############### scrapy-redis dedup ####################
     
     DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
     
     # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
     DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'        # dotted path to the subclass above
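
     A quick way to confirm the filter is writing to Redis is to inspect the fingerprint set after a crawl; a sketch reusing the connection parameters above (the key matches the 'xiaodongbei' timestamp hard-coded in from_settings):

     import redis
     
     conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
     # DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'} -> 'dupefilter:xiaodongbei'
     print(conn.scard('dupefilter:xiaodongbei'))  # number of stored request fingerprints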
    


Reposted from blog.csdn.net/qq_41433183/article/details/89944424