在爬虫中基于redis实现的几种队列

队列基类

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
#         http://binux.me
# Created on 2015-04-27 22:48:04

import time
import redis
# 进行序列化的一个工具 在此对其进行改写 使用 pickle 完成相应的工作
# import umsgpack
import pickle
from six.moves import queue as BaseQueue


class RedisQueue(object):
    """
    A FIFO message queue backed by a Redis list.

    Items are serialized with ``pickle`` before they are stored, so any
    picklable object can be queued.  The interface mirrors the standard
    library queue (``put``/``get``/``qsize``/``empty``/``full``) and reuses
    its ``Empty`` and ``Full`` exception classes.

    NOTE(security): ``get_nowait`` runs ``pickle.loads`` on bytes read from
    Redis.  Unpickling can execute arbitrary code, so only use this class
    with a Redis instance that untrusted parties cannot write to.
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    # Longest single sleep, in seconds, between retries of a blocking call.
    max_timeout = 0.3

    def __init__(self, name, host='localhost', port=6379, db=0,
                 maxsize=0, lazy_limit=True, password=None, cluster_nodes=None):
        """
        Constructor for RedisQueue

        name:          Redis key under which the list is stored.
        host, port, db, password:
                       connection settings for a single Redis server; they
                       are not used when ``cluster_nodes`` is given.
        maxsize:       an integer that sets the upperbound limit on the
                       number of items that can be placed in the queue.
                       A falsy value (the default 0) means unlimited.
        lazy_limit:    redis queue is shared via instance, a lazy size limit
                       is used for better performance: the size cached from
                       the previous operation is trusted while it is below
                       ``maxsize``.
        cluster_nodes: startup nodes for a Redis cluster; when not None a
                       ``StrictRedisCluster`` client is created instead.
        """
        self.name = name
        if cluster_nodes is not None:
            # Imported lazily so that rediscluster is only required when a
            # cluster is actually used.
            from rediscluster import StrictRedisCluster
            self.redis = StrictRedisCluster(startup_nodes=cluster_nodes)
        else:
            self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
        self.maxsize = maxsize
        self.lazy_limit = lazy_limit
        # Size observed by the most recent qsize()/put_nowait() call.
        self.last_qsize = 0

    def qsize(self):
        """
        Return the current length of the queue and cache it in
        ``last_qsize``.
        """
        self.last_qsize = self.redis.llen(self.name)
        return self.last_qsize

    def empty(self):
        """
        Return True when the queue currently holds no items.
        """
        return self.qsize() == 0

    def full(self):
        """
        Return True when ``maxsize`` is set and the queue has reached it.

        The size is only queried from Redis when ``maxsize`` is truthy.
        """
        return bool(self.maxsize and self.qsize() >= self.maxsize)

    def put_nowait(self, obj):
        """
        Append ``obj`` to the queue without waiting.

        :param obj: any picklable object
        :return: True on success
        :raises Full: if the queue has reached ``maxsize``
        """
        # With lazy_limit the cached size is trusted while it is below
        # maxsize, which saves one round trip; otherwise ask Redis.
        trust_cached_size = self.lazy_limit and self.last_qsize < self.maxsize
        if not trust_cached_size and self.full():
            raise self.Full
        # RPUSH appends at the tail (right) and get_nowait() pops from the
        # head (left) with LPOP, which gives first-in, first-out order.
        # RPUSH returns the new list length, so the cache stays current.
        self.last_qsize = self.redis.rpush(self.name, pickle.dumps(obj))
        return True

    def _wait_for_retry(self, start_time, timeout):
        """
        Sleep before the next attempt of a blocking put()/get().

        :param start_time: ``time.time()`` value taken when the call started
        :param timeout: total seconds allowed, or None to wait indefinitely
        :return: False when the timeout has already elapsed (the caller
                 should give up), True after sleeping
        """
        if timeout is None:
            time.sleep(self.max_timeout)
            return True
        remaining = timeout - (time.time() - start_time)
        if remaining <= 0:
            return False
        time.sleep(min(self.max_timeout, remaining))
        return True

    def put(self, obj, block=True, timeout=None):
        """
        Append ``obj`` to the queue, waiting for free space if necessary.

        :param obj: any picklable object
        :param block: when False, behave exactly like ``put_nowait``
        :param timeout: maximum number of seconds to wait; None waits
                        indefinitely, 0 makes a single attempt
        :return: True on success
        :raises Full: if no space became available in time
        """
        if not block:
            return self.put_nowait(obj)

        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except self.Full:
                # `timeout is not None` (rather than truthiness) so that
                # timeout=0 fails fast instead of blocking forever.
                if not self._wait_for_retry(start_time, timeout):
                    raise

    def get_nowait(self):
        """
        Remove and return the oldest item without waiting.

        :return: the unpickled item
        :raises Empty: if the queue holds no items
        """
        # LPOP takes from the head (left), the opposite end from RPUSH.
        ret = self.redis.lpop(self.name)
        if ret is None:
            raise self.Empty
        # See the class docstring: only safe with a trusted Redis instance.
        return pickle.loads(ret)

    def get(self, block=True, timeout=None):
        """
        Remove and return an item, waiting for one if necessary.

        :param block: when False, behave exactly like ``get_nowait``
        :param timeout: maximum number of seconds to wait; None waits
                        indefinitely, 0 makes a single attempt
        :return: the unpickled item
        :raises Empty: if no item became available in time
        """
        if not block:
            return self.get_nowait()

        start_time = time.time()
        while True:
            try:
                return self.get_nowait()
            except self.Empty:
                if not self._wait_for_retry(start_time, timeout):
                    raise

# Public alias: other modules import this class under the name ``Queue``.
Queue = RedisQueue


if __name__ == "__main__":
    # Smoke test against a live Redis server: enqueue 0..99, then print
    # them back in the order they are dequeued.
    q = Queue(name="test_queue", host="192.168.0.101")
    for number in range(100):
        q.put(number)

    for _ in range(100):
        print(q.get())

先进先出队列

# 先进先出的队列
from redis_queue.base import Queue


class FifoRedisQueue(Queue):
    """
    First-in, first-out queue.

    Nothing is overridden: every method is inherited from ``Queue``
    unchanged, so subclassing is all that is required.
    """


# Demo against a live Redis server: enqueue 0..99, then print each item
# as it is dequeued.
q = FifoRedisQueue(name="test_queue", host="192.168.0.101")
for number in range(100):
    q.put(number)

for _ in range(100):
    print(q.get())

后进先出队列

import pickle

from redis_queue.base import Queue


class LifoReidsQueue(Queue):
    """
    Last-in, first-out queue (a stack).

    Only ``get_nowait`` is overridden; ``put_nowait`` is inherited from
    ``Queue`` as-is.
    """

    def get_nowait(self):
        """
        Remove and return an item from the right end of the list without
        waiting.

        :return: the unpickled item
        :raises Empty: if RPOP returns None (nothing to pop)
        """
        # RPOP takes from the right end of the list.
        # NOTE(review): LIFO order relies on the inherited put_nowait
        # pushing onto that same right end - confirm against the base class.
        raw = self.redis.rpop(self.name)
        if raw is not None:
            return pickle.loads(raw)
        raise self.Empty


# Demo against a live Redis server: enqueue 0..99, then print each item
# as it is dequeued.
q = LifoReidsQueue(name="test_queue", host="192.168.0.101")
for number in range(100):
    q.put(number)

for _ in range(100):
    print(q.get())

优先级队列

# 实现优先级队列
import pickle
import time

from redis_queue.base import Queue


class PriorityRedisQueue(Queue):
    """
    Priority queue stored in a Redis sorted set.

    ``put`` takes a ``(score, value)`` tuple; ``get`` returns the value
    with the highest score.  Values are pickled and used as the sorted-set
    member, so putting a value whose pickled bytes are already in the set
    does not add a second entry - Redis only updates that member's score.

    NOTE(security): ``get_nowait`` runs ``pickle.loads`` on bytes read from
    Redis; only use this with a Redis instance that untrusted parties
    cannot write to.
    """

    def qsize(self):
        """
        Return the number of queued items and cache it in ``last_qsize``.

        Overridden because the key holds a sorted set: its size comes from
        ZCARD, whereas LLEN only works on lists.
        """
        self.last_qsize = self.redis.zcard(self.name)
        return self.last_qsize

    def get_nowait(self):
        """
        Remove and return the value with the highest score without waiting.

        ``zrange(name, -1, -1)`` selects the member with the highest score;
        ``zrange(name, 0, 0)`` would select the lowest instead.

        :return: the unpickled value
        :raises Empty: if the sorted set holds no members
        """
        while True:
            top = self.redis.zrange(self.name, -1, -1)
            # zrange returns an empty list (not None) when nothing matches.
            if not top:
                raise self.Empty
            member = top[0]
            # ZRANGE followed by ZREM is not atomic, so another consumer
            # may have read the same member.  ZREM reports how many members
            # it removed: only the consumer that really removed it returns
            # the item, everyone else retries with the next candidate.
            if self.redis.zrem(self.name, member):
                return pickle.loads(member)

    def put_nowait(self, obj):
        """
        Add an item to the priority queue without waiting.

        :param obj: a tuple ``(score, value)``; ``value`` must be picklable
        :return: True on success
        :raises Full: if the queue has reached ``maxsize``
        """
        # With lazy_limit the cached size is trusted while it is below
        # maxsize; otherwise the real size is checked.
        trust_cached_size = self.lazy_limit and self.last_qsize < self.maxsize
        if not trust_cached_size and self.full():
            raise self.Full
        score, value = obj[0], obj[1]
        self.redis.zadd(self.name, {pickle.dumps(value): score})
        # ZADD returns the number of members added (0 or 1), not the size
        # of the set, so refresh the cached size explicitly.
        self.last_qsize = self.redis.zcard(self.name)
        return True


# Demo against a live Redis server: enqueue three (score, value) pairs,
# then dequeue and print one item per pair that was queued.
q = PriorityRedisQueue(name="test_queue2", host="192.168.0.101")
datas = [(99, "kailun"), (100, "ruiyang"), (101, "rm")]
for entry in datas:
    q.put(entry)

for _ in range(len(datas)):
    print(q.get())

后记

更新时间: 2020-02-04

发布了291 篇原创文章 · 获赞 104 · 访问量 41万+

猜你喜欢

转载自blog.csdn.net/Enjolras_fuu/article/details/104175419