布隆过滤器原理及 Python 实现

1 原理

布隆过滤器的原理及数学推导可以参考:

https://www.cnblogs.com/xiaohuiduan/p/11488020.html

2 Python demo 实现

from bitarray import bitarray

# 3rd party
import mmh3

class BloomFilter(set):
    """Bloom filter backed by a fixed-size bit array and seeded hashing.

    Every item is hashed ``hash_count`` times with ``mmh3.hash``, using the
    round number as the second argument, and each hash is reduced modulo
    ``size`` to pick a bit. Membership tests can therefore report false
    positives but never false negatives.

    The class derives from ``set`` (kept for backward compatibility), but the
    inherited set storage is never populated: ``add``, ``__len__``,
    ``__iter__`` and ``__contains__`` all operate on ``bit_array`` instead.
    """

    def __init__(self, size, hash_count):
        """Create an empty filter.

        :param size: number of bits in the underlying bit array
        :param hash_count: number of hash rounds applied to every item
        """
        super().__init__()
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)
        self.size = size
        self.hash_count = hash_count

    def __len__(self):
        """Return the number of bits in the filter (not the number of items)."""
        return self.size

    def __iter__(self):
        """Iterate over the raw bits of the filter."""
        return iter(self.bit_array)

    def _indexes(self, item):
        """Lazily yield the bit positions that ``item`` maps to."""
        for seed in range(self.hash_count):
            # Python's modulo is non-negative for a positive ``size``, so a
            # negative hash value still lands inside the bit array.
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        """Set every bit that ``item`` maps to and return the filter itself."""
        for index in self._indexes(item):
            self.bit_array[index] = 1
        return self

    def __contains__(self, item):
        """Return True if every bit for ``item`` is set (possible false positive).

        ``all`` stops at the first cleared bit, so no further hashes are
        computed once the item is known to be absent.
        """
        return all(self.bit_array[index] for index in self._indexes(item))

def main():
    """Fill a demo filter with a few animals, then report membership of others."""
    bloom = BloomFilter(100, 10)
    for known in ('dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse'):
        bloom.add(known)

    hit_template = '{} is  in the bloom'
    miss_template = '{} is not in the bloom filter as expected'
    for candidate in ('badger', 'cow', 'dog', 'sheep', 'bee', 'wolf'):
        # Pick the message first, then print once.
        template = hit_template if candidate in bloom else miss_template
        print(template.format(candidate))
    


# Run the demo as soon as this snippet is executed.
main()

结果:

针对时间戳自定义的 BloomFilter:

import time
import BitVector
import os
import sys
import datetime

# Reference instant for hour offsets: 2019-01-01 00:00:00 in UTC+8,
# expressed as a Unix timestamp in seconds.
startTime = 1546272000


class BloomFilter:
    """Bit set that records which whole hours since ``startTime`` were seen.

    A datetime is turned into an hour offset with :meth:`hash`; that offset is
    used directly as the bit index, so there is one bit per hour and the
    vector grows on demand when a later hour is added.
    """

    # Size used when no bit vector is supplied: 100 years of hourly bits,
    # the same default NewSizeBloomFilter uses.
    DEFAULT_BIT_SIZE = 24 * 365 * 100

    def __init__(self, bitset: "BitVector.BitVector" = None):
        """Wrap ``bitset``, or create an empty vector of the default size.

        :param bitset: existing bit vector to use as storage, or None
        """
        if bitset is None:
            bitset = BitVector.BitVector(size=self.DEFAULT_BIT_SIZE)
        self.bitSet = bitset

    def add(self, value: int):
        """Set the bit at index ``value``, growing the vector if necessary.

        :param value: non-negative bit index (normally produced by ``hash``)
        :raises ValueError: if ``value`` is negative
        """
        if value < 0:
            # A negative offset means a time before startTime; reject it
            # explicitly instead of depending on how the underlying vector
            # treats negative positions.
            raise ValueError("bit index must be non-negative, got %r" % (value,))

        current_size = self.bitSet.length()
        if value >= current_size:
            # Keep doubling (starting from at least 1 so an empty vector can
            # grow) until the index fits, then copy the set bits only once.
            new_size = max(current_size, 1)
            while new_size <= value:
                new_size *= 2
            grown = BitVector.BitVector(size=new_size)
            for index in self._toarray():
                grown[index] = 1
            self.bitSet = grown
        self.bitSet[value] = 1

    def hash(self, value: datetime.datetime):
        """Convert a datetime into whole hours elapsed since ``startTime``.

        :param value: datetime converted through ``time.mktime`` on its
            time tuple
        :return: hour offset as an int; negative for times before
            ``startTime`` (floor division keeps those from collapsing onto
            hour 0)
        """
        seconds = int(time.mktime(value.timetuple()))
        return (seconds - startTime) // 3600

    def contains(self, value: int):
        """Report whether the bit at index ``value`` is set.

        :param value: bit index to look up
        :return: True if the bit is 1; False if it is 0 or the index lies
            outside the current vector (such an index was never added)
        """
        if value < 0 or value >= self.bitSet.length():
            return False
        return self.bitSet[value] == 1

    def _toarray(self):
        """Return the indexes of all set bits in ascending order.

        Walks the vector's runs of identical bits (strings of '0' or '1', as
        ``runs()`` is used here) and records every position inside each run
        of ones, not only runs of length one.
        """
        indexes = []
        position = 0
        for run in self.bitSet.runs():
            if run.startswith("1"):
                indexes.extend(range(position, position + len(run)))
            position += len(run)
        return indexes

    def string(self):
        """Return the vector serialized by ``get_bitvector_in_ascii``.

        NOTE(review): presumably the text form that NewStrBloomFilter feeds
        back through ``BitVector(textstring=...)``; confirm any length
        restrictions against the BitVector documentation.
        """
        return self.bitSet.get_bitvector_in_ascii()

    def toDate(self):
        """Convert every set bit back into the datetime of that hour.

        :return: list of ``datetime.datetime`` values, one per set bit, each
            built with ``fromtimestamp`` from the start of the hour
        """
        return [
            datetime.datetime.fromtimestamp(index * 3600 + startTime)
            for index in self._toarray()
        ]


def NewBitBloomFilter(bit):
    """Build a BloomFilter on top of an existing bit vector.

    :param bit: bit vector handed to the BloomFilter constructor
    :return: the new BloomFilter instance
    """
    bloom_filter = BloomFilter(bit)
    return bloom_filter


def NewStrBloomFilter(s: str):
    """Build a BloomFilter from a text string.

    :param s: string passed to ``BitVector`` as ``textstring``
    :return: BloomFilter wrapping the resulting bit vector
    """
    return NewBitBloomFilter(BitVector.BitVector(textstring=s))


def NewSizeBloomFilter(size: int = 24 * 365 * 100):
    """Build a BloomFilter over a fresh bit vector of the given length.

    :param size: number of bits; the default is 24 * 365 * 100
    :return: BloomFilter wrapping the new bit vector
    """
    return NewBitBloomFilter(BitVector.BitVector(size=size))


if __name__ == '__main__':
    # Demo: start with a tiny 10-bit filter and add hours relative to now.
    bf = NewSizeBloomFilter(10)
    now = datetime.datetime.now()

    tomorrow = now + datetime.timedelta(hours=24)
    bf.add(bf.hash(tomorrow))
    print(bf.contains(bf.hash(tomorrow)))

    for offset in (
        datetime.timedelta(hours=1),
        datetime.timedelta(days=366, hours=1),
    ):
        bf.add(bf.hash(now + offset))
    print(bf.toDate())

    # Rebuild a second filter from the first one's string form, then extend it.
    serialized = bf.string()
    bf2 = NewStrBloomFilter(serialized)
    bf2.add(bf2.hash(now + datetime.timedelta(hours=11)))
    print(bf2.toDate())
发布了123 篇原创文章 · 获赞 71 · 访问量 11万+

猜你喜欢

转载自blog.csdn.net/boke14122621/article/details/103615557