1 原理
布隆过滤器的原理及数学推导可以参考:
https://www.cnblogs.com/xiaohuiduan/p/11488020.html
2 Python demo 实现
from bitarray import bitarray
# 3rd party
import mmh3
class BloomFilter(set):
    """Small Bloom filter demo backed by a ``bitarray``.

    Every item is hashed ``hash_count`` times with ``mmh3.hash``, using the
    loop counter as the second argument, and each hash is reduced modulo
    ``size`` to pick a bit position.

    Note that ``len()`` reports the number of bits (not the number of items
    added) and iteration yields the raw bits.
    """

    def __init__(self, size, hash_count):
        """Create a filter of ``size`` bits probed by ``hash_count`` hashes."""
        super().__init__()
        bits = bitarray(size)
        bits.setall(0)
        self.bit_array = bits
        self.size = size
        self.hash_count = hash_count

    def __len__(self):
        """Return the configured number of bits."""
        return self.size

    def __iter__(self):
        """Iterate over the underlying bits."""
        return iter(self.bit_array)

    def _positions(self, item):
        """Lazily yield the bit position selected by each hash for ``item``."""
        for seed in range(self.hash_count):
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        """Set every bit selected for ``item`` and return the filter itself."""
        for position in self._positions(item):
            self.bit_array[position] = 1
        return self

    def __contains__(self, item):
        """Return True only when no bit selected for ``item`` is zero."""
        return all(
            not (self.bit_array[position] == 0)
            for position in self._positions(item)
        )
def main():
    """Fill a 100-bit, 10-hash filter with animals and probe a second list."""
    bloom = BloomFilter(100, 10)
    for known in ('dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse'):
        bloom.add(known)

    hit_template = '{} is in the bloom'
    miss_template = '{} is not in the bloom filter as expected'
    for candidate in ('badger', 'cow', 'dog', 'sheep', 'bee', 'wolf'):
        # Only 'dog' was inserted above; any other hit is a false positive.
        template = hit_template if candidate in bloom else miss_template
        print(template.format(candidate))
# Run the demo right away.
# NOTE(review): there is no ``if __name__ == "__main__"`` guard here, so this
# also executes whenever the module is imported.
main()
结果:
针对时间戳自定义的 BloomFilter(以小时为粒度:第 n 位为 1 表示起始时间 startTime 之后的第 n 个小时已被记录):
import time
import BitVector
import os
import sys
import datetime
# Default epoch as a Unix timestamp: 1546272000 is 2018-12-31 16:00:00 UTC,
# i.e. 2019-01-01 00:00:00 in UTC+8.
startTime = 1546272000


class BloomFilter:
    """Hour-granularity bitmap of timestamps stored in a ``BitVector``.

    Bit ``n`` being 1 means that the hour ``startTime + n * 3600`` has been
    recorded. Despite the name there is no hashing into shared slots: every
    hour owns exactly one bit.
    """

    # Size used when no bit vector is supplied: 100 years of hourly slots,
    # the same value as the default of NewSizeBloomFilter.
    DEFAULT_BIT_SIZE = 24 * 365 * 100

    def __init__(self, bitset: "BitVector.BitVector | None" = None):
        """Wrap ``bitset``, or allocate an empty vector when it is None."""
        if bitset is None:
            # The original referenced an undefined name (BIT_SIZE) here.
            bitset = BitVector.BitVector(size=self.DEFAULT_BIT_SIZE)
        self.bitSet = bitset

    def _resized(self, new_size: int):
        """Return a new vector of ``new_size`` bits with the same bits set."""
        resized = BitVector.BitVector(size=new_size)
        for index in self._toarray():
            resized[index] = 1
        return resized

    def add(self, value: int):
        """Set the bit at index ``value``, growing the vector when needed.

        :param value: non-negative bit index (typically from :meth:`hash`)
        :raises ValueError: if ``value`` is negative; a negative index would
            otherwise wrap around and silently mark an unrelated hour
        """
        if value < 0:
            raise ValueError("bit index must be non-negative, got %r" % (value,))
        current = self.bitSet.length()
        if value >= current:
            # Double until the index fits. Start from 1 so that an empty
            # vector cannot get stuck at size 0 forever.
            new_size = max(current, 1)
            while new_size <= value:
                new_size *= 2
            self.bitSet = self._resized(new_size)
        self.bitSet[value] = 1

    def hash(self, value: datetime.datetime):
        """Convert a datetime to its hour index relative to ``startTime``.

        ``value`` is interpreted by ``time.mktime`` (local time). The result
        is negative for datetimes earlier than ``startTime``.

        :param value: the datetime to convert
        :return: whole hours elapsed since ``startTime`` (floored)
        """
        seconds = int(time.mktime(value.timetuple())) - startTime
        # Floor division keeps times just before the epoch out of slot 0.
        return seconds // 3600

    def contains(self, value: int):
        """Report whether the bit at index ``value`` is set.

        :param value: bit index (typically from :meth:`hash`)
        :return: True if the bit is 1; False if it is 0 or if ``value`` lies
            outside the vector (such an index was never added)
        """
        if value < 0 or value >= self.bitSet.length():
            return False
        return self.bitSet[value] == 1

    def _toarray(self):
        """Return the sorted indexes of all bits that are set to 1.

        ``BitVector.runs()`` returns the vector as consecutive strings of
        identical bits (e.g. ``['00', '111', '0']``), so every position
        inside a run of ones has to be reported, not only one-bit runs.
        """
        indexes = []
        position = 0
        for run in self.bitSet.runs():
            if run.startswith("1"):
                indexes.extend(range(position, position + len(run)))
            position += len(run)
        return indexes

    def string(self):
        """Serialize the bit vector to its ASCII text representation.

        ``get_bitvector_in_ascii`` requires the size to be a multiple of 8,
        so a zero-padded copy is serialized when it is not. Padding only
        appends cleared bits and therefore does not change which indexes are
        set.

        :return: the text form accepted by ``BitVector(textstring=...)``
        """
        bits = self.bitSet
        remainder = bits.length() % 8
        if remainder:
            bits = self._resized(bits.length() + 8 - remainder)
        return bits.get_bitvector_in_ascii()

    def toDate(self):
        """Convert every set bit back to the datetime of its hour.

        :return: list of local ``datetime`` objects, in ascending order
        """
        return [
            datetime.datetime.fromtimestamp(index * 3600 + startTime)
            for index in self._toarray()
        ]
def NewBitBloomFilter(bit):
    """Build a BloomFilter around an existing bit vector.

    :param bit: bit vector handed to the ``BloomFilter`` constructor as-is
    :return: the new ``BloomFilter``
    """
    bloom_filter = BloomFilter(bit)
    return bloom_filter
def NewStrBloomFilter(s: str):
    """Build a BloomFilter from the text form of a bit vector.

    :param s: text passed to ``BitVector(textstring=...)``, e.g. the value
        returned by ``BloomFilter.string()``
    :return: a ``BloomFilter`` wrapping the decoded bit vector
    """
    return NewBitBloomFilter(BitVector.BitVector(textstring=s))
def NewSizeBloomFilter(size: int = 24 * 365 * 100):
    """Build an empty BloomFilter with ``size`` bits.

    :param size: number of bits to allocate; the default, 24 * 365 * 100,
        is one bit per hour for 100 years of 365 days
    :return: a ``BloomFilter`` wrapping the freshly allocated vector
    """
    return NewBitBloomFilter(BitVector.BitVector(size=size))
if __name__ == '__main__':
    # Demo: start with only 10 bits so that add() has to grow the vector.
    current = datetime.datetime.now()
    day_ahead = current + datetime.timedelta(hours=24)

    hourly = NewSizeBloomFilter(10)
    hourly.add(hourly.hash(day_ahead))
    print(hourly.contains(hourly.hash(day_ahead)))

    # Record two more hours: one hour from now, and one about a year out.
    hourly.add(hourly.hash(current + datetime.timedelta(hours=1)))
    hourly.add(hourly.hash(current + datetime.timedelta(days=366, hours=1)))
    print(hourly.toDate())

    # Round-trip through the text form, then extend the restored copy.
    restored = NewStrBloomFilter(hourly.string())
    restored.add(restored.hash(current + datetime.timedelta(hours=11)))
    print(restored.toDate())