数据结构和算法布隆过滤器示例

一、布隆过滤器概述

布隆过滤器是一种数据结构，旨在快速且高效地告诉您某个元素是否存在于集合中。这种效率的代价是布隆过滤器是一种概率数据结构：它告诉我们元素要么肯定不在集合中，要么可能在集合中。

假设您正在github上创建一个帐户，您想输入一个很酷的用户名，您输入它并收到一条消息，“用户名已被占用”。您在用户名中添加了您的出生日期，仍然被占用。现在你也添加了你的部分身份证号，仍然得到“用户名已经被占用”。这真的很令人沮丧，不是吗？

但是你有没有想过github通过搜索数百万注册的用户名来检查用户名的可用性有多快。有很多方法可以完成这项工作

Linear search：这是最差的办法。

Binary Search：按字母顺序存储所有用户名，并将输入的用户名与列表中的中间一个进行比较，如果匹配，则取用户名，否则计算输入的用户名是在中间名之前还是之后，如果在之后，忽略所有用户名在中间（含）之前。现在搜索中间一个并重复这个过程，直到你得到一个匹配或没有匹配的搜索结束。这种技术更好，但仍然需要很多时间。

所以，必须要找到更好的办法！

Bloom Filter是一种可以完成这项工作的数据结构。

要了解布隆过滤器，您必须知道什么是散列。哈希函数接受输入并输出一个固定长度的唯一标识符，用于识别输入。
数据结构和算法八、散列表大多数编程语言都自带散列表这种能够快速读取的数据结构。但在不同的语言中，它有不同的名字，除了散列表，还有散列、映射、散列映射、字典、哈希表、关联数组。散列表是最重要的数据结构之一，它使用一种称为散列函数的特殊函数，该函数将给定值与键映射以更快地访问元素。散列表是一种存储一些信息的数据结构，信息基本上有两个主要组成部分，即键和值。散列表可以在关联数组的帮助下实现。映射的效率取决于用于映射的散列函数的效率。https://skydance.blog.csdn.net/article/details/123410798 布隆过滤器通过快速散列函数运行项目并从该散列中采样位并在位域中以特定间隔将它们从 0 设置为 1 来工作。为了检查布隆过滤器中是否存在，对相同的位进行采样。许多项目可能有重叠的位，但是由于散列函数产生唯一标识符，如果散列中的单个位仍然是 0，那么我们知道它之前没有被添加。

二、使用c++实现布隆过滤器

布隆过滤器是一种数据结构，可以有效地确定一个元素是否可能是集合的成员或绝对不是集合的成员。下面将介绍 C++ 布隆过滤器的简单实现。

这是具有 4 个样本散列函数（k = 4）的样本布隆过滤器的实现，位数组的大小为 100。

#include <bits/stdc++.h>
#define ll long long
using namespace std;

// hash 1
int h1(string s, int arrSize)
{
	ll int hash = 0;
	for (int i = 0; i < s.size(); i++)
	{
		hash = (hash + ((int)s[i]));
		hash = hash % arrSize;
	}
	return hash;
}

// hash 2
int h2(string s, int arrSize)
{
	ll int hash = 1;
	for (int i = 0; i < s.size(); i++)
	{
		hash = hash + pow(19, i) * s[i];
		hash = hash % arrSize;
	}
	return hash % arrSize;
}

// hash 3
int h3(string s, int arrSize)
{
	ll int hash = 7;
	for (int i = 0; i < s.size(); i++)
	{
		hash = (hash * 31 + s[i]) % arrSize;
	}
	return hash % arrSize;
}

// hash 4
int h4(string s, int arrSize)
{
	ll int hash = 3;
	int p = 7;
	for (int i = 0; i < s.size(); i++) {
		hash += hash * 7 + s[0] * pow(p, i);
		hash = hash % arrSize;
	}
	return hash;
}


// 查找操作
bool lookup(bool* bitarray, int arrSize, string s)
{
	int a = h1(s, arrSize);
	int b = h2(s, arrSize);
	int c = h3(s, arrSize);
	int d = h4(s, arrSize);

	if (bitarray[a] && bitarray[b] && bitarray
		&& bitarray[d])
		return true;
	else
		return false;
}

// 插入操作
void insert(bool* bitarray, int arrSize, string s)
{
	// check if the element in already present or not
	if (lookup(bitarray, arrSize, s))
		cout << s << " is Probably already present" << endl;
	else
	{
		int a = h1(s, arrSize);
		int b = h2(s, arrSize);
		int c = h3(s, arrSize);
		int d = h4(s, arrSize);

		bitarray[a] = true;
		bitarray[b] = true;
		bitarray = true;
		bitarray[d] = true;

		cout << s << " inserted" << endl;
	}
}

// Driver Code
int main()
{
	bool bitarray[100] = { false };
	int arrSize = 100;
	string sarray[33]
		= { "abound", "abounds",	 "abundance",
			"abundant", "accessible", "bloom",
			"blossom", "bolster",	 "bonny",
			"bonus", "bonuses",	 "coherent",
			"cohesive", "colorful",	 "comely",
			"comfort", "gems",		 "generosity",
			"generous", "generously", "genial",
			"bluff", "cheater",	 "hate",
			"war",	 "humanity",	 "racism",
			"hurt",	 "nuke",		 "gloomy",
			"facebook", "geeksforgeeks", "twitter" };
	for (int i = 0; i < 33; i++) {
		insert(bitarray, arrSize, sarray[i]);
	}
	return 0;
}

三、使用python实现布隆过滤器

python实现代码

# Python 3 program to build Bloom Filter
# Install mmh3 and bitarray 3rd party module first
# pip install mmh3
# pip install bitarray
import math
import mmh3
from bitarray import bitarray


class BloomFilter(object):

	'''
	Class for Bloom filter, using murmur3 hash function
	'''

	def __init__(self, items_count, fp_prob):
		'''
		items_count : int
			Number of items expected to be stored in bloom filter
		fp_prob : float
			False Positive probability in decimal
		'''
		# False possible probability in decimal
		self.fp_prob = fp_prob

		# Size of bit array to use
		self.size = self.get_size(items_count, fp_prob)

		# number of hash functions to use
		self.hash_count = self.get_hash_count(self.size, items_count)

		# Bit array of given size
		self.bit_array = bitarray(self.size)

		# initialize all bits as 0
		self.bit_array.setall(0)

	def add(self, item):
		'''
		Add an item in the filter
		'''
		digests = []
		for i in range(self.hash_count):

			# create digest for given item.
			# i work as seed to mmh3.hash() function
			# With different seed, digest created is different
			digest = mmh3.hash(item, i) % self.size
			digests.append(digest)

			# set the bit True in bit_array
			self.bit_array[digest] = True

	def check(self, item):
		'''
		Check for existence of an item in filter
		'''
		for i in range(self.hash_count):
			digest = mmh3.hash(item, i) % self.size
			if self.bit_array[digest] == False:

				# if any of bit is False then,its not present
				# in filter
				# else there is probability that it exist
				return False
		return True

	@classmethod
	def get_size(self, n, p):
		'''
		Return the size of bit array(m) to used using
		following formula
		m = -(n * lg(p)) / (lg(2)^2)
		n : int
			number of items expected to be stored in filter
		p : float
			False Positive probability in decimal
		'''
		m = -(n * math.log(p))/(math.log(2)**2)
		return int(m)

	@classmethod
	def get_hash_count(self, m, n):
		'''
		Return the hash function(k) to be used using
		following formula
		k = (m/n) * lg(2)

		m : int
			size of bit array
		n : int
			number of items expected to be stored in filter
		'''
		k = (m/n) * math.log(2)
		return int(k)

测试

from bloomfilter import BloomFilter
from random import shuffle

n = 20 #no of items to add
p = 0.05 #false positive probability

bloomf = BloomFilter(n,p)
print("Size of bit array:{}".format(bloomf.size))
print("False positive Probability:{}".format(bloomf.fp_prob))
print("Number of hash functions:{}".format(bloomf.hash_count))

# words to be added
word_present = ['abound','abounds','abundance','abundant','accessible',
				'bloom','blossom','bolster','bonny','bonus','bonuses',
				'coherent','cohesive','colorful','comely','comfort',
				'gems','generosity','generous','generously','genial']

# word not added
word_absent = ['bluff','cheater','hate','war','humanity',
			'racism','hurt','nuke','gloomy','facebook',
			'geeksforgeeks','twitter']

for item in word_present:
	bloomf.add(item)

shuffle(word_present)
shuffle(word_absent)

test_words = word_present[:10] + word_absent
shuffle(test_words)
for word in test_words:
	if bloomf.check(word):
		if word in word_absent:
			print("'{}' is a false positive!".format(word))
		else:
			print("'{}' is probably present!".format(word))
	else:
		print("'{}' is definitely not present!".format(word))