#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Software : PyCharm
# @CreateTime: 2020-05-31 22:49
# @Author : https://www.bilibili.com/video/BV1P7411E7az
# @File : bloomfilter
import hashlib
import redis
class MultipleHash(object):
    """Produce several hash values for one input by salting one hash function.

    Every salt is fed to the hash object right after the input data, so
    ``len(salts)`` salts yield ``len(salts)`` distinct hash values.
    """

    def __init__(self, salts, hash_func_name='md5'):
        """Resolve the hash constructor and store the salts.

        :param salts: sized collection of at least three ``str``/``bytes`` salts
        :param hash_func_name: attribute name of a constructor in ``hashlib``
        :raises AttributeError: if ``hashlib`` has no attribute of that name
        :raises ValueError: if fewer than three salts are supplied
        """
        self.hash_func = getattr(hashlib, hash_func_name)
        if len(salts) < 3:
            # ValueError is still an Exception, so existing broad handlers
            # keep working while new callers can catch something specific.
            raise ValueError("請提供至少三個salts")
        self.salts = salts

    def get_hash_values(self, data):
        """Return one integer hash value per salt for ``data``.

        :param data: ``str`` or ``bytes`` payload to hash
        :return: list of ints, in the same order as ``self.salts``
        :raises TypeError: if ``data`` or a salt is neither ``str`` nor ``bytes``
        """
        # The payload is identical for every salt, so convert it only once.
        payload = self._safe_data(data)
        hash_values = []
        for salt in self.salts:
            hash_object = self.hash_func()
            hash_object.update(payload)  # update() requires bytes
            hash_object.update(self._safe_data(salt))
            # Interpret the hexadecimal digest as one big integer.
            hash_values.append(int(hash_object.hexdigest(), 16))
        return hash_values

    def _safe_data(self, data):
        """Return ``data`` as ``bytes``.

        :param data: the raw value, either ``bytes`` or ``str``
        :return: ``data`` unchanged if it is ``bytes``, otherwise the result
            of ``data.encode()``
        :raises TypeError: if ``data`` is neither ``bytes`` nor ``str``
        """
        if isinstance(data, bytes):
            return data
        if isinstance(data, str):
            return data.encode()
        raise TypeError('請提供str類型')
class BloomFilter(object):
    """Bloom filter whose bit array is stored in a single Redis string key.

    ``save`` sets one bit per hash value of the item; ``is_exist`` reports
    ``True`` only when every one of those bits is set. As with any Bloom
    filter, ``True`` may be a false positive, while ``False`` is definitive.
    """

    # A Redis string value holds at most 512 MB, i.e. 512 * 1024 * 1024 * 8
    # bits, so bit offsets must stay below 2 ** 32.
    MAX_BIT_SIZE = 2 ** 32

    # Class-level default so _get_offset still works on instances whose
    # __init__ was bypassed or overridden without calling this one.
    bit_size = MAX_BIT_SIZE

    def __init__(self, salts, redis_host='localhost', redis_port=6379, redis_db=0,
                 redis_key='bloomfilter', bit_size=MAX_BIT_SIZE):
        """Create the Redis client and the salted hash generator.

        :param salts: salts passed on to ``MultipleHash`` (one hash per salt)
        :param redis_host: Redis server host
        :param redis_port: Redis server port
        :param redis_db: Redis database index
        :param redis_key: key of the Redis string used as the bit array
        :param bit_size: number of bits in the filter, an int between 1 and
            ``MAX_BIT_SIZE``; the default keeps the original 2 ** 32
        :raises ValueError: if ``bit_size`` is not an int in the allowed range
        """
        if not isinstance(bit_size, int) or not 0 < bit_size <= self.MAX_BIT_SIZE:
            raise ValueError(
                'bit_size must be an int between 1 and %d' % self.MAX_BIT_SIZE)
        self.bit_size = bit_size
        self.redis_host = redis_host
        self.redis_port = redis_port
        self.redis_db = redis_db
        self.redis_key = redis_key
        self.client = self._get_redis_client()
        self.multiple_hash = MultipleHash(salts)

    def _get_redis_client(self):
        """Return a Redis client backed by a connection pool.

        :return: ``redis.Redis`` built from the configured host, port and db
        """
        pool = redis.ConnectionPool(host=self.redis_host,
                                    port=self.redis_port,
                                    db=self.redis_db)
        return redis.Redis(connection_pool=pool)

    def save(self, data):
        """Record ``data`` by setting the bit for each of its hash values.

        :param data: ``str`` or ``bytes`` item to add
        :return: always ``True``
        """
        for hash_value in self.multiple_hash.get_hash_values(data):
            self.client.setbit(self.redis_key, self._get_offset(hash_value), 1)
        return True

    def is_exist(self, data):
        """Report whether ``data`` may already have been saved.

        :param data: ``str`` or ``bytes`` item to look up
        :return: ``False`` as soon as one of the item's bits is 0 (remaining
            bits are not queried), ``True`` if every bit is set
        """
        return all(
            self.client.getbit(self.redis_key, self._get_offset(hash_value)) != 0
            for hash_value in self.multiple_hash.get_hash_values(data)
        )

    def _get_offset(self, hash_value):
        """Map a hash value to a bit offset inside the filter.

        :param hash_value: non-negative integer hash value
        :return: ``hash_value`` reduced modulo ``self.bit_size``
        """
        # NOTE(review): per the Redis SETBIT documentation the string grows
        # to cover the highest offset written, so with the default size the
        # key can reach 512 MB -- pass a smaller bit_size if that is too much.
        return hash_value % self.bit_size
if __name__ == '__main__':
    # Demo run: push a few strings (with deliberate repeats) through the
    # filter and print whether each one was newly recorded or seen before.
    samples = ['asadsasfa', '123', '123', '456', 'ewq', 'zxczc', 'ewq']
    bloom = BloomFilter(salts=['1', '2', '3', '4'])
    for item in samples:
        if bloom.is_exist(item):
            print("發現重複數據:", item)
            continue
        bloom.save(item)
        print("映射數據成功:", item)
python-redis-布隆算法實現
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.