python-redis-布隆算法實現

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Software  : PyCharm
# @CreateTime: 2020-05-31 22:49
# @Author    : https://www.bilibili.com/video/BV1P7411E7az
# @File      : bloomfilter

import hashlib
import redis

class MultipleHash(object):
    """Derive several hash values from one input by salting one hash function.

    Each salt is appended to the input before hashing, so N salts act like
    N different hash functions built on the same ``hashlib`` algorithm.
    """

    def __init__(self, salts, hash_func_name='md5'):
        """Look up the hash constructor and remember the salts.

        :param salts: sized collection of at least three ``str``/``bytes`` salts
        :param hash_func_name: name of a ``hashlib`` constructor (default ``'md5'``)
        :raises AttributeError: if ``hashlib`` has no attribute ``hash_func_name``
        :raises ValueError: if fewer than three salts are supplied
        """
        self.hash_func = getattr(hashlib, hash_func_name)
        if len(salts) < 3:
            # ValueError is a subclass of Exception, so callers that catch
            # the previously raised bare Exception keep working.
            raise ValueError("請提供至少三個salts")
        self.salts = salts

    def get_hash_values(self, data):
        """Return one integer hash per salt for ``data``.

        :param data: ``str`` or ``bytes`` payload
        :return: list of ints in salt order; each is the hex digest of
                 ``hash(data + salt)`` parsed as a base-16 number
        :raises TypeError: if ``data`` or a salt is neither ``str`` nor ``bytes``
        """
        # Encode and hash the payload once. Every salt then extends a copy of
        # that state, which yields exactly the digest of data followed by salt.
        seeded = self.hash_func()
        seeded.update(self._safe_data(data))

        hash_values = []
        for salt in self.salts:
            salted = seeded.copy()
            salted.update(self._safe_data(salt))
            hash_values.append(int(salted.hexdigest(), 16))
        return hash_values

    def _safe_data(self, data):
        """Return ``data`` as ``bytes``, which is what hash objects accept.

        :param data: ``bytes`` (returned unchanged) or ``str`` (encoded with
                     the default ``str.encode()`` codec)
        :return: the payload as ``bytes``
        :raises TypeError: if ``data`` is neither ``bytes`` nor ``str``
        """
        if isinstance(data, bytes):
            return data
        if isinstance(data, str):
            return data.encode()
        # TypeError is a subclass of Exception; the message text is unchanged.
        raise TypeError('請提供str類型')


class BloomFilter(object):
    """Bloom filter whose bit array lives in a single Redis key."""

    def __init__(self, salts, redis_host='localhost', redis_port=6379, redis_db=0, redis_key='bloomfilter'):
        """Store the Redis settings, then create the client and the hasher.

        :param salts: salts forwarded to ``MultipleHash``
        :param redis_host: Redis server host
        :param redis_port: Redis server port
        :param redis_db: Redis database index
        :param redis_key: key under which the bitmap is stored
        """
        self.redis_host, self.redis_port, self.redis_db = redis_host, redis_port, redis_db
        self.redis_key = redis_key

        # The client is built from the attributes assigned above.
        self.client = self._get_redis_client()
        self.multiple_hash = MultipleHash(salts)

    def _get_redis_client(self):
        """Return a Redis client backed by a pool for the configured host, port and db."""
        return redis.Redis(
            connection_pool=redis.ConnectionPool(
                host=self.redis_host,
                port=self.redis_port,
                db=self.redis_db,
            )
        )

    def save(self, data):
        """Set to 1 the bit selected by each hash of ``data``; always returns ``True``."""
        for offset in map(self._get_offset, self.multiple_hash.get_hash_values(data)):
            self.client.setbit(self.redis_key, offset, 1)
        return True

    def is_exist(self, data):
        """Return ``True`` only if every bit selected by the hashes of ``data`` is set.

        Bits are read one at a time in hash order, and reading stops at the
        first bit that is 0.
        """
        return all(
            self.client.getbit(self.redis_key, self._get_offset(digest)) != 0
            for digest in self.multiple_hash.get_hash_values(data)
        )

    def _get_offset(self, hash_value):
        """Fold a hash value into the range of valid bit offsets."""
        # A Redis value is binary-safe and holds at most 512 MB, which is
        # 512 * 1024 * 1024 * 8 = 2 ** 32 addressable bits.
        return hash_value % (1 << 32)

if __name__ == '__main__':
    # Demo: run a list containing repeated items through the filter.
    # BloomFilter is created with its default connection settings
    # (localhost:6379, db 0, key 'bloomfilter').
    samples = ['asadsasfa', '123', '123', '456', 'ewq', 'zxczc', 'ewq']
    bloom = BloomFilter(salts=['1', '2', '3', '4'])
    for item in samples:
        if bloom.is_exist(item):
            print("發現重複數據:", item)
            continue
        # Not seen yet: record it, then report the successful mapping.
        bloom.save(item)
        print("映射數據成功:", item)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章