1.布隆過濾器原理
請參考以下兩篇文檔
SpringBoot+Redis布隆過濾器
布隆過濾器
2.Guava布隆過濾器
<!-- Guava library; the Bloom filter example in this section is built on it (version pinned to 21.0). -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>21.0</version>
</dependency>
/**
 * Demonstrates an in-memory Guava {@code BloomFilter}.
 *
 * <p>The program inserts {@link #INSERTIONS} random UUID strings into the filter, then issues
 * {@link #PROBES} membership queries, of which every {@link #KNOWN_STRIDE}-th one uses a value
 * that really was inserted. It finally prints how many known values were recognised and how
 * many unknown values were wrongly reported as present.
 */
public class App {

    /** Number of elements inserted into the filter; also used as its expected capacity. */
    private static final int INSERTIONS = 100000;

    /** Target false-positive probability handed to the filter. */
    private static final double FPP = 0.02;

    /** Total number of membership queries issued after the filter has been filled. */
    private static final int PROBES = 10000;

    /** Every KNOWN_STRIDE-th query uses a value that was really inserted. */
    private static final int KNOWN_STRIDE = 100;

    /**
     * Fills the filter, probes it and prints the statistics to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Filter over UTF-8 strings; the false-positive probability is passed explicitly (FPP),
        // so no library default applies here.
        BloomFilter<String> bf = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), INSERTIONS, FPP);
        // Ground truth: exact membership lookups for every inserted key.
        Set<String> sets = new HashSet<String>(INSERTIONS);
        // Same keys, index-addressable so that known keys can be picked for the probes.
        List<String> lists = new ArrayList<String>(INSERTIONS);

        // Insert random strings.
        for (int i = 0; i < INSERTIONS; i++) {
            String uuid = UUID.randomUUID().toString();
            bf.put(uuid);
            sets.add(uuid);
            lists.add(uuid);
        }

        // Derived probe counts, so the report below can never drift from the loop bounds.
        final int knownProbes = PROBES / KNOWN_STRIDE;
        final int unknownProbes = PROBES - knownProbes;

        int rightNum = 0;
        int wrongNum = 0;
        for (int i = 0; i < PROBES; i++) {
            // Multiples of KNOWN_STRIDE probe an inserted key; all other probes use a fresh UUID.
            String data = i % KNOWN_STRIDE == 0 ? lists.get(i / KNOWN_STRIDE) : UUID.randomUUID().toString();
            // mightContain() can report false positives, so a hit is confirmed against the set.
            if (!bf.mightContain(data)) {
                continue;
            }
            if (sets.contains(data)) {
                rightNum++;
            } else {
                wrongNum++;
            }
        }

        System.out.println(rightNum);
        System.out.println(wrongNum);

        BigDecimal unknownTotal = new BigDecimal(unknownProbes);
        BigDecimal percent = new BigDecimal(wrongNum).divide(unknownTotal, 2, RoundingMode.HALF_UP);
        BigDecimal bingo = new BigDecimal(unknownProbes - wrongNum).divide(unknownTotal, 2, RoundingMode.HALF_UP);
        // The element and probe counts are taken from the constants; the previous hard-coded
        // "100W" text did not match the 100000 elements actually inserted.
        System.out.println("在" + INSERTIONS + "個元素中,判斷" + knownProbes + "個實際存在的元素,布隆過濾器認爲存在的:" + rightNum);
        System.out.println("在" + INSERTIONS + "個元素中,判斷" + unknownProbes + "個實際不存在的元素,誤認爲存在的:" + wrongNum + ",命中率:" + bingo + ",誤判率:" + percent);
    }
}
guava實現布隆過濾器是把數據放在本地內存中,我們項目往往是分佈式的,我們還可以把數據放在redis中,用redis來實現布隆過濾器,這就需要我們自己設計映射函數,自己度量二進制向量的長度
3.Redis布隆過濾器
Redis安裝Bloom Filter
Redis從4.0開始支持模塊(Module)機制,bloom filter需要以RedisBloom模塊的形式加載後才能使用。
# Fetch the RedisBloom module sources and build them.
git clone https://github.com/RedisLabsModules/redisbloom.git
cd redisbloom
make # compile
git命令報錯
解決方法:
安裝git命令即可解決問題,Linux百科網是以CentOS 7 64位系統爲例,在安裝git命令之前需要先啓用EPEL存儲庫。
一:先啓用EPEL存儲庫
系統位數不同命令也不同,以下列舉了CentOS 7 64位和32位的啓用EPEL存儲庫命令,大家按照系統版本選擇執行即可(納尼?不清楚自己是多少位系統?參考:如何查看Linux系統位數?32位或64位?)。
RHEL/CentOS 7 64位執行以下命令:
執行命令:wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
執行命令:rpm -ivh epel-release-latest-7.noarch.rpm
RHEL/CentOS 7 32位執行以下命令:
執行命令:wget http://dl.fedoraproject.org/pub/epel/6/i386/epel-release-6-8.noarch.rpm
執行命令:rpm -ivh epel-release-6-8.noarch.rpm
二:安裝git命令
執行命令:yum install -y git
然後等待系統自動安裝吧,當提示“Complete!”,說明已經安裝成功,可以使用git命令啦!
讓Redis啓動時可以加載bloom filter有兩種方式:
手工加載式:
redis-server --loadmodule ./redisbloom/redisbloom.so
每次啓動自加載:
編輯Redis的redis.conf文件,加入
loadmodule /soft/redisbloom/redisbloom.so
BloomFilterHelper
/**
 * Computes Bloom filter parameters and the bit offsets a value maps to.
 *
 * <p>The helper holds no bit storage itself; it only derives the bit-array length and the
 * number of hash functions from the expected element count and the target false-positive
 * probability, and turns a value into the array of bit positions to set or test.
 *
 * @param <T> type of the values being hashed
 */
public class BloomFilterHelper<T> {
    /** Number of bit positions derived for each value (k). */
    private final int numHashFunctions;
    /** Length of the bit array, in bits (m); always at least 1. */
    private final int bitSize;
    /** Funnel used to feed a value into the hash function. */
    private final Funnel<T> funnel;

    /**
     * Creates a helper for string values with a false-positive probability of 0.03.
     *
     * <p>NOTE(review): the funnel encodes with {@code Charset.defaultCharset()}, which is
     * platform dependent; JVMs with different default charsets may compute different offsets
     * for non-ASCII strings. Confirm all participants share the same default.
     *
     * @param expectedInsertions expected number of elements; must not be negative
     * @throws IllegalArgumentException if {@code expectedInsertions} is negative
     */
    @SuppressWarnings("unchecked") // the string funnel is only valid when T is a CharSequence type
    public BloomFilterHelper(int expectedInsertions) {
        this((Funnel<T>) Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, 0.03);
    }

    /**
     * Creates a helper with an explicit funnel and false-positive probability.
     *
     * @param funnel             funnel for values of type {@code T}; must not be null
     * @param expectedInsertions expected number of elements; must not be negative
     *                           (0 is treated as 1 so the derived sizes stay positive)
     * @param fpp                target false-positive probability, {@code 0 <= fpp < 1}
     * @throws NullPointerException     if {@code funnel} is null
     * @throws IllegalArgumentException if {@code expectedInsertions} or {@code fpp} is out of range
     */
    public BloomFilterHelper(Funnel<T> funnel, int expectedInsertions, double fpp) {
        if (funnel == null) {
            throw new NullPointerException("funnel");
        }
        if (expectedInsertions < 0) {
            throw new IllegalArgumentException(
                    "expectedInsertions must not be negative: " + expectedInsertions);
        }
        // Written as a positive range test so that NaN is rejected as well.
        if (!(fpp >= 0.0 && fpp < 1.0)) {
            throw new IllegalArgumentException("fpp must be in [0, 1): " + fpp);
        }
        this.funnel = funnel;
        // With n == 0 the bit size would be 0 and the modulo in murmurHashOffset would fail.
        long n = Math.max(1, expectedInsertions);
        this.bitSize = optimalNumOfBits(n, fpp);
        this.numHashFunctions = optimalNumOfHashFunctions(n, bitSize);
    }

    /**
     * Maps a value to its bit positions.
     *
     * <p>A single 128-bit murmur3 hash is computed; the two 32-bit halves of its first 64 bits
     * are combined as {@code hash1 + i * hash2} to derive each position.
     *
     * @param value value to hash
     * @return an array of {@code numHashFunctions} offsets, each in {@code [0, bitSize)}
     */
    public int[] murmurHashOffset(T value) {
        int[] offset = new int[numHashFunctions];
        long hash64 = Hashing.murmur3_128().hashObject(value, funnel).asLong();
        int hash1 = (int) hash64;
        int hash2 = (int) (hash64 >>> 32);
        for (int i = 1; i <= numHashFunctions; i++) {
            // int overflow wraps around here; the result is only used as a hash.
            int nextHash = hash1 + i * hash2;
            if (nextHash < 0) {
                // Flip the bits so the value is non-negative before taking the modulo.
                nextHash = ~nextHash;
            }
            offset[i - 1] = nextHash % bitSize;
        }
        return offset;
    }

    /**
     * Computes the bit-array length {@code m = -n * ln(p) / (ln 2)^2}.
     *
     * @param n expected number of elements
     * @param p target false-positive probability; 0 is replaced by {@code Double.MIN_VALUE}
     * @return the number of bits, at least 1 (the int cast caps it at {@code Integer.MAX_VALUE})
     */
    private int optimalNumOfBits(long n, double p) {
        if (p == 0) {
            p = Double.MIN_VALUE;
        }
        int bits = (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
        // A tiny n combined with a large p truncates to 0; keep at least one bit so the
        // modulo in murmurHashOffset is always defined.
        return Math.max(1, bits);
    }

    /**
     * Computes the number of hash functions {@code k = round(m / n * ln 2)}.
     *
     * @param n expected number of elements
     * @param m bit-array length
     * @return the number of hash functions, at least 1
     */
    private int optimalNumOfHashFunctions(long n, long m) {
        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
    }
}
RedisBloomFilter
/**
 * Bloom filter whose bits are stored in a Redis string value and manipulated through the
 * bit operations of {@code RedisTemplate} / {@code RedisConnection}.
 *
 * <p>The bit positions for a value are supplied by a {@code BloomFilterHelper}; this class only
 * sets and reads those positions under the given Redis key.
 */
@Service
public class RedisBloomFilter<T> {
    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Deletes the Redis key that backs a filter.
     *
     * @param key Redis key
     */
    public void delete(String key) {
        redisTemplate.delete(key);
    }

    /**
     * Adds a single value to the filter. One Redis call is issued per bit position, so prefer
     * {@link #addList} when adding many values.
     *
     * @param bloomFilterHelper helper that computes the bit positions
     * @param key               Redis key of the filter
     * @param value             value to add
     * @param <T>               value type
     */
    public <T> void add(BloomFilterHelper<T> bloomFilterHelper, String key, T value) {
        int[] offset = bloomFilterHelper.murmurHashOffset(value);
        for (int i : offset) {
            redisTemplate.opsForValue().setBit(key, i, true);
        }
    }

    /**
     * Adds a batch of values to the filter using a Redis pipeline.
     *
     * <p>The key is serialized with the template's key serializer, i.e. the same way
     * {@link #add}, {@link #contains} and {@link #delete} address it, so that all operations
     * hit the same Redis key regardless of the configured serializer.
     *
     * @param bloomFilterHelper helper that computes the bit positions
     * @param key               Redis key of the filter
     * @param valueList         values to add
     * @param <T>               value type
     */
    public <T> void addList(BloomFilterHelper<T> bloomFilterHelper, String key, List<T> valueList) {
        if (valueList.isEmpty()) {
            // Nothing to write; avoid an empty pipeline round trip.
            return;
        }
        // Previously key.getBytes() was used here, which bypasses the key serializer (and uses
        // the platform charset); with a non-String key serializer the batch ended up under a
        // different key than the one read by contains().
        final byte[] rawKey = redisTemplate.getKeySerializer().serialize(key);
        redisTemplate.executePipelined(new RedisCallback<Long>() {
            @Override
            public Long doInRedis(RedisConnection connection) throws DataAccessException {
                // executePipelined() already runs this callback on a pipelined connection,
                // so no explicit openPipeline() call is needed.
                for (T value : valueList) {
                    for (int i : bloomFilterHelper.murmurHashOffset(value)) {
                        connection.setBit(rawKey, i, true);
                    }
                }
                // The callback passed to executePipelined() must return null.
                return null;
            }
        });
    }

    /**
     * Tests whether a value may be contained in the filter.
     *
     * @param bloomFilterHelper helper that computes the bit positions
     * @param key               Redis key of the filter
     * @param value             value to look up
     * @param <T>               value type
     * @return {@code false} as soon as one of the value's bits is unset; {@code true} if all
     *         of them are set
     */
    public <T> boolean contains(BloomFilterHelper<T> bloomFilterHelper, String key, T value) {
        int[] offset = bloomFilterHelper.murmurHashOffset(value);
        for (int i : offset) {
            if (!redisTemplate.opsForValue().getBit(key, i)) {
                return false;
            }
        }
        return true;
    }
}
測試類
/**
 * Demo endpoint that fills a Redis-backed Bloom filter with random UUIDs, probes it with a
 * mix of known and unknown values, and prints timing and accuracy figures to standard output.
 */
@RestController
public class TestController {
    @Autowired
    private RedisBloomFilter redisBloomFilter;

    /**
     * Runs the demo: resets the "bloom" key, batch-inserts {@code expectedInsertions / 5}
     * values, then issues 10000 lookups of which every 100th uses an inserted value.
     */
    @GetMapping("/test")
    public void index() {
        // Capacity the filter is sized for.
        int expectedInsertions = 50000;
        // Target false-positive probability.
        double fpp = 0.1;
        redisBloomFilter.delete("bloom");
        BloomFilterHelper<CharSequence> bloomFilterHelper = new BloomFilterHelper<>(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
        int rightNum = 0;
        int wrongNum = 0;

        // Generate the values to insert (expectedInsertions / 5 of them). One list serves both
        // as the insert batch and as the ground truth for the lookups below.
        List<String> valueList = new ArrayList<>();
        for (int i = 0; i < expectedInsertions / 5; i++) {
            valueList.add(UUID.randomUUID().toString());
        }

        long beginTime = System.currentTimeMillis();
        redisBloomFilter.addList(bloomFilterHelper, "bloom", valueList);
        // Elapsed time is reported in whole seconds.
        long costSeconds = (System.currentTimeMillis() - beginTime) / 1000;
        System.out.println("布隆過濾器添加" + expectedInsertions / 5 + "個值,耗時:" + costSeconds + "s");

        for (int i = 0; i < 10000; i++) {
            // Multiples of 100 (100 probes) use an inserted value; the rest use a fresh UUID.
            String data = i % 100 == 0 ? valueList.get(i / 100) : UUID.randomUUID().toString();
            boolean result = redisBloomFilter.contains(bloomFilterHelper, "bloom", data);
            if (!result) {
                continue;
            }
            // A positive answer may be a false positive, so confirm against the real values.
            if (valueList.contains(data)) {
                rightNum++;
            } else {
                wrongNum++;
            }
        }

        BigDecimal percent = new BigDecimal(wrongNum).divide(new BigDecimal(9900), 2, RoundingMode.HALF_UP);
        BigDecimal bingo = new BigDecimal(9900 - wrongNum).divide(new BigDecimal(9900), 2, RoundingMode.HALF_UP);
        // Report only the confirmed hits here; false positives belong to the next line and
        // were previously added into this figure by mistake.
        System.out.println("在5W個元素中,判斷100個實際存在的元素,布隆過濾器認爲存在的:" + rightNum);
        System.out.println("在5W個元素中,判斷9900個實際不存在的元素,誤認爲存在的:" + wrongNum + ",命中率:" + bingo + ",誤判率:" + percent);
        // NOTE(review): measured from beginTime, so this figure also includes the insert phase.
        System.out.println("驗證結果耗時:" + (System.currentTimeMillis() - beginTime) / 1000 + "s");
    }
}