1.布隆过滤器原理
请参考以下两篇文档
SpringBoot+Redis布隆过滤器
布隆过滤器
2.Guava布隆过滤器
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>21.0</version>
</dependency>
/**
* Hello world!
*/
public class App {
/**
* 插入的数据量
*/
private static final int insertions = 100000;
/**
* 误差率
*/
private static double fpp = 0.02;
public static void main(String[] args) {
//初始化一个存储string数据的布隆过滤器,默认误判率是0.03
BloomFilter<String> bf = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), insertions, fpp);
//用于存放所有实际存在的key,用于是否存储
Set<String> sets = new HashSet<String>(insertions);
//用于存放所有实际存在的key,用于取出
List<String> lists = new ArrayList<String>(insertions);
//插入随机字符串
for (int i = 0; i < insertions; i++) {
String uuid = UUID.randomUUID().toString();
bf.put(uuid);
sets.add(uuid);
lists.add(uuid);
}
int rightNum = 0;
int wrongNum = 0;
for (int i = 0; i < 10000; i++) {
// 0-10000之间,可以被100整除的数有100个(100的倍数)
String data = i % 100 == 0 ? lists.get(i / 100) : UUID.randomUUID().toString();
//这里用了might,看上去不是很自信,所以如果布隆过滤器判断存在了,我们还要去sets中实锤
if (bf.mightContain(data)) {
if (sets.contains(data)) {
rightNum++;
continue;
}
wrongNum++;
}
}
System.out.println(rightNum);
System.out.println(wrongNum);
BigDecimal percent = new BigDecimal(wrongNum).divide(new BigDecimal(9900), 2, RoundingMode.HALF_UP);
BigDecimal bingo = new BigDecimal(9900 - wrongNum).divide(new BigDecimal(9900), 2, RoundingMode.HALF_UP);
System.out.println("在100W个元素中,判断100个实际存在的元素,布隆过滤器认为存在的:" + rightNum);
System.out.println("在100W个元素中,判断9900个实际不存在的元素,误认为存在的:" + wrongNum + ",命中率:" + bingo + ",误判率:" + percent);
}
}
guava实现布隆过滤器是把数据放在本地内存中,我们项目往往是分布式的,我们还可以把数据放在redis中,用redis来实现布隆过滤器,这就需要我们自己设计映射函数,自己度量二进制向量的长度
3.Redis布隆过滤器
Redis安装Bloom Filter
Redis从4.0才开始支持bloom filter。
git clone https://github.com/RedisLabsModules/redisbloom.git
cd redisbloom
make # 编译
git命令报错
解决方法:
安装git命令即可解决问题,Linux百科网是以CentOS 7 64位系统为例,在安装git命令之前需要先启用EPEL存储库。
一:先启用EPEL存储库
系统位数不同命令也不同,以下列举了CentOS 7 64位和32位的启用EPEL存储库命令,大家按照系统版本选择执行即可(纳尼?不清楚自己是多少位系统?参考:如何查看Linux系统位数?32位或64位?)。
RHEL/CentOS 7 64位执行以下命令:
执行命令:wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
执行命令:rpm -ivh epel-release-latest-7.noarch.rpm
RHEL/CentOS 7 32位执行以下命令:
执行命令:get http://dl.fedoraproject.org/pub/epel/6/i386/epel-release-6-8.noarch.rpm
执行命令:rpm -ivh epel-release-6-8.noarch.rpm
二:安装git命令
执行命令:yum install -y git
然后等待系统自动安装吧,当提示“Complete!”,说明已经安装成功,可以使用git命令啦!
让Redis启动时可以加载bloom filter有两种方式:
手工加载式:
redis-server --loadmodule ./redisbloom/rebloom.so
每次启动自加载:
编辑Redis的redis.conf文件,加入
loadmodule /soft/redisbloom/redisbloom.so
BloomFilterHelper
public class BloomFilterHelper<T> {
// 散列函数
private int numHashFunctions;
// 二进制大小
private int bitSize;
// 过滤器
private Funnel<T> funnel;
public BloomFilterHelper(int expectedInsertions) {
this.funnel = (Funnel<T>) Funnels.stringFunnel(Charset.defaultCharset());
bitSize = optimalNumOfBits(expectedInsertions, 0.03);
numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, bitSize);
}
public BloomFilterHelper(Funnel<T> funnel, int expectedInsertions, double fpp) {
this.funnel = funnel;
bitSize = optimalNumOfBits(expectedInsertions, fpp);
numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, bitSize);
}
public int[] murmurHashOffset(T value) {
int[] offset = new int[numHashFunctions];
long hash64 = Hashing.murmur3_128().hashObject(value, funnel).asLong();
int hash1 = (int) hash64;
int hash2 = (int) (hash64 >>> 32);
for (int i = 1; i <= numHashFunctions; i++) {
int nextHash = hash1 + i * hash2;
if (nextHash < 0) {
nextHash = ~nextHash;
}
offset[i - 1] = nextHash % bitSize;
}
return offset;
}
/**
* 计算bit数组长度
*/
private int optimalNumOfBits(long n, double p) {
if (p == 0) {
p = Double.MIN_VALUE;
}
return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
}
/**
* 计算hash方法执行次数
*/
private int optimalNumOfHashFunctions(long n, long m) {
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
}
}
RedisBloomFilter
@Service
public class RedisBloomFilter<T> {
@Autowired
private RedisTemplate redisTemplate;
/**
* 删除缓存的KEY
*
* @param key KEY
*/
public void delete(String key) {
redisTemplate.delete(key);
}
/**
* 根据给定的布隆过滤器添加值,在添加一个元素的时候使用,批量添加的性能差
*
* @param bloomFilterHelper 布隆过滤器对象
* @param key KEY
* @param value 值
* @param <T> 泛型,可以传入任何类型的value
*/
public <T> void add(BloomFilterHelper<T> bloomFilterHelper, String key, T value) {
int[] offset = bloomFilterHelper.murmurHashOffset(value);
for (int i : offset) {
redisTemplate.opsForValue().setBit(key, i, true);
}
}
/**
* 根据给定的布隆过滤器添加值,在添加一批元素的时候使用,批量添加的性能好,使用pipeline方式(如果是集群下,请使用优化后RedisPipeline的操作)
*
* @param bloomFilterHelper 布隆过滤器对象
* @param key KEY
* @param valueList 值,列表
* @param <T> 泛型,可以传入任何类型的value
*/
public <T> void addList(BloomFilterHelper<T> bloomFilterHelper, String key, List<T> valueList) {
redisTemplate.executePipelined(new RedisCallback<Long>() {
@Override
public Long doInRedis(RedisConnection connection) throws DataAccessException {
connection.openPipeline();
for (T value : valueList) {
int[] offset = bloomFilterHelper.murmurHashOffset(value);
for (int i : offset) {
connection.setBit(key.getBytes(), i, true);
}
}
return null;
}
});
}
/**
* 根据给定的布隆过滤器判断值是否存在
*
* @param bloomFilterHelper 布隆过滤器对象
* @param key KEY
* @param value 值
* @param <T> 泛型,可以传入任何类型的value
* @return 是否存在
*/
public <T> boolean contains(BloomFilterHelper<T> bloomFilterHelper, String key, T value) {
int[] offset = bloomFilterHelper.murmurHashOffset(value);
for (int i : offset) {
if (!redisTemplate.opsForValue().getBit(key, i)) {
return false;
}
}
return true;
}
}
测试类
@RestController
public class TestController {
@Autowired
private RedisBloomFilter redisBloomFilter;
@GetMapping("/test")
public void index() {
/**
* 请求的数据
*/
int expectedInsertions = 50000;
/**
* 误差率
*/
double fpp = 0.1;
redisBloomFilter.delete("bloom");
BloomFilterHelper<CharSequence> bloomFilterHelper = new BloomFilterHelper<>(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
int rightNum = 0;
int wrongNum = 0;
// 添加10000个元素
List<String> valueList = new ArrayList<>();
List<String> keyList = new ArrayList<>();
for (int i = 0; i < expectedInsertions / 5; i++) {
String uuid = UUID.randomUUID().toString();
valueList.add(uuid);
keyList.add(uuid);
}
long beginTime = System.currentTimeMillis();
redisBloomFilter.addList(bloomFilterHelper, "bloom", valueList);
long costMs = (System.currentTimeMillis() - beginTime) / 1000;
System.out.println("布隆过滤器添加" + expectedInsertions / 5 + "个值,耗时:" + costMs + "s");
for (int i = 0; i < 10000; i++) {
// 0-10000之间,可以被100整除的数有100个(100的倍数)
String data = i % 100 == 0 ? keyList.get(i / 100) : UUID.randomUUID().toString();
boolean result = redisBloomFilter.contains(bloomFilterHelper, "bloom", data);
if (result) {
if (keyList.contains(data)) {
rightNum++;
continue;
}
wrongNum++;
}
}
BigDecimal percent = new BigDecimal(wrongNum).divide(new BigDecimal(9900), 2, RoundingMode.HALF_UP);
BigDecimal bingo = new BigDecimal(9900 - wrongNum).divide(new BigDecimal(9900), 2, RoundingMode.HALF_UP);
System.out.println("在5W个元素中,判断100个实际存在的元素,布隆过滤器认为存在的:" + (rightNum+wrongNum));
System.out.println("在5W个元素中,判断9900个实际不存在的元素,误认为存在的:" + wrongNum + ",命中率:" + bingo + ",误判率:" + percent);
System.out.println("验证结果耗时:" + (System.currentTimeMillis() - beginTime) / 1000 + "s");
}
}