Java版本簡單布隆過濾器

最近搞NLP爬了點數據，有地方需要判斷一些字符串是否在一個大集合裏面出現過，聯想到了此前經歷過的一個面試題。

問：在做網絡爬蟲的時候，經常會有URL重複出現，怎麼規避這種情況？
答：Java裏面可以用HashSet保存已經訪問過的URL。
問：如果這個URL的量很多呢？比如十億條？
答：如果能夠忍受一定錯誤率的話，可以使用布隆過濾器，balabala…

一、基本概念

上面已經介紹了布隆過濾器的一個常用場景，那麼布隆過濾器到底是一個什麼東西呢？

介紹：布隆過濾器可以被認爲是一個很長的二進制數組，可以用來代表一個大規模集合，判斷某個元素是否在該集合中，但是有一定錯誤率；
錯誤率：布隆過濾器的判斷是有一定機率失誤的，但只可能出現某個元素不在集合中，卻被判斷爲在集合中這種類型的錯誤。
用途：常用於URL過濾、網頁黑名單、郵件黑名單等；
基本原理和運行步驟：
① 布隆過濾器可以看作一個長度爲 m 的比特數組，初始所有位置都置爲0，有 n 個相互獨立的Hash函數；
② 針對每個集合中的元素 v ，用這 n 個相互獨立的Hash函數求得 n 個 Hash值：H1、H2、…、Hn；
③ 對於某個求得的Hash值Hi，求：index = Hi % (m - 1)（標準做法是 Hi % m；這裏與下文代碼保持一致，取模 m - 1 時數組最後一位不會被用到），然後把比特數組中的 index 位置置爲1，這樣對於每個元素 v ，最多會把比特數組中的 n 個位置置爲1；
④ 當我們把比特數組初始化成功後，對於任何一個待查元素q，也用那 n 個Hash函數求得它的所有位置，判斷這些位置是否爲1，如果全爲1則在集合中，否則判斷爲不在集合中；

二、代碼實現

這個Java版本布隆過濾器參考了這個Python上基於Redis的版本，因爲是簡單版本，所以使用Java中的BitSet作爲基本數據結構，然後使用了mmh3哈希（聽說很快）。

import java.io.Serializable;
import java.util.*;

/**
 * @author kidd
 */
/**
 * A simple in-memory Bloom filter for strings.
 *
 * <p>Membership bits live in a {@link BitSet}; each of the {@code hashCount} hash
 * functions is a 32-bit MurmurHash3 evaluated with a different seed. A lookup can
 * report an element as present although it was never added (false positive), but
 * an element that was added is always reported as present.
 *
 * <p>Bit positions are derived as {@code hash % (bitCount - 1)}, so the highest bit
 * is never addressed. That mapping is intentionally left unchanged so that filters
 * serialized by earlier versions of this class keep answering correctly.
 *
 * <p>This class performs no synchronization.
 *
 * @author kidd
 */
public class BloomFilter implements Serializable {

    private static final long serialVersionUID = -881375780720891535L;

    /**
     * Seed pool; the i-th hash function uses {@code SEEDS[i]}. Its length is the upper
     * bound on the number of hash functions a filter can use.
     */
    private static final int[] SEEDS = {543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
            344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
            465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
            481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
            63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518};

    /** Natural logarithm of 2, used by the sizing formulas. */
    private static final double LN_2 = Math.log(2);

    /** Smallest usable bit count; keeps the {@code bitCount - 1} modulus non-zero. */
    private static final int MIN_BIT_COUNT = 2;

    /**
     * Total number of bits in the filter.
     */
    private final int bitCount;

    /**
     * The bit array.
     */
    private final BitSet bitSet;

    /**
     * Number of hash values generated per string.
     */
    private final int hashCount;

    /**
     * Seeds in use, one per hash function.
     */
    private final int[] seeds;

    /**
     * False-positive rate expected once {@code capacity} elements have been added.
     */
    private final double realErrorRate;

    /**
     * Creates a filter for 100,000,000 elements with a requested error rate of 1e-8.
     *
     * <p>That request needs more bits than a {@link BitSet} can index, so the bit count
     * is clamped to {@link Integer#MAX_VALUE} (about 256 MiB of heap);
     * {@link #getRealErrorRate()} reports the rate that is actually achieved.
     */
    public BloomFilter() {
        this(100000000, 0.00000001);
    }

    /**
     * Creates a filter sized for {@code capacity} elements at the requested
     * false-positive rate.
     *
     * <p>The bit count is {@code m = ceil(-n * ln(p) / (ln 2)^2)} and the number of hash
     * functions is {@code k = ceil(ln 2 * m / n)}. The bit count is clamped to
     * {@code [2, Integer.MAX_VALUE]} and the hash count to the number of available
     * seeds, so the achieved rate may differ from the requested one.
     *
     * @param capacity  expected number of distinct elements; must be positive
     * @param errorRate requested false-positive probability; must be in (0, 1)
     * @throws IllegalArgumentException if either argument is out of range
     */
    public BloomFilter(int capacity, double errorRate) {
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!(errorRate > 0 && errorRate < 1)) {
            throw new IllegalArgumentException("errorRate must be in (0, 1): " + errorRate);
        }

        // Computed in double first: the ideal size can exceed the int range.
        double idealBits = Math.ceil(-capacity * Math.log(errorRate) / (LN_2 * LN_2));
        this.bitCount = (int) Math.max(MIN_BIT_COUNT, Math.min(Integer.MAX_VALUE, idealBits));

        // Never ask for more hash functions than there are distinct seeds.
        int idealHashes = (int) Math.ceil(LN_2 * bitCount / capacity);
        this.hashCount = Math.min(SEEDS.length, Math.max(1, idealHashes));

        this.realErrorRate = Math.pow(1 - Math.exp(-(double) capacity * hashCount / bitCount), hashCount);
        this.seeds = Arrays.copyOf(SEEDS, hashCount);
        this.bitSet = new BitSet(bitCount);
    }

    /**
     * Returns the false-positive rate computed from the actual bit count and hash count
     * for a filter holding {@code capacity} elements.
     *
     * @return the expected false-positive probability
     */
    public double getRealErrorRate () {
        return this.realErrorRate;
    }

    /**
     * Records {@code value} and reports whether it was already present.
     *
     * @param value the string to test and add
     * @return {@code true} if every bit for {@code value} was already set (the value was
     *         probably added before), {@code false} if at least one bit had to be set
     **/
    public boolean addIfNotExist(String value) {
        boolean existed = true;
        for (int hash : getHashList(value)) {
            int index = indexOf(hash);
            if (!bitSet.get(index)) {
                bitSet.set(index);
                existed = false;
            }
        }
        return existed;
    }

    /**
     * Records {@code value} in the filter.
     *
     * @param value the string to add
     */
    public void add(String value) {
        for (int hash : getHashList(value)) {
            bitSet.set(indexOf(hash));
        }
    }

    /**
     * Tests whether {@code value} may have been added.
     *
     * @param value the string to look up
     * @return {@code false} if {@code value} was definitely never added, {@code true} if
     *         all of its bits are set
     */
    public boolean isExisted(String value) {
        for (int hash : getHashList(value)) {
            if (!bitSet.get(indexOf(hash))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes one non-negative hash per configured seed.
     *
     * @param value the string to hash; must not be {@code null}
     * @return a list of {@code hashCount} non-negative hash values, in seed order
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public List<Integer> getHashList(String value) {
        Objects.requireNonNull(value, "value");
        List<Integer> hashList = new ArrayList<>(hashCount);
        for (int seed : seeds) {
            int hash = murmurhash3(value, seed);
            if (hash < 0) {
                // MAX_VALUE + MIN_VALUE is -1, which would be an invalid bit index;
                // clamp that single case to 0 and keep the mapping for all other values.
                hash = Math.max(0, Integer.MAX_VALUE + hash);
            }
            hashList.add(hash);
        }
        return hashList;
    }

    /** Maps a non-negative hash to a bit position (see the class comment on the modulus). */
    private int indexOf(int hash) {
        return hash % (bitCount - 1);
    }

    /**
     * Computes a 32-bit MurmurHash3 of {@code data}, encoding the characters as UTF-8 on
     * the fly instead of materializing a byte array.
     *
     * @param data the characters to hash
     * @param seed the hash seed
     * @return the 32-bit hash, which may be negative
     */
    public int murmurhash3(CharSequence data, int seed) {

        final int c1 = 0xcc9e2d51;
        final int c2 = 0x1b873593;

        int h1 = seed;

        int pos = 0;
        int end = data.length();
        int k1 = 0;
        int k2 = 0;
        int shift = 0;
        int bits = 0;
        // length in UTF8 bytes
        int nBytes = 0;


        while (pos < end) {
            // Encode the next character as 1-4 UTF-8 bytes packed little-endian into k2.
            int code = data.charAt(pos++);
            if (code < 0x80) {
                k2 = code;
                bits = 8;
            }
            else if (code < 0x800) {
                k2 = (0xC0 | (code >> 6))
                        | ((0x80 | (code & 0x3F)) << 8);
                bits = 16;
            }
            else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
                // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
                k2 = (0xE0 | (code >> 12))
                        | ((0x80 | ((code >> 6) & 0x3F)) << 8)
                        | ((0x80 | (code & 0x3F)) << 16);
                bits = 24;
            } else {
                // surrogate pair
                int utf32 = (int) data.charAt(pos++);
                utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
                k2 = (0xff & (0xF0 | (utf32 >> 18)))
                        | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
                        | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
                        |  (0x80 | (utf32 & 0x3F)) << 24;
                bits = 32;
            }


            // Append the new bytes above the ones already buffered in k1; whatever does
            // not fit into the 32-bit word is carried over after mixing.
            k1 |= k2 << shift;

            shift += bits;
            if (shift >= 32) {
                // mix after we have a complete word

                k1 *= c1;
                k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
                k1 *= c2;

                h1 ^= k1;
                h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
                h1 = h1*5+0xe6546b64;

                shift -= 32;
                // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
                if (shift != 0) {
                    k1 = k2 >>> (bits-shift);   // bits used == bits - newshift
                } else {
                    k1 = 0;
                }
                nBytes += 4;
            }

        } // inner

        // handle tail
        if (shift > 0) {
            nBytes += shift >> 3;
            k1 *= c1;
            k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
            k1 *= c2;
            h1 ^= k1;
        }

        // finalization
        h1 ^= nBytes;

        // fmix(h1);
        h1 ^= h1 >>> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >>> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >>> 16;

        return h1;
    }

    /**
     * Small demo: prints {@code false} for each first insertion and {@code true} for the
     * repeated one.
     */
    public static void main(String[] args) {
        // A small filter is plenty for a handful of strings; the no-arg constructor
        // would allocate roughly 256 MiB.
        BloomFilter filter = new BloomFilter(1000, 0.001);
        System.out.println(filter.addIfNotExist("1111111111111"));
        System.out.println(filter.addIfNotExist("2222222222222222"));
        System.out.println(filter.addIfNotExist("3333333333333333"));
        System.out.println(filter.addIfNotExist("444444444444444"));
        System.out.println(filter.addIfNotExist("5555555555555"));
        System.out.println(filter.addIfNotExist("6666666666666"));
        System.out.println(filter.addIfNotExist("1111111111111"));
    }

}

Java版本簡單布隆過濾器

一、基本概念

二、代碼實現

ROUGE和pyrouge的安裝

不能忽視的 Synchronization on a non-final field

【轉載】Linux 創建子進程執行任務

LCSTS中文數據集解析與處理

模擬實現Tair中的版本號

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結