java 基數估值

通過基數估值的方法來得到大量數據中重複的列。



算法步驟:隨機生成大量數據,利用 MurmurHash 得到 32 位的 hash 值,再取 hash 值的前 10 位把數據分到 2^10(1024)個桶中,統計每個桶的計數。

package test;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;


 


 class jishuguzhi { 
	 
	 
	 public static void main(String args[])
	 {
		 
		 engine en=new engine();
		 
		 en.jisuan();
		 
	 }
	 
 }
 
 
 
  /**
   * Hashes random values with {@code MurmurHash.hash32} and counts how many
   * of them fall into each bucket, where the bucket index is the top
   * {@value #BUCKET_BITS} bits of the 32-bit hash (2^10 = 1024 buckets).
   *
   * <p>Counts accumulate in {@link #map}; the class is not synchronized.
   */
  class engine
  {
      /** Number of leading hash bits that select the bucket. */
      private static final int BUCKET_BITS = 10;

      /** Width, in bits, of the hash values being bucketed. */
      private static final int HASH_BITS = 32;

      /** Seed handed to the hash function for every sample. */
      private static final int HASH_SEED = 33333333;

      /**
       * Sample count used by the no-argument {@link #jisuan()}. This is the
       * value the code has always used; it is very large, so prefer
       * {@link #jisuan(long)} with an explicit count for practical runs.
       */
      private static final long DEFAULT_SAMPLE_COUNT = 100000000000L;

      /** Charset used to turn the decimal text of a sample into bytes. */
      private static final java.nio.charset.Charset SAMPLE_CHARSET =
              java.nio.charset.Charset.forName("UTF-8");

      /** Bucket index (0..1023) mapped to the number of samples seen in it. */
      HashMap<Integer,Integer> map=new HashMap<Integer, Integer>();
      MurmurHash m=new MurmurHash();

      /**
       * Returns the 32-bit hash of {@code data} widened to {@code long}.
       *
       * <p>The widening is a plain {@code int}-to-{@code long} conversion, so
       * hashes with the top bit set come back as negative values.
       *
       * @param data   bytes to hash
       * @param length number of bytes of {@code data} to hash
       * @param seed   hash seed
       * @return the sign-extended 32-bit hash
       */
      public long gethash(byte [] data,int length,int seed)
      {
          return m.hash32(data, length, seed);
      }

      /**
       * Runs {@link #jisuan(long)} with the built-in default sample count.
       */
      public void jisuan()
      {
          jisuan(DEFAULT_SAMPLE_COUNT);
      }

      /**
       * Draws {@code sampleCount} random {@code long} values, hashes the
       * decimal text of each one, increments the counter of the bucket the
       * hash falls into, and finally prints every {@code bucket→count} pair.
       *
       * <p>Per-sample debug printing is intentionally omitted; only the
       * final table is written to standard output.
       *
       * @param sampleCount number of random samples to draw; values less
       *                    than or equal to zero draw no samples
       */
      public void jisuan(long sampleCount)
      {
          // One generator for the whole run instead of one per sample.
          Random random = new Random();

          for (long drawn = 0; drawn < sampleCount; drawn++)
          {
              byte[] data = String.valueOf(random.nextLong()).getBytes(SAMPLE_CHARSET);
              int bucket = bucketIndex(gethash(data, data.length, HASH_SEED));

              // Read and write the SAME key so that counts actually accumulate.
              Integer seen = map.get(bucket);
              map.put(bucket, seen == null ? 1 : seen + 1);
          }

          // Print the contents of the map.
          for (java.util.Map.Entry<Integer, Integer> entry : map.entrySet())
          {
              System.out.println(entry.getKey() + "→" + entry.getValue());
          }
      }

      /**
       * Extracts the bucket index from a hash value.
       *
       * <p>Only the low 32 bits of {@code hash} are considered, so the sign
       * extension done by {@link #gethash} does not leak into the result,
       * and leading zero bits are kept (unlike a binary-string round trip).
       *
       * @param hash hash value as returned by {@link #gethash}
       * @return the top {@value #BUCKET_BITS} bits of the 32-bit hash,
       *         in the range 0..1023
       */
      private static int bucketIndex(long hash)
      {
          return (int) ((hash & 0xFFFFFFFFL) >>> (HASH_BITS - BUCKET_BITS));
      }
  }
 

 
  
  /**
   * Murmur-style multiplicative hash functions over byte arrays, in 32-bit
   * and 64-bit variants. The constants and mixing steps look like
   * MurmurHash2 / MurmurHash64A (not verified against a reference here).
   *
   * <p>All methods are instance methods and the class keeps no mutable state.
   */
  final class MurmurHash {

      // The class does not implement Serializable in this file, so this field
      // is not read by anything visible here.
      private static final long serialVersionUID = 4342869264396184799L;

      /** Seed used by {@link #hash32(byte[], int)}. */
      private static final int DEFAULT_SEED_32 = 0x9747b28c;

      /** Seed used by {@link #hash64(byte[], int)}. */
      private static final int DEFAULT_SEED_64 = 0xe17a1465;

      /** Creates a hasher; there is nothing to configure. */
      public MurmurHash() {
      }

      /**
       * Converts a string to bytes without any charset: each {@code char}
       * becomes two bytes, low byte first.
       *
       * @param str string to convert
       * @return array of {@code 2 * str.length()} bytes
       */
      protected byte[] toBytesWithoutEncoding(String str) {
          final int charCount = str.length();
          final byte[] out = new byte[charCount << 1];
          for (int i = 0; i < charCount; i++) {
              final char ch = str.charAt(i);
              out[2 * i] = (byte) (ch & 0xFF);
              out[2 * i + 1] = (byte) (ch >> 8);
          }
          return out;
      }

      /**
       * Hashes a string with the default-seed 32-bit hash, using the
       * two-bytes-per-char form produced by {@link #toBytesWithoutEncoding}.
       *
       * @param str string to hash
       * @return 32 bit hash of the string
       */
      public int hashcode(String str) {
          final byte[] raw = toBytesWithoutEncoding(str);
          return hash32(raw, raw.length);
      }

      /**
       * Generates 32 bit hash from byte array of the given length and seed.
       *
       * @param data byte array to hash
       * @param length length of the array to hash
       * @param seed initial seed value
       * @return 32 bit hash of the given array
       */
      public int hash32(final byte[] data, int length, int seed) {
          // Mixing constants, taken as-is from the algorithm.
          final int mult = 0x5bd1e995;
          final int shift = 24;

          int hash = seed ^ length;

          // Consume the input four bytes at a time, little-endian.
          final int blockCount = length / 4;
          for (int block = 0; block < blockCount; block++) {
              final int offset = block * 4;
              int k = 0;
              for (int b = 0; b < 4; b++) {
                  k |= (data[offset + b] & 0xff) << (8 * b);
              }
              k *= mult;
              k ^= k >>> shift;
              k *= mult;

              hash *= mult;
              hash ^= k;
          }

          // Fold in the 1-3 trailing bytes, highest index first.
          final int remaining = length % 4;
          if (remaining > 0) {
              final int tailStart = length & ~3;
              for (int b = remaining - 1; b >= 0; b--) {
                  hash ^= (data[tailStart + b] & 0xff) << (8 * b);
              }
              hash *= mult;
          }

          // Final avalanche.
          hash ^= hash >>> 13;
          hash *= mult;
          hash ^= hash >>> 15;
          return hash;
      }

      /**
       * Generates 32 bit hash from byte array with default seed value.
       *
       * @param data byte array to hash
       * @param length length of the array to hash
       * @return 32 bit hash of the given array
       */
      public int hash32(final byte[] data, int length) {
          return hash32(data, length, DEFAULT_SEED_32);
      }

      /**
       * Generates 64 bit hash from byte array of the given length and seed.
       *
       * @param data byte array to hash
       * @param length length of the array to hash
       * @param seed initial seed value (treated as unsigned 32-bit)
       * @return 64 bit hash of the given array
       */
      public long hash64(final byte[] data, int length, int seed) {
          final long mult = 0xc6a4a7935bd1e995L;
          final int shift = 47;

          long hash = (seed & 0xffffffffL) ^ (length * mult);

          // Consume the input eight bytes at a time, little-endian.
          final int blockCount = length / 8;
          for (int block = 0; block < blockCount; block++) {
              final int offset = block * 8;
              long k = 0L;
              for (int b = 0; b < 8; b++) {
                  k |= (data[offset + b] & 0xffL) << (8 * b);
              }
              k *= mult;
              k ^= k >>> shift;
              k *= mult;

              // Note: xor-then-multiply here, the reverse of the 32-bit variant.
              hash ^= k;
              hash *= mult;
          }

          // Fold in the 1-7 trailing bytes, highest index first.
          final int remaining = length % 8;
          if (remaining > 0) {
              final int tailStart = length & ~7;
              for (int b = remaining - 1; b >= 0; b--) {
                  hash ^= (long) (data[tailStart + b] & 0xff) << (8 * b);
              }
              hash *= mult;
          }

          // Final avalanche.
          hash ^= hash >>> shift;
          hash *= mult;
          hash ^= hash >>> shift;
          return hash;
      }

      /**
       * Generates 64 bit hash from byte array with default seed value.
       *
       * @param data byte array to hash
       * @param length length of the array to hash
       * @return 64 bit hash of the given array
       */
      public long hash64(final byte[] data, int length) {
          return hash64(data, length, DEFAULT_SEED_64);
      }
  }

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章