HashTable 源碼分析

基礎元素結構如下:
// The hash table data.
// This cannot be serialised
private struct bucket {
public Object key;
public Object val;
public int hash_coll; // Store hash code; sign bit means there was a collision.
}

private bucket[] buckets;
// The total number of entries in the hash table.
private int count;
// The total number of collision bits set in the hashtable
private int occupancy;
private int loadsize;
private float loadFactor;
private volatile int version;
private volatile bool isWriterInProgress;
private ICollection keys;
private ICollection values;

private IEqualityComparer _keycomparer;
private Object _syncRoot;
首先分析它的添加元素的方法:
// Adds an entry with the given key and value to this hashtable. An
// ArgumentException is thrown if the key is null or if the key is already
// present in the hashtable.
//
public virtual void Add(Object key, Object value) {
Insert(key, value, true);
}
// Inserts an entry into this hashtable. This method is called from the Set
// and Add methods. If the add parameter is true and the given key already
// exists in the hashtable, an exception is thrown.
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.MayFail)]
private void Insert (Object key, Object nvalue, bool add) {
// @


if (key == null) {
throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
}

Contract.EndContractBlock();
if (count >= loadsize) {
expand();
}
else if(occupancy > loadsize && count > 100) {
rehash();
}
uint seed;
uint incr;
// Assume we only have one thread writing concurrently. Modify
// buckets to contain new data, as long as we insert in the right order.
uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
int ntry = 0;
int emptySlotNumber = -1; // We use the empty slot number to cache the first empty slot. We chose to reuse slots
// create by remove that have the collision bit set over using up new slots.
int bucketNumber = (int) (seed % (uint)buckets.Length);
do {

// Set emptySlot number to current bucket if it is the first available bucket that we have seen
// that once contained an entry and also has had a collision.
// We need to search this entire collision chain because we have to ensure that there are no
// duplicate entries in the table.
if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))//(((buckets[bucketNumber].hash_coll & unchecked(0x80000000))!=0)))
emptySlotNumber = bucketNumber;

// Insert the key/value pair into this bucket if this bucket is empty and has never contained an entry
// OR
// This bucket once contained an entry but there has never been a collision
if ((buckets[bucketNumber].key == null) ||
(buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {

// If we have found an available bucket that has never had a collision, but we've seen an available
// bucket in the past that has the collision bit set, use the previous bucket instead
if (emptySlotNumber != -1) // Reuse slot
bucketNumber = emptySlotNumber;

// We pretty much have to insert in this order. Don't set hash
// code until the value & key are set appropriately.
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
buckets[bucketNumber].key = key;
buckets[bucketNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
// coreclr has the randomized string hashing on by default so we don't need to resize at this point

if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif // FEATURE_RANDOMIZED_STRING_HASHING

return;
}

// The current bucket is in use
// OR
// it is available, but has had the collision bit set and we have already found an available bucket
if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals (buckets[bucketNumber].key, key)) {
if (add) {
throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
}
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif
return;
}

// The current bucket is full, and we have therefore collided. We need to set the collision bit
// UNLESS
// we have remembered an available slot previously.
if (emptySlotNumber == -1) {// We don't need to set the collision bit here since we already have an empty slot
if( buckets[bucketNumber].hash_coll >= 0 ) {
buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
occupancy++;
}
}

bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
} while (++ntry < buckets.Length);

// This code is here if and only if there were no buckets without a collision bit set in the entire table
if (emptySlotNumber != -1)
{
// We pretty much have to insert in this order. Don't set hash
// code until the value & key are set appropriately.
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[emptySlotNumber].val = nvalue;
buckets[emptySlotNumber].key = key;
buckets[emptySlotNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
if(buckets.Length > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif
return;
}
// If you see this assert, make sure load factor & count are reasonable.
// Then verify that our double hash function (h2, described at top of file)
// meets the requirements described above. You should never see this assert.
Contract.Assert(false, "hash table insert failed! Load factor too high, or our double hashing function is incorrect.");
throw new InvalidOperationException(Environment.GetResourceString("InvalidOperation_HashInsertFailed"));
}

首先校驗key值不能爲null ,接下來判斷是否需要對存儲區域進行擴展,判斷代碼如下:
if (count >= loadsize) {
expand();
}
else if(occupancy > loadsize && count > 100) {
rehash();
}
其中 count 表示現存元素個數,在Insert 方法中,每添加一個元素執行一次 Count++ 操作,同理在Remove方法中每去除一個元素執行一次 Count-- 操作。當現存元素個數等於最大容量時,(loadsize 標識着實際可添加的元素個數,loadFactor * newsize loadFactor 最大值爲0.72,可以在構造函數中指定,指定值介於0-1之間,會被乘於0.72 作爲最終的loadFactor 值。由此可見HashTable 的空間利用率不是很高)需要進行擴展數組容量。
擴展容量方法如下:
// Increases the bucket count of this hashtable. This method is called from
// the Insert method when the actual load factor of the hashtable reaches
// the upper limit specified when the hashtable was constructed. The number
// of buckets in the hashtable is increased to the smallest prime number
// that is larger than twice the current number of buckets, and the entries
// in the hashtable are redistributed into the new buckets using the cached
// hashcodes.
private void expand() {
int rawsize = HashHelpers.ExpandPrime(buckets.Length);
rehash(rawsize, false);
}
其中 ExpandPrime 方法決定新申請空間的大小,同HashSet公用該方法,原則上是原有大小的兩倍,但有一個最大限定值 2G elements。
// Returns size of hashtable to grow to.
public static int ExpandPrime(int oldSize)
{
int newSize = 2 * oldSize;

// Allow the hashtables to grow to maximum possible size (~2G elements) before encoutering capacity overflow.
// Note that this check works even when _items.Length overflowed thanks to the (uint) cast
if ((uint)newSize > MaxPrimeArrayLength && MaxPrimeArrayLength > oldSize)
{
Contract.Assert( MaxPrimeArrayLength == GetPrime(MaxPrimeArrayLength), "Invalid MaxPrimeArrayLength");
return MaxPrimeArrayLength;
}

return GetPrime(newSize);
}
其中occupancy 表示賦值衝突的次數,在添加元素時遇到衝突該值做加一操作。當衝突次數大於可最大容納數據個數是並且現有數據個數大於100 時,重新進行一次Hash操作,重排元素,以減少衝突概率,從而提高插入和查詢時的效率。在 rehash 的操作中將 occupancy 清零,從新計算衝突個數。
private uint InitHash(Object key, int hashsize, out uint seed, out uint incr) {
// Hashcode must be positive. Also, we must not use the sign bit, since
// that is used for the collision bit.
uint hashcode = (uint) GetHash(key) & 0x7FFFFFFF;
seed = (uint) hashcode;
// Restriction: incr MUST be between 1 and hashsize - 1, inclusive for
// the modular arithmetic to work correctly. This guarantees you'll
// visit every bucket in the table exactly once within hashsize
// iterations. Violate this and it'll cause obscure bugs forever.
// If you change this calculation for h2(key), update putEntry too!
incr = (uint)(1 + ((seed * HashPrime) % ((uint)hashsize - 1)));
return hashcode;
}
通過如上 InitHash 函數獲得兩個值一個seed 和 incr 其中返回值等於Seed,Seed值用於計算在數組中實際存儲元素對應的位置,代碼如下:
int bucketNumber = (int) (seed % (uint)buckets.Length);
incr 值用於計算當通過Seed計算出的bucketNumber 所指向的位置已存在元素時,從新獲得下一個存儲位置。代碼如下:
bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
因潛在的衝突存在,故在插入方法中使用了 do while 循環查找一個爲空的點進行插入操作,循環體內代碼如下:
do {

// Set emptySlot number to current bucket if it is the first available bucket that we have seen
// that once contained an entry and also has had a collision.
// We need to search this entire collision chain because we have to ensure that there are no
// duplicate entries in the table.
if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))//(((buckets[bucketNumber].hash_coll & unchecked(0x80000000))!=0)))
emptySlotNumber = bucketNumber; //be removed
此處的if語句判斷 bucketNumber 指向的節點是否是被刪除過的節點,emptySlotNumber 在進入循環前被賦值爲-1,當發現當前節點爲被刪除節點,則讓該變量指向這個節點。

// Insert the key/value pair into this bucket if this bucket is empty and has never contained an entry
// OR
// This bucket once contained an entry but there has never been a collision
下面的if 語句判斷如果當前節點爲空節點或者沒有產生衝突的被移除的節點時,將值插入該節點。
if ((buckets[bucketNumber].key == null) ||
(buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {

// If we have found an available bucket that has never had a collision, but we've seen an available
// bucket in the past that has the collision bit set, use the previous bucket instead
if (emptySlotNumber != -1) // Reuse slot
bucketNumber = emptySlotNumber;

// We pretty much have to insert in this order. Don't set hash
// code until the value & key are set appropriately.
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
buckets[bucketNumber].key = key;
buckets[bucketNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
// coreclr has the randomized string hashing on by default so we don't need to resize at this point

if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif // FEATURE_RANDOMIZED_STRING_HASHING

return;
}
下面的if 語句完成對於有衝突節點的插入操作,與沒有衝突節點的插入操作的區別在於不再對於節點的hash_coll 賦值。結合讀取操作中的方法 hash_coll 實際存儲的是key 所得到的hashcode 計算出的實際應存儲位置。
// The current bucket is in use
// OR
// it is available, but has had the collision bit set and we have already found an available bucket
if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals (buckets[bucketNumber].key, key)) {
if (add) {
throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
}
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
ntry 記錄循環執行次數,當循環次數超過哈希衝突極限值 100時,重新進行Hash排序,以減少衝突。
if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif
return;
}

// The current bucket is full, and we have therefore collided. We need to set the collision bit
// UNLESS
// we have remembered an available slot previously.
if (emptySlotNumber == -1) {// We don't need to set the collision bit here since we already have an empty slot
if( buckets[bucketNumber].hash_coll >= 0 ) {
buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
occupancy++;
}
}

bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
} while (++ntry < buckets.Length);

循環體後的代碼本人認爲有些多餘,如有興趣大家可以研究下。

HashTable 線程安全,加鎖的代碼如下:
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章