HashTable 源码分析

基础元素结构如下:
// The hash table data.
// This cannot be serialised
// One slot of the hash table's bucket array.
private struct bucket {
// Key stored in this slot. Insert treats a null key as "never held an entry" and a key
// that references the buckets array itself as "held an entry that was later removed".
public Object key;
// Value associated with key.
public Object val;
// Low 31 bits: the key's hash code (Insert compares hash_coll & 0x7FFFFFFF against it).
// Sign bit: collision flag, set when a probe had to step past this slot.
public int hash_coll; // Store hash code; sign bit means there was a collision.
}

// Backing array of slots; its Length is the modulus used when probing in Insert.
private bucket[] buckets;
// The total number of entries in the hash table.
private int count;
// The total number of collision bits set in the hashtable
private int occupancy;
// Entry-count threshold: Insert calls expand() once count >= loadsize.
private int loadsize;
// NOTE(review): presumably the ratio used to derive loadsize from the bucket count;
// the computation is not shown here - confirm against the constructor/rehash.
private float loadFactor;
// NOTE(review): presumably bumped by UpdateVersion() on every write - confirm.
private volatile int version;
// Set to true immediately before Insert mutates a bucket and back to false right after.
private volatile bool isWriterInProgress;
// NOTE(review): presumably lazily created key/value collection views - not used in the code shown.
private ICollection keys;
private ICollection values;

// Key comparer; Insert may swap it for a randomized comparer when a probe chain gets too long.
private IEqualityComparer _keycomparer;
// NOTE(review): presumably the object handed out for external locking - not used in the code shown.
private Object _syncRoot;
首先分析它的添加元素的方法:
// Stores a new key/value pair in this hashtable by delegating to Insert in
// "add" mode, so an existing entry is never overwritten.
//
// Insert throws ArgumentNullException when key is null and ArgumentException
// when an equal key is already present in the hashtable.
//
public virtual void Add(Object key, Object value) {
    Insert(key, value, add: true);
}
// Inserts an entry into this hashtable. This method is called from the Set
// and Add methods. If the add parameter is true and the given key already
// exists in the hashtable, an exception is thrown.
//
// Parameters:
//   key    - key to insert; must not be null.
//   nvalue - value to store for the key.
//   add    - true: throw if an equal key already exists; false: overwrite its value.
//
// Throws:
//   ArgumentNullException     - key is null.
//   ArgumentException         - add is true and an equal key is already present.
//   InvalidOperationException - no usable bucket was found after buckets.Length probes.
//
// Bucket states as tested below:
//   key == null    : the bucket has never contained an entry.
//   key == buckets : the bucket once contained an entry (slot left behind by remove).
//   hash_coll < 0  : the collision bit (sign bit) is set on the bucket.
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.MayFail)]
private void Insert (Object key, Object nvalue, bool add) {
// Reject null keys before touching any state.


if (key == null) {
throw new ArgumentNullException("key", Environment.GetResourceString("ArgumentNull_Key"));
}

Contract.EndContractBlock();
// Grow when the entry count has reached the load threshold; otherwise, if too many
// buckets carry the collision bit (and the table holds more than 100 entries), rehash.
if (count >= loadsize) {
expand();
}
else if(occupancy > loadsize && count > 100) {
rehash();
}
uint seed;
uint incr;
// Assume we only have one thread writing concurrently. Modify
// buckets to contain new data, as long as we insert in the right order.
uint hashcode = InitHash(key, buckets.Length, out seed, out incr);
// Number of probes performed so far.
int ntry = 0;
int emptySlotNumber = -1; // We use the empty slot number to cache the first empty slot. We chose to reuse slots
// created by remove that have the collision bit set over using up new slots.
// First probe position; later positions advance by incr modulo the table length.
int bucketNumber = (int) (seed % (uint)buckets.Length);
do {

// Set emptySlot number to current bucket if it is the first available bucket that we have seen
// that once contained an entry and also has had a collision.
// We need to search this entire collision chain because we have to ensure that there are no
// duplicate entries in the table.
if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))//(((buckets[bucketNumber].hash_coll & unchecked(0x80000000))!=0)))
emptySlotNumber = bucketNumber;

// Insert the key/value pair into this bucket if this bucket is empty and has never contained an entry
// OR
// This bucket once contained an entry but there has never been a collision
if ((buckets[bucketNumber].key == null) ||
(buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {

// If we have found an available bucket that has never had a collision, but we've seen an available
// bucket in the past that has the collision bit set, use the previous bucket instead
if (emptySlotNumber != -1) // Reuse slot
bucketNumber = emptySlotNumber;

// We pretty much have to insert in this order. Don't set hash
// code until the value & key are set appropriately.
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
// isWriterInProgress brackets the mutation; hash_coll is OR-ed so an existing
// collision bit on a reused slot is preserved.
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
buckets[bucketNumber].key = key;
buckets[bucketNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
// coreclr has the randomized string hashing on by default so we don't need to resize at this point

// A probe chain longer than the collision threshold with a well-known comparer
// triggers a switch to a randomized comparer and a rehash at the same size.
if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif // FEATURE_RANDOMIZED_STRING_HASHING

return;
}

// The current bucket is in use
// OR
// it is available, but has had the collision bit set and we have already found an available bucket
// Same hash code (collision bit masked off) and equal key: the key already exists.
if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals (buckets[bucketNumber].key, key)) {
if (add) {
throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
}
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
// Overwrite path: only the value changes; key, hash_coll and count are left alone.
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif
return;
}

// The current bucket is full, and we have therefore collided. We need to set the collision bit
// UNLESS
// we have remembered an available slot previously.
if (emptySlotNumber == -1) {// We don't need to set the collision bit here since we already have an empty slot
// occupancy counts buckets with the collision bit, so only count the first time it is set.
if( buckets[bucketNumber].hash_coll >= 0 ) {
buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
occupancy++;
}
}

// Step to the next probe position; the long cast keeps the addition from overflowing.
bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
} while (++ntry < buckets.Length);

// This code is here if and only if there were no buckets without a collision bit set in the entire table
if (emptySlotNumber != -1)
{
// We pretty much have to insert in this order. Don't set hash
// code until the value & key are set appropriately.
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[emptySlotNumber].val = nvalue;
buckets[emptySlotNumber].key = key;
buckets[emptySlotNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
// The loop above ran to completion here, so the table length is compared with the threshold instead of ntry.
if(buckets.Length > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif
return;
}
// If you see this assert, make sure load factor & count are reasonable.
// Then verify that our double hash function (h2, described at top of file)
// meets the requirements described above. You should never see this assert.
Contract.Assert(false, "hash table insert failed! Load factor too high, or our double hashing function is incorrect.");
throw new InvalidOperationException(Environment.GetResourceString("InvalidOperation_HashInsertFailed"));
}

首先校验key值不能为null ,接下来判断是否需要对存储区域进行扩展,判断代码如下:
// Entry count has reached the load threshold: grow the bucket array.
if (count >= loadsize) {
expand();
}
// Otherwise, if more buckets carry the collision bit than loadsize allows
// (and the table holds more than 100 entries), rebuild the table to clear them.
else if(occupancy > loadsize && count > 100) {
rehash();
}
其中 count 表示现存元素个数:在 Insert 方法中,每添加一个元素执行一次 count++ 操作;同理,在 Remove 方法中每移除一个元素执行一次 count-- 操作。当现存元素个数达到 loadsize(即 count >= loadsize)时,需要扩展数组容量。loadsize 表示实际可容纳的元素个数,其值为 loadFactor * 桶数组长度;loadFactor 可以在构造函数中指定,指定值介于 0.1 和 1.0 之间,并会被乘以 0.72 作为最终的 loadFactor 值,因此其最大值为 0.72。由此可见 HashTable 的空间利用率不是很高。
扩展容量方法如下:
// Grows the bucket array of this hashtable. Insert invokes this once the
// entry count has reached loadsize. The target bucket count comes from
// HashHelpers.ExpandPrime applied to the current array length, and rehash
// is then asked to rebuild the table at that size. The second argument is
// passed as false - presumably "reuse the cached hash codes"; confirm
// against rehash.
private void expand() {
    rehash(HashHelpers.ExpandPrime(buckets.Length), false);
}
其中 ExpandPrime 方法决定新申请空间的大小(与 HashSet 共用该方法):原则上是原有大小的两倍,再经 GetPrime 取素数,但有一个最大限定值 MaxPrimeArrayLength(约 2G 个元素)。
// Computes the bucket count a hashtable should grow to from oldSize.
// Normally this is GetPrime(2 * oldSize); when doubling would exceed
// MaxPrimeArrayLength and oldSize is still below that cap, the cap itself
// is returned so the table can grow to the maximum possible size
// (~2G elements) before encountering capacity overflow.
public static int ExpandPrime(int oldSize)
{
    int doubled = 2 * oldSize;

    // The (uint) cast makes the comparison work even when 2 * oldSize has
    // overflowed into a negative int: it then reads as a very large value.
    bool pastCap = (uint)doubled > MaxPrimeArrayLength;
    if (!pastCap || MaxPrimeArrayLength <= oldSize)
    {
        return GetPrime(doubled);
    }

    Contract.Assert( MaxPrimeArrayLength == GetPrime(MaxPrimeArrayLength), "Invalid MaxPrimeArrayLength");
    return MaxPrimeArrayLength;
}
其中 occupancy 表示已设置冲突位的桶的个数:在添加元素时遇到冲突,且该桶尚未设置冲突位时,该值做加一操作。当该值大于最大可容纳数据个数(loadsize)并且现有数据个数大于 100 时,重新进行一次 Hash 操作,重排元素,以减少冲突概率,从而提高插入和查询时的效率。在 rehash 的操作中将 occupancy 清零,重新计算冲突个数。
// Computes the probing parameters for key in a table of hashsize buckets.
//
//   seed   - receives the non-negative hash code; the caller reduces it
//            modulo the table length to get the first bucket to probe.
//   incr   - receives the step added to the bucket index after each collision.
//   return - the same non-negative hash code that was written to seed.
private uint InitHash(Object key, int hashsize, out uint seed, out uint incr) {
    // Hashcode must be positive. The sign bit is masked off because it is
    // reserved for the collision bit.
    uint positiveHash = (uint) GetHash(key) & 0x7FFFFFFF;
    seed = positiveHash;

    // Restriction: incr MUST be between 1 and hashsize - 1, inclusive, for
    // the modular arithmetic to work correctly. This guarantees every bucket
    // in the table is visited exactly once within hashsize iterations.
    // Violate this and it'll cause obscure bugs forever.
    // If you change this calculation for h2(key), update putEntry too!
    uint stepModulus = (uint)hashsize - 1;
    incr = (uint)(1 + ((seed * HashPrime) % stepModulus));
    return positiveHash;
}
通过如上 InitHash 函数获得两个值:seed 和 incr,其中返回值等于 seed。seed 值用于计算元素在数组中实际存储的位置,代码如下:
int bucketNumber = (int) (seed % (uint)buckets.Length);
incr 值用于在通过 seed 计算出的 bucketNumber 所指向的位置已存在元素时,重新计算下一个存储位置。代码如下:
bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
因潜在的冲突存在,故在插入方法中使用了 do while 循环查找一个为空的点进行插入操作,循环体内代码如下:
do {

// Set emptySlot number to current bucket if it is the first available bucket that we have seen
// that once contained an entry and also has had a collision.
// We need to search this entire collision chain because we have to ensure that there are no
// duplicate entries in the table.
if (emptySlotNumber == -1 && (buckets[bucketNumber].key == buckets) && (buckets[bucketNumber].hash_coll < 0))//(((buckets[bucketNumber].hash_coll & unchecked(0x80000000))!=0)))
emptySlotNumber = bucketNumber; //be removed
此处的 if 语句判断 bucketNumber 指向的节点是否是被删除过、并且设置了冲突位的节点(被删除节点的 key 指向 buckets 数组自身)。emptySlotNumber 在进入循环前被赋值为 -1,当发现当前节点为这样的节点时,则让该变量记录这个节点的位置,之后不再更新。

// Insert the key/value pair into this bucket if this bucket is empty and has never contained an entry
// OR
// This bucket once contained an entry but there has never been a collision
下面的if 语句判断如果当前节点为空节点或者没有产生冲突的被移除的节点时,将值插入该节点。
if ((buckets[bucketNumber].key == null) ||
(buckets[bucketNumber].key == buckets && ((buckets[bucketNumber].hash_coll & unchecked(0x80000000))==0))) {

// If we have found an available bucket that has never had a collision, but we've seen an available
// bucket in the past that has the collision bit set, use the previous bucket instead
if (emptySlotNumber != -1) // Reuse slot
bucketNumber = emptySlotNumber;

// We pretty much have to insert in this order. Don't set hash
// code until the value & key are set appropriately.
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
buckets[bucketNumber].key = key;
buckets[bucketNumber].hash_coll |= (int) hashcode;
count++;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
// coreclr has the randomized string hashing on by default so we don't need to resize at this point

if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif // FEATURE_RANDOMIZED_STRING_HASHING

return;
}
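下面的 if 语句处理 key 已经存在的情况:当前节点保存的 hashcode(hash_coll 的低 31 位)与待插入 key 的 hashcode 相同,并且 key 相等时,若是 Add 操作(add 为 true)则抛出 ArgumentException,否则只更新该节点的 val,不再对节点的 key 和 hash_coll 赋值。由此可见,hash_coll 的低 31 位存储的是 key 的 hashcode 本身,最高位(符号位)是冲突标志位;实际存储位置是由 hashcode 对数组长度取模得到的。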
// The current bucket is in use
// OR
// it is available, but has had the collision bit set and we have already found an available bucket
if (((buckets[bucketNumber].hash_coll & 0x7FFFFFFF) == hashcode) &&
KeyEquals (buckets[bucketNumber].key, key)) {
if (add) {
throw new ArgumentException(Environment.GetResourceString("Argument_AddingDuplicate__", buckets[bucketNumber].key, key));
}
#if !FEATURE_CORECLR
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();
#endif

#if FEATURE_RANDOMIZED_STRING_HASHING
#if !FEATURE_CORECLR
ntry 记录循环(探测)执行次数,当探测次数超过哈希冲突阈值 HashHelpers.HashCollisionThreshold(100)时,改用随机化的相等比较器并重新进行 Hash 重排,以减少冲突。
if(ntry > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(_keycomparer))
{
// PERF: We don't want to rehash if _keycomparer is already a RandomizedObjectEqualityComparer since in some
// cases there may not be any strings in the hashtable and we wouldn't get any mixing.
if(_keycomparer == null || !(_keycomparer is System.Collections.Generic.RandomizedObjectEqualityComparer))
{
_keycomparer = HashHelpers.GetRandomizedEqualityComparer(_keycomparer);
rehash(buckets.Length, true);
}
}
#endif // !FEATURE_CORECLR
#endif
return;
}

// The current bucket is full, and we have therefore collided. We need to set the collision bit
// UNLESS
// we have remembered an available slot previously.
if (emptySlotNumber == -1) {// We don't need to set the collision bit here since we already have an empty slot
if( buckets[bucketNumber].hash_coll >= 0 ) {
buckets[bucketNumber].hash_coll |= unchecked((int)0x80000000);
occupancy++;
}
}

bucketNumber = (int) (((long)bucketNumber + incr)% (uint)buckets.Length);
} while (++ntry < buckets.Length);

循环体后的代码本人认为有些多余,如有兴趣大家可以研究下。

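HashTable 支持多个读线程与单个写线程并发访问;多个写线程同时写入时,仍需通过 Hashtable.Synchronized 返回的包装器等方式进行同步。写入时并没有加锁,而是用 isWriterInProgress 标志和 version 版本号标记写入过程;Thread.BeginCriticalRegion / EndCriticalRegion 只是通知宿主当前线程进入/离开关键区域,并不是锁。相关代码如下: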
Thread.BeginCriticalRegion();
#endif
isWriterInProgress = true;
buckets[bucketNumber].val = nvalue;
UpdateVersion();
isWriterInProgress = false;
#if !FEATURE_CORECLR
Thread.EndCriticalRegion();

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章