HashSet<T> 源碼解析

支持泛型,基礎元素結構:

/// <summary>
/// One storage cell of the set: the stored element, its cached hash code, and the
/// index of the next cell in the same chain.
/// </summary>
internal struct Slot {
internal int hashCode; // Lower 31 bits of hash code, -1 if unused
internal T value; // The element held by this slot
internal int next; // Index of next entry, -1 if last
}

其 Add 方法如下:

/// <summary>
/// Adds the given item to the set by delegating to AddIfNotPresent.
/// </summary>
/// <param name="item">element to add</param>
/// <returns>true if the item was added; false if an equal element was already present</returns>
public bool Add(T item) {
    bool wasAdded = AddIfNotPresent(item);
    return wasAdded;
}

/// <summary>
/// Inserts value into the set unless an equal element is already stored.
/// </summary>
/// <param name="value">element to insert</param>
/// <returns>true when the element was inserted; false when an equal element was found</returns>
private bool AddIfNotPresent(T value) {
    // m_buckets is still null before the first insertion; Initialize(0) is called to set
    // the set up (presumably allocating the backing arrays - its body is not shown here).
    if (m_buckets == null) {
        Initialize(0);
    }

    int hashCode = InternalGetHashCode(value);
    int bucket = hashCode % m_buckets.Length;
#if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE
    int collisionCount = 0;
#endif
    // Buckets store (slot index + 1), so subtracting 1 yields -1 for an empty bucket.
    int probe = m_buckets[bucket] - 1;
    while (probe >= 0) {
        bool sameHash = m_slots[probe].hashCode == hashCode;
        if (sameHash && m_comparer.Equals(m_slots[probe].value, value)) {
            // An equal element is already stored: nothing is modified.
            return false;
        }
#if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE
        collisionCount++;
#endif
        probe = m_slots[probe].next;
    }

    int slotIndex;
    if (m_freeList < 0) {
        // No previously freed slot to reuse: take the next never-used slot,
        // growing the arrays first when all of them have been handed out.
        if (m_lastIndex == m_slots.Length) {
            IncreaseCapacity();
            // The bucket count changed during the resize, so the bucket must be recomputed.
            bucket = hashCode % m_buckets.Length;
        }
        slotIndex = m_lastIndex++;
    }
    else {
        // Reuse the slot at the head of the free list and advance the head.
        slotIndex = m_freeList;
        m_freeList = m_slots[slotIndex].next;
    }

    // Fill the slot and make it the new head of its bucket's chain.
    m_slots[slotIndex].hashCode = hashCode;
    m_slots[slotIndex].value = value;
    m_slots[slotIndex].next = m_buckets[bucket] - 1;
    m_buckets[bucket] = slotIndex + 1;
    m_count += 1;
    m_version += 1;

#if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE
    // Too many entries were walked in this chain: switch to a randomized comparer
    // (only for well-known comparers) and rehash every element at the same capacity.
    if (collisionCount > HashHelpers.HashCollisionThreshold) {
        if (HashHelpers.IsWellKnownEqualityComparer(m_comparer)) {
            m_comparer = (IEqualityComparer<T>) HashHelpers.GetRandomizedEqualityComparer(m_comparer);
            SetCapacity(m_buckets.Length, true);
        }
    }
#endif // FEATURE_RANDOMIZED_STRING_HASHING

    return true;
}
通過以上方法可得知,在 HashSet 中添加元素時,首先要判斷該元素是否已存在,如已存在則返回 false 並終止添加。HashSet 可以將 null 添加進集合。經源碼分析可發現,HashSet 的元素 HashCode 生成函數默認情況下取自 EqualityComparer<T>.Default;這和 Hashtable、Dictionary 的 HashCode 生成函數默認情況下是相同的,其中 Dictionary 可以在構造方法中自定義比較函數。源碼中 hashCode 是 int 類型變量(Slot 中只保存其低 31 位),如遇到超大數據量需注意溢出問題。

HashSet 基礎數據存儲結構如下:
// Bucket table: each entry holds (index into m_slots) + 1 for the first slot of its chain;
// 0 therefore means the bucket is empty.
private int[] m_buckets;
// Element storage; slots belonging to the same bucket are linked through Slot.next.
private Slot[] m_slots;
// Number of elements currently stored (incremented on a successful add, decremented on remove).
private int m_count;
// Index of the next never-used slot in m_slots; reset to 0 when the set becomes empty.
private int m_lastIndex;
// Index of the most recently freed slot (freed slots are linked through Slot.next);
// a negative value means there is no freed slot to reuse.
private int m_freeList;
// Comparer used to test elements for equality.
private IEqualityComparer<T> m_comparer;
// Incremented on every successful add or remove.
private int m_version;

其添加元素的處理方式如下:
首先通過 m_buckets[hashCode % m_buckets.Length] 取得該桶在 m_slots 中的起始存儲位置(桶中保存的是索引加 1),再遍歷由 slot 的 next 指針形成的鏈表;如果鏈表中某個元素的 hashCode 與待添加元素計算出的 hashCode 相同,並且調用 Equals 方法比較也相同,則返回 false,否則執行添加操作。核心代碼如下:
// Walk the chain of the bucket that hashCode maps to; buckets store (slot index + 1),
// so an empty bucket yields -1 and the loop body never runs.
for (int i = m_buckets[hashCode % m_buckets.Length] - 1; i >= 0; i = m_slots[i].next) {
    // Same cached hash code and comparer-equal: the element is already present.
    if (m_slots[i].hashCode == hashCode && m_comparer.Equals(m_slots[i].value, value)) {
        return false;
    }
}
執行添加操作處理方式如下:
分兩種情況進行處理,核心代碼如下:
// Choose the slot that will receive the new element.
int index;
if (m_freeList >= 0) {
// A previously freed slot exists: reuse the one at the head of the free list
// and move the head to the next freed slot.
index = m_freeList;
m_freeList = m_slots[index].next;
}
else {
if (m_lastIndex == m_slots.Length) {
// Every slot has already been handed out: grow before taking a new one.
IncreaseCapacity();
// this will change during resize
bucket = hashCode % m_buckets.Length;
}
// Take the next never-used slot.
index = m_lastIndex;
m_lastIndex++;
}
如上代碼,m_freeList 指針指向數組中下一個空閒的元素位置,當爲 -1 時代表已沒有空餘元素可供賦值使用。結合 Remove 方法可得出結論:當有元素被刪除時,m_freeList 指向最後一個被刪除的元素;被刪除元素實際上是 value 屬性被置爲該類型的缺省值,next 指向鏈表中的下一個被刪除元素。也可以這樣理解:所有被刪除的元素實際上形成一個以 m_freeList 爲頭指針的鏈表,移除一個元素即在該鏈表頭處添加一個元素。同理,以上的插入邏輯就很好理解了。第一種情況 m_freeList >= 0,證明數組中 m_lastIndex 之前的元素中有被刪除的空元素存在,優先給這些元素賦值。第二種情況 m_freeList = -1,即之前被刪除的元素已全部被重新賦值完畢,此時需要判斷 m_lastIndex(該值指向最後一個被用元素的下一個):當其值等於數組長度時,代表數組空間已被使用完畢,需要調用 IncreaseCapacity() 函數重新申請空間。重新申請的空間個數由如下的 ExpandPrime 方法確定,會根據現有元素個數來決定重新申請的空間大小,從而避免多次重複申請空間造成的性能問題。重新分配空間採用數組複製的方式進行,具體分配操作由 SetCapacity 方法執行,有興趣的可參看下面的方法。第二種情況下,如 m_lastIndex 仍指向可用的元素,也即分配的空間未被使用完,則直接對其進行賦值。賦值操作比較簡單,不再贅述。

相關方法如下:
/// <summary>
/// Removes item from the set.
/// </summary>
/// <param name="item">element to remove</param>
/// <returns>true if an equal element was found and removed; false if the set has no
/// buckets yet or no equal element was found</returns>
public bool Remove(T item) {
    // No bucket array yet: there is nothing to search.
    if (m_buckets == null) {
        return false;
    }

    int hashCode = InternalGetHashCode(item);
    int bucket = hashCode % m_buckets.Length;
    int previous = -1;
    int current = m_buckets[bucket] - 1;

    while (current >= 0) {
        bool isMatch = m_slots[current].hashCode == hashCode
            && m_comparer.Equals(m_slots[current].value, item);
        if (!isMatch) {
            previous = current;
            current = m_slots[current].next;
            continue;
        }

        // Unlink the slot from its bucket chain.
        if (previous >= 0) {
            // Not the chain head: bypass it through the predecessor's 'next' pointer.
            m_slots[previous].next = m_slots[current].next;
        }
        else {
            // Chain head: the bucket now starts at the following slot (stored as index + 1).
            m_buckets[bucket] = m_slots[current].next + 1;
        }

        // Mark the slot unused and thread it onto the free list.
        m_slots[current].hashCode = -1;
        m_slots[current].value = default(T);
        m_slots[current].next = m_freeList;

        m_count--;
        m_version++;
        if (m_count != 0) {
            m_freeList = current;
        }
        else {
            // The set is empty again: restart slot allocation from index 0 with no free list.
            m_lastIndex = 0;
            m_freeList = -1;
        }
        return true;
    }

    // No equal element was found in the bucket's chain.
    return false;
}

// Computes the size a hashtable of oldSize should grow to: the value GetPrime returns
// for twice the old size, capped at MaxPrimeArrayLength.
public static int ExpandPrime(int oldSize)
{
    int doubled = 2 * oldSize;

    // Let hashtables grow up to the maximum possible size (~2G elements, per the original
    // note) before hitting capacity overflow. The (uint) cast keeps this comparison
    // meaningful even when doubling wrapped around to a negative int.
    bool exceedsLimit = (uint)doubled > MaxPrimeArrayLength;
    if (!(exceedsLimit && MaxPrimeArrayLength > oldSize))
    {
        return GetPrime(doubled);
    }

    Contract.Assert( MaxPrimeArrayLength == GetPrime(MaxPrimeArrayLength), "Invalid MaxPrimeArrayLength");
    return MaxPrimeArrayLength;
}

/// <summary>
/// Replaces the slot and bucket arrays with new ones of the given size, copying the
/// first m_lastIndex slots and re-linking every live element into its new bucket chain.
/// </summary>
/// <param name="newSize">length of the new arrays; asserted to be prime</param>
/// <param name="forceNewHashCodes">when true, the hash code of every live slot is
/// recomputed with InternalGetHashCode before the elements are re-bucketed</param>
private void SetCapacity(int newSize, bool forceNewHashCodes) {
    Contract.Assert(HashHelpers.IsPrime(newSize), "New size is not prime!");

    Contract.Assert(m_buckets != null, "SetCapacity called on a set with no elements");

    Slot[] newSlots = new Slot[newSize];
    if (m_slots != null) {
        Array.Copy(m_slots, 0, newSlots, 0, m_lastIndex);
    }

    if (forceNewHashCodes) {
        for (int i = 0; i < m_lastIndex; i++) {
            // Freed slots are marked with hashCode == -1 and hold no element to rehash.
            if (newSlots[i].hashCode != -1) {
                newSlots[i].hashCode = InternalGetHashCode(newSlots[i].value);
            }
        }
    }

    int[] newBuckets = new int[newSize];
    for (int i = 0; i < m_lastIndex; i++) {
        // Skip freed slots (hashCode == -1, as set by Remove). In C# -1 % newSize is -1,
        // which would index newBuckets out of range, and rewriting 'next' would break the
        // free list those slots are threaded on. Their copied 'next' links stay valid
        // because slot indices are unchanged by the copy.
        if (newSlots[i].hashCode >= 0) {
            int bucket = newSlots[i].hashCode % newSize;
            // Push the slot onto the front of its bucket chain (buckets store index + 1).
            newSlots[i].next = newBuckets[bucket] - 1;
            newBuckets[bucket] = i + 1;
        }
    }
    m_slots = newSlots;
    m_buckets = newBuckets;
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章