HashSet<T> 源码解析

支持泛型,基础元素结构:

// One entry of the set's slot array. Entries that fall into the same bucket are
// linked together through 'next'; freed entries are linked the same way into the free list.
internal struct Slot {
internal int hashCode; // Lower 31 bits of hash code, -1 if unused
internal T value; // The stored element (reset to default(T) when the slot is freed)
internal int next; // Index of next entry, -1 if last
}

其 Add 方法如下:

/// <summary>
/// Adds an element to the set by delegating to AddIfNotPresent.
/// </summary>
/// <param name="item">The element to add.</param>
/// <returns>true if the element was added; false if it was already present.</returns>
public bool Add(T item) {
    bool wasAdded = AddIfNotPresent(item);
    return wasAdded;
}

/// <summary>
/// Adds value to the HashSet unless an equal element is already contained.
/// </summary>
/// <param name="value">The element to add.</param>
/// <returns>true if the element was added; false if an equal element was already present.</returns>
private bool AddIfNotPresent(T value) {
// The arrays are created on first use; m_buckets is dereferenced right after,
// so Initialize is expected to allocate it.
if (m_buckets == null) {
Initialize(0);
}

int hashCode = InternalGetHashCode(value);
int bucket = hashCode % m_buckets.Length;
#if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE
// Number of non-matching entries walked in this bucket's chain.
int collisionCount = 0;
#endif
// Buckets store "slot index + 1", so subtracting 1 gives -1 (loop does not run) for an empty bucket.
for (int i = m_buckets[hashCode % m_buckets.Length] - 1; i >= 0; i = m_slots[i].next) {
// Same hash code and comparer equality: the element is already in the set.
if (m_slots[i].hashCode == hashCode && m_comparer.Equals(m_slots[i].value, value)) {
return false;
}
#if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE
collisionCount++;
#endif
}

// Pick the slot that will hold the new element.
int index;
if (m_freeList >= 0) {
// Reuse the most recently freed slot and advance the free-list head.
index = m_freeList;
m_freeList = m_slots[index].next;
}
else {
// No freed slot available: grow first if every slot has been handed out.
if (m_lastIndex == m_slots.Length) {
IncreaseCapacity();
// this will change during resize
bucket = hashCode % m_buckets.Length;
}
index = m_lastIndex;
m_lastIndex++;
}
// Fill the slot and push it onto the front of its bucket's chain.
m_slots[index].hashCode = hashCode;
m_slots[index].value = value;
m_slots[index].next = m_buckets[bucket] - 1;
m_buckets[bucket] = index + 1;
m_count++;
m_version++;

#if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE
// Too many collisions with a well-known comparer: switch to a randomized comparer and
// rebuild at the same size with forceNewHashCodes = true so stored hash codes are recomputed.
if(collisionCount > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(m_comparer)) {
m_comparer = (IEqualityComparer<T>) HashHelpers.GetRandomizedEqualityComparer(m_comparer);
SetCapacity(m_buckets.Length, true);
}
#endif // FEATURE_RANDOMIZED_STRING_HASHING

return true;
}
通过以上方法可得知,在 HashSet 中添加元素时,首先要判断其是否已存在,如已存在则返回 false 并终止添加。HashSet 可添加 null 进入集合。经源码分析可发现,HashSet 的元素 HashCode 生成函数默认情况下取自 EqualityComparer<T>.Default;这和 Hashtable 和 Dictionary 的 HashCode 生成函数默认情况下是相同的,其中 Dictionary 可以在构造方法中自定义比较函数。源码中 HashCode 是 int 类型变量,如遇到超大数据量注意溢出问题。

HashSet 基础数据存储结构如下:
// Head of each bucket's chain, stored as "slot index + 1" so that 0 means an empty bucket.
private int[] m_buckets;
// Entry storage; entries of one bucket are chained through Slot.next.
private Slot[] m_slots;
// Number of elements in the set: incremented by AddIfNotPresent, decremented by Remove.
private int m_count;
// Index of the next slot that has not been handed out yet; reset to 0 when the set becomes empty.
private int m_lastIndex;
// Head of the chain of freed slots (linked through Slot.next); -1 when there is none.
private int m_freeList;
// Comparer used for the equality checks; presumably also backs InternalGetHashCode - confirm.
private IEqualityComparer<T> m_comparer;
// Incremented on every successful add/remove; presumably used to detect modification during enumeration - confirm.
private int m_version;

其添加元素的处理方式如下:
首先通过 m_buckets[hashCode % m_buckets.Length] 取得该桶的首个元素在 m_slots 中的存储位置(存储的值为下标加 1),再沿着 m_slots 元素的 next 指针形成的链表逐个遍历;如果发现链表中某个元素的 hashCode 与待添加元素计算出的 hashCode 相同,并且调用 Equals 方法比较也相同,则返回 false,否则执行添加操作。核心代码如下:
// Walk the chain of the bucket selected by the hash code; bucket entries hold "slot index + 1",
// so an empty bucket yields -1 and the loop body never runs.
for (int i = m_buckets[hashCode % m_buckets.Length] - 1; i >= 0; i = m_slots[i].next) {
    // Equal hash code plus comparer equality means the element is already stored.
    if (m_slots[i].hashCode == hashCode && m_comparer.Equals(m_slots[i].value, value)) {
        return false;
    }
}
执行添加操作处理方式如下:
分两种情况进行处理,核心代码如下:
// Choose the slot that will receive the new element.
int index;
// Case 1: a previously freed slot exists - reuse it and advance the free-list head.
if (m_freeList >= 0) {
index = m_freeList;
m_freeList = m_slots[index].next;
}
// Case 2: no freed slot - take the next unused slot, growing the arrays first if they are full.
else {
if (m_lastIndex == m_slots.Length) {
IncreaseCapacity();
// this will change during resize
bucket = hashCode % m_buckets.Length;
}
index = m_lastIndex;
m_lastIndex++;
}
如上代码,m_freeList 指针指向数组中下一个空闲的元素位置,当其为 -1 时代表已没有空余元素可供赋值使用。结合 Remove 方法得出的结论是:当有元素被删除时,m_freeList 指向最后一个被删除的元素;被删除元素实际上是 value 字段被置为该类型的缺省值,next 指向下一个被删除元素。也可以这样理解:所有被删除的元素实际上形成一个以 m_freeList 为头指针的链表,移除一个元素即在该链表头处添加一个元素,这样以上的插入逻辑就很好理解了。第一种情况 m_freeList >= 0,证明数组中 m_lastIndex 之前的元素中有被删除的空元素存在,优先给这些元素赋值。第二种情况 m_freeList = -1,即之前被删除的元素已全部被重新赋值完毕,此时需要判断 m_lastIndex(该值指向最后一个已用元素的下一个位置):当其值等于数组长度时,代表数组空间已被使用完毕,需要重新分配空间,故需要调用 IncreaseCapacity() 函数重新申请空间。重新申请的空间个数由如下的 ExpandPrime 方法确定,它会根据现有元素个数来决定重新申请的空间大小,从而避免多次重复申请空间造成的性能问题。重新分配空间采用数组复制的方式进行,具体分配操作由 SetCapacity 方法执行,有兴趣的可参看下面的方法。第二种情况下,如果 m_lastIndex 仍指向可用的元素,也即已分配的空间未被使用完,则直接对其进行赋值。赋值操作比较简单,不再赘述。

相关方法如下:
/// <summary>
/// Removes an element from the set.
/// </summary>
/// <param name="item">The element to remove.</param>
/// <returns>true if the element was found and removed; false if the set has no buckets yet or the element is absent.</returns>
public bool Remove(T item) {
    // Nothing has ever been stored.
    if (m_buckets == null) {
        return false;
    }

    int hashCode = InternalGetHashCode(item);
    int bucket = hashCode % m_buckets.Length;

    // Walk the bucket's chain, remembering the predecessor so the match can be unlinked.
    int previous = -1;
    int current = m_buckets[bucket] - 1;
    while (current >= 0) {
        bool isMatch = m_slots[current].hashCode == hashCode
            && m_comparer.Equals(m_slots[current].value, item);
        if (!isMatch) {
            previous = current;
            current = m_slots[current].next;
            continue;
        }

        if (previous >= 0) {
            // Match is in the middle or at the end of the chain: bypass it.
            m_slots[previous].next = m_slots[current].next;
        }
        else {
            // Match is the head of the chain: the bucket now starts at its successor.
            m_buckets[bucket] = m_slots[current].next + 1;
        }

        // Mark the slot as unused and link it in front of the current free list.
        m_slots[current].hashCode = -1;
        m_slots[current].value = default(T);
        m_slots[current].next = m_freeList;

        m_count--;
        m_version++;
        if (m_count != 0) {
            m_freeList = current;
        }
        else {
            // The set is empty again: start handing out slots from the beginning.
            m_lastIndex = 0;
            m_freeList = -1;
        }
        return true;
    }

    // Reached the end of the chain without a match.
    return false;
}

// Returns the size a hashtable of oldSize should grow to: the prime chosen by GetPrime
// for twice the old size, capped at MaxPrimeArrayLength.
public static int ExpandPrime(int oldSize)
{
    int doubled = 2 * oldSize;

    // Common path: the doubled size is within the cap, or the table is already at or
    // beyond the cap. The (uint) cast makes a doubled value that overflowed to a
    // negative int compare as a very large number, so it is not mistaken for "within the cap".
    bool withinCap = (uint)doubled <= MaxPrimeArrayLength;
    if (withinCap || MaxPrimeArrayLength <= oldSize)
    {
        return GetPrime(doubled);
    }

    // Doubling would pass the cap while the table is still below it: grow to the cap.
    Contract.Assert(MaxPrimeArrayLength == GetPrime(MaxPrimeArrayLength), "Invalid MaxPrimeArrayLength");
    return MaxPrimeArrayLength;
}

/// <summary>
/// Replaces the slot and bucket arrays with new arrays of the given size and rebuilds
/// every bucket chain from the copied slots.
/// </summary>
/// <param name="newSize">Length of the new arrays; asserted to be prime.</param>
/// <param name="forceNewHashCodes">
/// When true, the hash code of every in-use slot is recomputed with InternalGetHashCode
/// before the buckets are rebuilt.
/// </param>
private void SetCapacity(int newSize, bool forceNewHashCodes) {
    Contract.Assert(HashHelpers.IsPrime(newSize), "New size is not prime!");

    Contract.Assert(m_buckets != null, "SetCapacity called on a set with no elements");

    // Slots keep their indices, so only the range handed out so far needs copying.
    Slot[] newSlots = new Slot[newSize];
    if (m_slots != null) {
        Array.Copy(m_slots, 0, newSlots, 0, m_lastIndex);
    }

    if (forceNewHashCodes) {
        for (int i = 0; i < m_lastIndex; i++) {
            // -1 marks a freed slot; it holds no element to hash.
            if (newSlots[i].hashCode != -1) {
                newSlots[i].hashCode = InternalGetHashCode(newSlots[i].value);
            }
        }
    }

    int[] newBuckets = new int[newSize];
    for (int i = 0; i < m_lastIndex; i++) {
        // Freed slots (negative hash code) belong to the free list, not to a bucket.
        // Skipping them avoids indexing newBuckets with -1 (-1 % newSize is -1) and
        // keeps their 'next' links, which m_freeList still relies on.
        if (newSlots[i].hashCode < 0) {
            continue;
        }

        // Push the slot onto the front of its new bucket's chain ("index + 1" encoding).
        int bucket = newSlots[i].hashCode % newSize;
        newSlots[i].next = newBuckets[bucket] - 1;
        newBuckets[bucket] = i + 1;
    }
    m_slots = newSlots;
    m_buckets = newBuckets;
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章