引入哈希桶的概念來實現一個哈希表
學習一下國外大牛的思路
- Chapter: 散列表(哈希表)
-
- 1. 散列表(哈希表)的定義
- 2. 散列表是怎麼進行查找的?
- 3. 散列函數設計:直接定址法
- 4. 散列函數設計:除留餘數法
- 5. 散列衝突處理:開放定址法
- 6. 散列衝突處理:鏈地址法
- 7. 哈希表的鏈地址法實現
- 8. Linux內核中的hash與bucket
- 9. 引入哈希桶的概念來實現一個哈希表
- 10. 魔獸文件打包器裏的傳奇哈希表
- 11. DJBX33A APR哈希默認算法
- 12. Times33算法與最快的Hash表
- 13. 撥雲見日,閒聊哈希表
前面講述了如何用鏈地址法實現一個哈希表,那麼今天來分析一下另一種解決哈希衝突的做法,即爲每個Hash值建立一個Hash桶(Bucket),桶的容量是固定的,也就是只能處理固定次數的衝突,如1048576個Hash桶,每個桶中有4個表項(Entry),總計4M個表項。其實這兩種的實現思路雷同,就是對Hash表中每個Hash值建立一個衝突表,即將衝突的幾個記錄以表的形式存儲在其中。
大致的思路是這樣的:
首先哈希桶的個數是固定的,由用戶在構建時輸入,一旦構建,個數就已經固定;查找的時候首先將key值通過哈希函數獲取哈希值,根據哈希值獲取到對應的哈希桶,然後遍歷哈希桶內的pairs數組,找到與key匹配的表項並取出對應的值。
主要的數據結構:
/* One key/value association; both members point to C strings. */
struct Pair {
    char *key;
    char *value;
};

/* One hash bucket: an array of the pairs whose keys hash to this slot. */
struct Bucket {
    unsigned int count;     /* number of entries in pairs */
    struct Pair *pairs;
};

/* The string map itself: an array of buckets. */
struct StrMap {
    unsigned int count;     /* number of buckets */
    struct Bucket *buckets;
};
strmap.h
#ifndef STRMAP_H
#define STRMAP_H

/* System headers are included outside the extern "C" region. */
#include <stdlib.h>
#include <string.h>

#ifdef __cplusplus
extern "C"
{
#endif

/* Opaque string map type; its layout is private to the implementation. */
typedef struct StrMap StrMap;

/*
 * This callback function is called once per key-value when iterating over
 * all keys associated to values.
 *
 * Parameters:
 *
 * key: A pointer to a null-terminated C string. The string must not
 * be modified by the client.
 *
 * value: A pointer to a null-terminated C string. The string must
 * not be modified by the client.
 *
 * obj: A pointer to a client-specific object. This parameter may be
 * null.
 *
 * Return value: None.
 */
typedef void (*sm_enum_func)(const char *key, const char *value, const void *obj);

/*
 * Creates a string map.
 *
 * Parameters:
 *
 * capacity: The number of top-level slots this string map
 * should allocate. This parameter must be > 0.
 *
 * Return value: A pointer to a string map object,
 * or null if a new string map could not be allocated.
 */
StrMap *sm_new(unsigned int capacity);

/*
 * Releases all memory held by a string map object.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 * If the supplied string map has been previously released, the
 * behaviour of this function is undefined.
 *
 * Return value: None.
 */
void sm_delete(StrMap *map);

/*
 * Returns the value associated with the supplied key.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * key: A pointer to a null-terminated C string. This parameter cannot
 * be null.
 *
 * out_buf: A pointer to an output buffer which will contain the value,
 * if it exists and fits into the buffer.
 *
 * n_out_buf: The size of the output buffer in bytes.
 *
 * Return value: If out_buf is set to null and n_out_buf is set to 0 the return
 * value will be the number of bytes required to store the value (if it exists)
 * and its null-terminator. For all other parameter configurations the return value
 * is 1 if an associated value was found and completely copied into the output buffer,
 * 0 otherwise.
 */
int sm_get(const StrMap *map, const char *key, char *out_buf, unsigned int n_out_buf);

/*
 * Queries the existence of a key.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * key: A pointer to a null-terminated C string. This parameter cannot
 * be null.
 *
 * Return value: 1 if the key exists, 0 otherwise.
 */
int sm_exists(const StrMap *map, const char *key);

/*
 * Associates a value with the supplied key. If the key is already
 * associated with a value, the previous value is replaced.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * key: A pointer to a null-terminated C string. This parameter
 * cannot be null. The string must have a string length > 0. The
 * string will be copied.
 *
 * value: A pointer to a null-terminated C string. This parameter
 * cannot be null. The string must have a string length > 0. The
 * string will be copied.
 *
 * Return value: 1 if the association succeeded, 0 otherwise.
 */
int sm_put(StrMap *map, const char *key, const char *value);

/*
 * Returns the number of associations between keys and values.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * Return value: The number of associations between keys and values.
 */
int sm_get_count(const StrMap *map);

/*
 * An enumerator over all associations between keys and values.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * enum_func: A pointer to a callback function that will be
 * called by this procedure once for every key associated
 * with a value. This parameter cannot be null.
 *
 * obj: A pointer to a client-specific object. This parameter will be
 * passed back to the client's callback function. This parameter can
 * be null.
 *
 * Return value: 1 if enumeration completed, 0 otherwise.
 */
int sm_enum(const StrMap *map, sm_enum_func enum_func, const void *obj);

#ifdef __cplusplus
}
#endif

#endif /* STRMAP_H */
strmap.c
001 |
#include
"strmap.h" |
002 |
typedef struct Pair Pair;
typedef struct Bucket Bucket;

/* One key/value association stored in a bucket. */
struct Pair {
    char *key;
    char *value;
};

/* One hash bucket: an array of the pairs whose keys hash to this slot. */
struct Bucket {
    unsigned int count;     /* number of entries in pairs */
    Pair *pairs;
};

/* The string map: an array of buckets indexed by hash(key) % count. */
struct StrMap {
    unsigned int count;     /* number of buckets */
    Bucket *buckets;
};
021 |
022 |
static Pair
* get_pair(Bucket *bucket, const char *key); |
023 |
static unsigned long hash( const char *str); |
024 |
025 |
StrMap
* sm_new(unsigned int capacity) |
026 |
{ |
027 |
StrMap
*map; |
028 |
|
029 |
map
= malloc ( sizeof (StrMap)); |
030 |
if (map
== NULL) { |
031 |
return NULL; |
032 |
} |
033 |
map->count
= capacity; |
034 |
map->buckets
= malloc (map->count
* sizeof (Bucket)); |
035 |
if (map->buckets
== NULL) { |
036 |
free (map); |
037 |
return NULL; |
038 |
} |
039 |
memset (map->buckets,
0, map->count * sizeof (Bucket)); |
040 |
return map; |
041 |
} |
042 |
043 |
void sm_delete(StrMap
*map) |
044 |
{ |
045 |
unsigned int i,
j, n, m; |
046 |
Bucket
*bucket; |
047 |
Pair
*pair; |
048 |
049 |
if (map
== NULL) { |
050 |
return ; |
051 |
} |
052 |
n
= map->count; |
053 |
bucket
= map->buckets; |
054 |
i
= 0; |
055 |
while (i
< n) { |
056 |
m
= bucket->count; |
057 |
pair
= bucket->pairs; |
058 |
j
= 0; |
059 |
while (j
< m) { |
060 |
free (pair->key); |
061 |
free (pair->value); |
062 |
pair++; |
063 |
j++; |
064 |
} |
065 |
free (bucket->pairs); |
066 |
bucket++; |
067 |
i++; |
068 |
} |
069 |
free (map->buckets); |
070 |
free (map); |
071 |
} |
072 |
073 |
int sm_get( const StrMap
*map, const char *key, char *out_buf,
unsigned int n_out_buf) |
074 |
{ |
075 |
unsigned int index; |
076 |
Bucket
*bucket; |
077 |
Pair
*pair; |
078 |
079 |
if (map
== NULL) { |
080 |
return 0; |
081 |
} |
082 |
if (key
== NULL) { |
083 |
return 0; |
084 |
} |
085 |
index
= hash(key) % map->count; |
086 |
bucket
= &(map->buckets[index]); |
087 |
pair
= get_pair(bucket, key); |
088 |
if (pair
== NULL) { |
089 |
return 0; |
090 |
} |
091 |
if (out_buf
== NULL && n_out_buf == 0) { |
092 |
return strlen (pair->value)
+ 1; |
093 |
} |
094 |
if (out_buf
== NULL) { |
095 |
return 0; |
096 |
} |
097 |
if ( strlen (pair->value)
>= n_out_buf) { |
098 |
return 0; |
099 |
} |
100 |
strcpy (out_buf,
pair->value); |
101 |
return 1; |
102 |
} |
103 |
104 |
int sm_exists( const StrMap
*map, const char *key) |
105 |
{ |
106 |
unsigned int index; |
107 |
Bucket
*bucket; |
108 |
Pair
*pair; |
109 |
110 |
if (map
== NULL) { |
111 |
return 0; |
112 |
} |
113 |
if (key
== NULL) { |
114 |
return 0; |
115 |
} |
116 |
index
= hash(key) % map->count; |
117 |
bucket
= &(map->buckets[index]); |
118 |
pair
= get_pair(bucket, key); |
119 |
if (pair
== NULL) { |
120 |
return 0; |
121 |
} |
122 |
return 1; |
123 |
} |
124 |
125 |
int sm_put(StrMap
*map, const char *key, const char *value) |
126 |
{ |
127 |
unsigned int key_len,
value_len, index; |
128 |
Bucket
*bucket; |
129 |
Pair
*tmp_pairs, *pair; |
130 |
char *tmp_value; |
131 |
char *new_key,
*new_value; |
132 |
133 |
if (map
== NULL) { |
134 |
return 0; |
135 |
} |
136 |
if (key
== NULL || value == NULL) { |
137 |
return 0; |
138 |
} |
139 |
key_len
= strlen (key); |
140 |
value_len
= strlen (value); |
141 |
/*
Get a pointer to the bucket the key string hashes to */ |
142 |
index
= hash(key) % map->count; |
143 |
bucket
= &(map->buckets[index]); |
144 |
/*
Check if we can handle insertion by simply replacing |
145 |
*
an existing value in a key-value pair in the bucket. |
146 |
*/ |
147 |
if ((pair
= get_pair(bucket, key)) != NULL) { |
148 |
/*
The bucket contains a pair that matches the provided key, |
149 |
*
change the value for that pair to the new value. |
150 |
*/ |
151 |
if ( strlen (pair->value)
< value_len) { |
152 |
/*
If the new value is larger than the old value, re-allocate |
153 |
*
space for the new larger value. |
154 |
*/ |
155 |
tmp_value
= realloc (pair->value,
(value_len + 1) * sizeof ( char )); |
156 |
if (tmp_value
== NULL) { |
157 |
return 0; |
158 |
} |
159 |
pair->value
= tmp_value; |
160 |
} |
161 |
/*
Copy the new value into the pair that matches the key */ |
162 |
strcpy (pair->value,
value); |
163 |
return 1; |
164 |
} |
165 |
/*
Allocate space for a new key and value */ |
166 |
new_key
= malloc ((key_len
+ 1) * sizeof ( char )); |
167 |
if (new_key
== NULL) { |
168 |
return 0; |
169 |
} |
170 |
new_value
= malloc ((value_len
+ 1) * sizeof ( char )); |
171 |
if (new_value
== NULL) { |
172 |
free (new_key); |
173 |
return 0; |
174 |
} |
175 |
/*
Create a key-value pair */ |
176 |
if (bucket->count
== 0) { |
177 |
/*
The bucket is empty, lazily allocate space for a single |
178 |
*
key-value pair. |
179 |
*/ |
180 |
bucket->pairs
= malloc ( sizeof (Pair)); |
181 |
if (bucket->pairs
== NULL) { |
182 |
free (new_key); |
183 |
free (new_value); |
184 |
return 0; |
185 |
} |
186 |
bucket->count
= 1; |
187 |
} |
188 |
else { |
189 |
/*
The bucket wasn't empty but no pair existed that matches the provided |
190 |
*
key, so create a new key-value pair. |
191 |
*/ |
192 |
tmp_pairs
= realloc (bucket->pairs,
(bucket->count + 1) * sizeof (Pair)); |
193 |
if (tmp_pairs
== NULL) { |
194 |
free (new_key); |
195 |
free (new_value); |
196 |
return 0; |
197 |
} |
198 |
bucket->pairs
= tmp_pairs; |
199 |
bucket->count++; |
200 |
} |
201 |
/*
Get the last pair in the chain for the bucket */ |
202 |
pair
= &(bucket->pairs[bucket->count - 1]); |
203 |
pair->key
= new_key; |
204 |
pair->value
= new_value; |
205 |
/*
Copy the key and its value into the key-value pair */ |
206 |
strcpy (pair->key,
key); |
207 |
strcpy (pair->value,
value); |
208 |
return 1; |
209 |
} |
210 |
211 |
int sm_get_count( const StrMap
*map) |
212 |
{ |
213 |
unsigned int i,
j, n, m; |
214 |
unsigned int count; |
215 |
Bucket
*bucket; |
216 |
Pair
*pair; |
217 |
218 |
if (map
== NULL) { |
219 |
return 0; |
220 |
} |
221 |
bucket
= map->buckets; |
222 |
n
= map->count; |
223 |
i
= 0; |
224 |
count
= 0; |
225 |
while (i
< n) { |
226 |
pair
= bucket->pairs; |
227 |
m
= bucket->count; |
228 |
j
= 0; |
229 |
while (j
< m) { |
230 |
count++; |
231 |
pair++; |
232 |
j++; |
233 |
} |
234 |
bucket++; |
235 |
i++; |
236 |
} |
237 |
return count; |
238 |
} |
239 |
240 |
int sm_enum( const StrMap
*map, sm_enum_func enum_func, const void *obj) |
241 |
{ |
242 |
unsigned int i,
j, n, m; |
243 |
Bucket
*bucket; |
244 |
Pair
*pair; |
245 |
246 |
if (map
== NULL) { |
247 |
return 0; |
248 |
} |
249 |
if (enum_func
== NULL) { |
250 |
return 0; |
251 |
} |
252 |
bucket
= map->buckets; |
253 |
n
= map->count; |
254 |
i
= 0; |
255 |
while (i
< n) { |
256 |
pair
= bucket->pairs; |
257 |
m
= bucket->count; |
258 |
j
= 0; |
259 |
while (j
< m) { |
260 |
enum_func(pair->key,
pair->value, obj); |
261 |
pair++; |
262 |
j++; |
263 |
} |
264 |
bucket++; |
265 |
i++; |
266 |
} |
267 |
return 1; |
268 |
} |
269 |
270 |
/* |
271 |
*
Returns a pair from the bucket that matches the provided key, |
272 |
*
or null if no such pair exist. |
273 |
*/ |
274 |
static Pair
* get_pair(Bucket *bucket, const char *key) |
275 |
{ |
276 |
unsigned int i,
n; |
277 |
Pair
*pair; |
278 |
279 |
n
= bucket->count; |
280 |
if (n
== 0) { |
281 |
return NULL; |
282 |
} |
283 |
pair
= bucket->pairs; |
284 |
i
= 0; |
285 |
while (i
< n) { |
286 |
if (pair->key
!= NULL && pair->value != NULL) { |
287 |
if ( strcmp (pair->key,
key) == 0) { |
288 |
return pair; |
289 |
} |
290 |
} |
291 |
pair++; |
292 |
i++; |
293 |
} |
294 |
return NULL; |
295 |
} |
296 |
/*
 * Returns a hash code for the provided null-terminated string.
 *
 * Starts from 5381 and, for each character c, computes
 * h = h * 33 + c (the multiplication written as (h << 5) + h).
 */
static unsigned long hash(const char *str)
{
    unsigned long h = 5381;
    int c;

    /* Explicit comparison makes the assignment-in-condition intentional. */
    while ((c = *str++) != 0) {
        h = ((h << 5) + h) + c;
    }
    return h;
}
前一節與這節這兩種實現方法看似比較類似,但也有差異:
基於哈希桶的情況下,由於Hash桶容量的限制,所以,有可能發生Hash表填不滿的情況,也就是,雖然Hash表裏面還有空位,但是新建的表項由於衝突過多,而不能裝入Hash表中。不過,這樣的實現也有其好處,就是查表的最大開銷是可以確定的,因爲最多處理的衝突數是確定的,所以算法的時間複雜度爲O(1)+O(m),其中m爲Hash桶容量。(注意:上面的strmap實現中,每個桶的pairs數組是用realloc按需增長的,並沒有固定的容量上限;這裏討論的是桶容量固定的哈希桶方案。)
而另一種通過鏈表的實現,由於Hash桶的容量是無限的,因此,只要沒有超出Hash表的最大容量,就能夠容納新建的表項。但是,一旦發生了Hash衝突嚴重的情況,就會造成Hash桶的鏈表過長,大大降低查找效率。在最壞的情況下,時間複雜度退化爲O(n),其中n爲Hash表的總容量。當然,這種情況的概率小之又小,幾乎是可以忽略的。