hash table相關--引入哈希桶的概念來實現一個哈希表

前面講述了如何用鏈地址法實現一個哈希表,那麼今天來分析一下另一種解決哈希衝突的做法,即爲每個Hash值建立一個Hash桶(Bucket),桶的容量是固定的,也就是只能處理固定次數的衝突,如1048576個Hash桶,每個桶中有4個表項(Entry),總計4M個表項。其實這兩種的實現思路雷同,就是對Hash表中每個Hash值建立一個衝突表,即將衝突的幾個記錄以表的形式存儲在其中。

大致的思路是這樣的:

首先哈希桶的個數是固定的,由用戶在構建的時候輸入,一旦構建,個數就已經固定;查找的時候首先將key值通過哈希函數獲取哈希值,根據哈希值獲取到對應的哈希桶,然後遍歷哈希桶內的pairs數組,找到與key匹配的鍵值對。

/* One hash bucket: holds the key-value entries whose keys hash to this slot. */
struct Bucket {
    /* Number of entries currently stored in the pairs array. */
    unsigned int count;
    /* Array of `count` key-value entries. */
    Pair *pairs;
};


/* The string map itself: an array of hash buckets. */
struct StrMap {
    /* Number of buckets in the buckets array (not the number of stored entries). */
    unsigned int count;
    /* Array of `count` buckets, indexed by hash(key) % count. */
    Bucket *buckets;
};



strmap.h


#ifndef _STRMAP_H_
#define _STRMAP_H_


#ifdef __cplusplus
extern "C"
{
#endif


#include <stdlib.h>
#include <string.h>


typedef struct StrMap StrMap;


/*
 * This callback function is called once per key-value when iterating over
 * all keys associated to values.
 *
 * Parameters:
 *
 * key: A pointer to a null-terminated C string. The string must not
 * be modified by the client.
 *
 * value: A pointer to a null-terminated C string. The string must
 * not be modified by the client.
 *
 * obj: A pointer to a client-specific object. This parameter may be
 * null.
 *
 * Return value: None.
 */
typedef void(*sm_enum_func)(const char *key, const char *value, const void *obj);


/*
 * Creates a string map.
 *
 * Parameters:
 *
 * capacity: The number of top-level slots this string map
 * should allocate. This parameter must be > 0.
 *
 * Return value: A pointer to a string map object, 
 * or null if a new string map could not be allocated.
 */
StrMap * sm_new(unsigned int capacity);


/*
 * Releases all memory held by a string map object.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 * If the supplied string map has been previously released, the
 * behaviour of this function is undefined.
 *
 * Return value: None.
 */
void sm_delete(StrMap *map);


/*
 * Returns the value associated with the supplied key.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * key: A pointer to a null-terminated C string. This parameter cannot
 * be null.
 *
 * out_buf: A pointer to an output buffer which will contain the value,
 * if it exists and fits into the buffer.
 *
 * n_out_buf: The size of the output buffer in bytes.
 *
 * Return value: If out_buf is set to null and n_out_buf is set to 0 the return
 * value will be the number of bytes required to store the value (if it exists)
 * and its null-terminator. For all other parameter configurations the return value
 * is 1 if an associated value was found and completely copied into the output buffer,
 * 0 otherwise.
 */
int sm_get(const StrMap *map, const char *key, char *out_buf, unsigned int n_out_buf);


/*
 * Queries the existence of a key.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * key: A pointer to a null-terminated C string. This parameter cannot
 * be null.
 *
 * Return value: 1 if the key exists, 0 otherwise.
 */
int sm_exists(const StrMap *map, const char *key);


/*
 * Associates a value with the supplied key. If the key is already
 * associated with a value, the previous value is replaced.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * key: A pointer to a null-terminated C string. This parameter
 * cannot be null. The string must have a string length > 0. The
 * string will be copied.
 *
 * value: A pointer to a null-terminated C string. This parameter
 * cannot be null. The string must have a string length > 0. The
 * string will be copied.
 *
 * Return value: 1 if the association succeeded, 0 otherwise.
 */
int sm_put(StrMap *map, const char *key, const char *value);


/*
 * Returns the number of associations between keys and values.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * Return value: The number of associations between keys and values.
 */
int sm_get_count(const StrMap *map);


/*
 * An enumerator over all associations between keys and values.
 *
 * Parameters:
 *
 * map: A pointer to a string map. This parameter cannot be null.
 *
 * enum_func: A pointer to a callback function that will be
 * called by this procedure once for every key associated
 * with a value. This parameter cannot be null.
 *
 * obj: A pointer to a client-specific object. This parameter will be
 * passed back to the client's callback function. This parameter can
 * be null.
 *
 * Return value: 1 if enumeration completed, 0 otherwise.
 */
int sm_enum(const StrMap *map, sm_enum_func enum_func, const void *obj);


#ifdef __cplusplus
}
#endif


#endif


/* A single key-value entry; both members are null-terminated C strings. */
struct Pair {
    /* The key string. */
    char *key;
    /* The value string associated with the key. */
    char *value;
};


/* One hash bucket: holds the key-value entries whose keys hash to this slot. */
struct Bucket {
    /* Number of entries currently stored in the pairs array. */
    unsigned int count;
    /* Array of `count` key-value entries. */
    Pair *pairs;
};


/* The string map itself: an array of hash buckets. */
struct StrMap {
    /* Number of buckets in the buckets array (not the number of stored entries). */
    unsigned int count;
    /* Array of `count` buckets, indexed by hash(key) % count. */
    Bucket *buckets;
};


strmap.c


#include "strmap.h"


typedef struct Pair Pair;


typedef struct Bucket Bucket;


/*
 * A single key-value entry. Both strings are heap copies made by sm_put()
 * and are released by sm_delete().
 */
struct Pair {
/* Null-terminated key string. */
char *key;
/* Null-terminated value string associated with the key. */
char *value;
};


/*
 * One hash bucket: the entries whose keys hash to the same slot.
 * The pairs array is allocated lazily and grown one entry at a time
 * by sm_put().
 */
struct Bucket {
/* Number of entries currently stored in the pairs array. */
unsigned int count;
/* Array of `count` key-value entries. */
Pair *pairs;
};


/* The string map itself: a fixed-size array of hash buckets. */
struct StrMap {
/* Number of buckets (the capacity passed to sm_new()), not the number of stored entries. */
unsigned int count;
/* Array of `count` buckets, indexed by hash(key) % count. */
Bucket *buckets;
};


static Pair * get_pair(Bucket *bucket, const char *key);
static unsigned long hash(const char *str);


/*
 * Creates a string map with a fixed number of buckets.
 *
 * capacity: The number of buckets to allocate. Must be > 0, because
 * every lookup reduces the key's hash modulo this value.
 *
 * Return value: A pointer to the new map, or NULL if capacity is 0 or
 * memory could not be allocated. Release the map with sm_delete().
 */
StrMap * sm_new(unsigned int capacity)
{
    StrMap *map;

    /* A map with zero buckets would turn every `hash(key) % map->count`
     * into a division by zero, so refuse to build one.
     */
    if (capacity == 0) {
        return NULL;
    }
    map = malloc(sizeof *map);
    if (map == NULL) {
        return NULL;
    }
    /* calloc zero-fills the bucket array (so every bucket starts with
     * count == 0) and fails cleanly if capacity * sizeof(Bucket) would
     * overflow, instead of allocating a too-small block.
     */
    map->buckets = calloc(capacity, sizeof *map->buckets);
    if (map->buckets == NULL) {
        free(map);
        return NULL;
    }
    map->count = capacity;
    return map;
}


/*
 * Releases a string map: every stored key and value, each bucket's
 * pair array, the bucket array, and finally the map object itself.
 * A NULL map is ignored.
 */
void sm_delete(StrMap *map)
{
    unsigned int b, p;

    if (map == NULL) {
        return;
    }
    for (b = 0; b < map->count; b++) {
        Bucket *cur = &map->buckets[b];

        for (p = 0; p < cur->count; p++) {
            free(cur->pairs[p].key);
            free(cur->pairs[p].value);
        }
        /* Empty buckets never allocated an array; free(NULL) is a no-op. */
        free(cur->pairs);
    }
    free(map->buckets);
    free(map);
}


/*
 * Looks up the value stored for key.
 *
 * With out_buf == NULL and n_out_buf == 0 this is a size query: the
 * return value is the number of bytes needed for the value including
 * its null terminator. Otherwise the value is copied into out_buf and
 * 1 is returned, provided it fits completely (terminator included).
 *
 * Returns 0 if map or key is NULL, the key is not present, out_buf is
 * NULL with a non-zero size, or the buffer is too small.
 */
int sm_get(const StrMap *map, const char *key, char *out_buf, unsigned int n_out_buf)
{
    const Pair *found;

    if (map == NULL || key == NULL) {
        return 0;
    }
    found = get_pair(&map->buckets[hash(key) % map->count], key);
    if (found == NULL) {
        return 0;
    }
    if (out_buf == NULL) {
        if (n_out_buf == 0) {
            /* Size query: value length plus the terminator. */
            return strlen(found->value) + 1;
        }
        return 0;
    }
    /* The value needs strlen + 1 bytes, so it must be strictly shorter than the buffer. */
    if (strlen(found->value) >= n_out_buf) {
        return 0;
    }
    strcpy(out_buf, found->value);
    return 1;
}


/*
 * Reports whether key is present in the map.
 *
 * Returns 1 if the key exists, 0 if it does not or if map or key is NULL.
 */
int sm_exists(const StrMap *map, const char *key)
{
    if (map == NULL || key == NULL) {
        return 0;
    }
    /* Only the bucket the key hashes to can contain it. */
    return get_pair(&map->buckets[hash(key) % map->count], key) != NULL;
}


/*
 * Associates value with key, replacing any value already stored for
 * that key. Both strings are copied into storage owned by the map.
 *
 * Returns 1 on success, 0 if any argument is NULL or an allocation
 * fails. On failure the map is left as it was before the call.
 */
int sm_put(StrMap *map, const char *key, const char *value)
{
    unsigned int klen, vlen;
    Bucket *slot;
    Pair *entry, *grown;
    char *key_copy, *value_copy, *resized;

    if (map == NULL || key == NULL || value == NULL) {
        return 0;
    }
    klen = strlen(key);
    vlen = strlen(value);
    /* Locate the bucket this key hashes to. */
    slot = &map->buckets[hash(key) % map->count];

    /* Existing key: overwrite its value in place. */
    entry = get_pair(slot, key);
    if (entry != NULL) {
        if (vlen > strlen(entry->value)) {
            /* The new value is longer than the stored one, so grow the
             * value buffer first. The old buffer stays valid if this fails.
             */
            resized = realloc(entry->value, (vlen + 1) * sizeof(char));
            if (resized == NULL) {
                return 0;
            }
            entry->value = resized;
        }
        strcpy(entry->value, value);
        return 1;
    }

    /* New key: reserve storage for both string copies before touching
     * the bucket, so a failed allocation leaves the bucket unchanged.
     */
    key_copy = malloc((klen + 1) * sizeof(char));
    if (key_copy == NULL) {
        return 0;
    }
    value_copy = malloc((vlen + 1) * sizeof(char));
    if (value_copy == NULL) {
        free(key_copy);
        return 0;
    }

    if (slot->count == 0) {
        /* First entry for this bucket: allocate room for one pair. */
        slot->pairs = malloc(sizeof(Pair));
        if (slot->pairs == NULL) {
            goto no_room;
        }
        slot->count = 1;
    }
    else {
        /* Bucket already has entries: extend the array by one pair. */
        grown = realloc(slot->pairs, (slot->count + 1) * sizeof(Pair));
        if (grown == NULL) {
            goto no_room;
        }
        slot->pairs = grown;
        slot->count++;
    }

    /* Fill in the freshly added last slot of the bucket. */
    entry = &slot->pairs[slot->count - 1];
    strcpy(key_copy, key);
    strcpy(value_copy, value);
    entry->key = key_copy;
    entry->value = value_copy;
    return 1;

no_room:
    free(key_copy);
    free(value_copy);
    return 0;
}


/*
 * Returns the number of key-value associations stored in the map,
 * or 0 if map is NULL.
 *
 * Each bucket already records how many pairs it holds, so the total is
 * the sum of the per-bucket counts; there is no need to step through
 * the individual pairs.
 */
int sm_get_count(const StrMap *map)
{
    unsigned int i, total;

    if (map == NULL) {
        return 0;
    }
    total = 0;
    for (i = 0; i < map->count; i++) {
        total += map->buckets[i].count;
    }
    return total;
}


/*
 * Calls enum_func once for every key-value pair in the map, bucket by
 * bucket, passing obj through unchanged.
 *
 * Returns 1 after visiting every pair, 0 if map or enum_func is NULL.
 */
int sm_enum(const StrMap *map, sm_enum_func enum_func, const void *obj)
{
    unsigned int b, p, n_buckets;

    if (map == NULL || enum_func == NULL) {
        return 0;
    }
    n_buckets = map->count;
    for (b = 0; b < n_buckets; b++) {
        /* Snapshot the bucket's array and length before invoking callbacks. */
        const Pair *entries = map->buckets[b].pairs;
        const unsigned int n_entries = map->buckets[b].count;

        for (p = 0; p < n_entries; p++) {
            enum_func(entries[p].key, entries[p].value, obj);
        }
    }
    return 1;
}


/*
 * Searches one bucket for the entry whose key equals the given key.
 * Entries with a NULL key or NULL value are skipped.
 *
 * Returns a pointer to the matching pair inside the bucket's array,
 * or NULL if the bucket is empty or holds no such key.
 */
static Pair * get_pair(Bucket *bucket, const char *key)
{
    unsigned int i;

    for (i = 0; i < bucket->count; i++) {
        Pair *candidate = &bucket->pairs[i];

        if (candidate->key == NULL || candidate->value == NULL) {
            continue;
        }
        if (strcmp(candidate->key, key) == 0) {
            return candidate;
        }
    }
    return NULL;
}


/*
 * Computes a hash code for a null-terminated string: start from 5381
 * and, for each character, multiply the running value by 33 and add
 * the character. The empty string hashes to 5381.
 */
static unsigned long hash(const char *str)
{
    unsigned long h = 5381;
    const char *p;

    for (p = str; *p != '\0'; p++) {
        int c = *p;

        h = h * 33 + c;
    }
    return h;
}


前一節與這節這兩種實現方法看似比較類似,但也有差異:

基於哈希桶的情況下,由於Hash桶容量的限制,所以,有可能發生Hash表填不滿的情況,也就是,雖然Hash表裏面還有空位,但是新建的表項由於衝突過多,而不能裝入Hash表中。不過,這樣的實現也有其好處,就是查表的最大開銷是可以確定的,因爲最多處理的衝突數是確定的,所以算法的時間複雜度爲O(1)+O(m),其中m爲Hash桶容量。

而另一種通過鏈表的實現,由於Hash桶的容量是無限的,因此,只要沒有超出Hash表的最大容量,就能夠容納新建的表項。但是,一旦發生了Hash衝突嚴重的情況,就會造成Hash桶的鏈表過長,大大降低查找效率。在最壞的情況下,時間複雜度退化爲O(n),其中n爲Hash表的總容量。當然,這種情況的概率小之又小,幾乎是可以忽略的。

 轉自
http://www.nowamagic.net/academy/detail/3008108




/* A single key-value entry; both members are null-terminated C strings. */
struct Pair {
    /* The key string. */
    char *key;
    /* The value string associated with the key. */
    char *value;
};


/* One hash bucket: holds the key-value entries whose keys hash to this slot. */
struct Bucket {
    /* Number of entries currently stored in the pairs array. */
    unsigned int count;
    /* Array of `count` key-value entries. */
    Pair *pairs;
};


/* The string map itself: an array of hash buckets. */
struct StrMap {
    /* Number of buckets in the buckets array (not the number of stored entries). */
    unsigned int count;
    /* Array of `count` buckets, indexed by hash(key) % count. */
    Bucket *buckets;
};
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章