trie樹-《算法導論》學習筆記十四

引用一下百度百科的話吧:
Trie樹,又稱單詞查找樹,是一種樹形結構,是一種哈希樹的變種。典型應用是用於統計,排序和保存大量的字符串(但不僅限於字符串),所以經常被搜索引擎系統用於文本詞頻統計。它的優點是:利用字符串的公共前綴來減少查詢時間,最大限度地減少無謂的字符串比較,查詢效率比哈希樹高。

這裏構建了一棵字典樹,每個結點有52個孩子指針,對應26個小寫字母和26個大寫字母,根節點不存儲數據,一個單詞從第一個字母開始經由根結點走對應分支進行插入和統計。

trie樹結點衛星數據包含了字母、出現次數、是否構成一個單詞,孩子指針就是一個52大小的trie樹結點指針數組。

實現了幾個操作:

1. 插入單詞

遍歷每個字母,從根結點出發,如果結點對應字母的孩子結點爲空,就創建結點(出現次數初始爲0);走到單詞的最後一個字母時,把結束處結點的出現次數+1,並把「是否構成一個單詞」字段標識爲構成

2. 遍歷樹,並打印所有單詞和每個單詞出現次數

3. 統計樹,按給定的數字統計出現次數前幾的單詞

樹統計,與遍歷類似,用遞歸(深度優先)實現,並傳入一個大於單詞最大長度的數組來存儲每個分支的單詞,如果遇到結點能構成一個單詞,就取出該單詞的出現次數,並以插入排序的方式插入創建的統計鏈表(類似打撲克時整理手牌的插入排序);
統計鏈表有更新操作,根據輸入的統計前幾的數字來維護這個鏈表該去掉哪些結點,該更新哪些結點的順序等


獲取單詞來源爲編寫的一個簡單單詞隨機生成代碼,寫入一個文件中,可指定單詞最大長度,全大寫/全小寫/大小寫均有,單詞個數,單詞範圍(只支持a-*或A-*,例如5,就是生成a-e/A-E的單詞)
貼代碼:

隨機生成單詞

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

// Modulus used by random_word() when choosing a word length; main() sets it
// from argv[1].
int word_len = 0;
// ASCII code of 'A'; added to a random offset to form an upper-case letter.
int upper_low = 65;
// ASCII code of 'a'; added to a random offset to form a lower-case letter.
int lowwer_low = 97;

// Number of distinct letters to draw from, counted from 'a' / 'A'.
// For example, letter_size = 5 generates letters in a-e or A-E.
// main() sets it from argv[3]; random_letter() uses it as a modulus.
int letter_size = 0;

/*
 * Return rand() reduced modulo the global letter_size, i.e. an offset that
 * the caller adds to a base letter code.
 */
int random_letter()
{
    const int offset = rand() % letter_size;

    return offset;
}
/*
 * Fill `word` with a random sequence of letters and return its length.
 *
 * The length is rand() % word_len + 3, so it is never shorter than 3.
 * Letters are written from the last position down to index 0.  No
 * terminator is written; the caller owns the buffer and its size.
 *
 * opt: 0 -> lower-case letters only, 1 -> upper-case letters only,
 *      anything else -> case chosen at random for every letter.
 */
int random_word(
    char *word,
    int opt )
{
    const int length = rand() % word_len + 3;

    for ( int pos = length - 1; pos >= 0; pos-- ) {
        int base;

        if ( opt == 0 ) {
            base = lowwer_low;
        } else if ( opt == 1 ) {
            base = upper_low;
        } else {
            /* Mixed mode: pick the case first, then the letter. */
            base = ( rand() % 2 == 0 ) ? lowwer_low : upper_low;
        }

        word[pos] = (char)( random_letter() + base );
    }

    return length;
}
/*
 * Write `word_num` random words to `fd`, one word per line.
 *
 * fd:       descriptor open for writing.
 * word_num: number of words to emit; zero or negative emits nothing.
 * opt:      case selector forwarded unchanged to random_word().
 *
 * random_word() returns at most word_len + 2 letters and the newline needs
 * one more byte, so the 20-byte buffer is only large enough while the global
 * word_len is <= 17.
 *
 * Short writes are retried until the whole line is out.  On a write error
 * the reason is reported with perror() and generation stops.
 */
void gen_word(
    int fd,
    int word_num,
    int opt )
{
    char word[20] = {0};

    /* "> 0" keeps a negative count from looping until the int wraps. */
    while ( word_num-- > 0 ) {
        memset( word, 0, sizeof word );
        int true_word_len = random_word( word, opt );
        word[true_word_len] = '\n';

        size_t total = (size_t)true_word_len + 1;
        size_t done = 0;
        while ( done < total ) {
            ssize_t n = write( fd, word + done, total - done );
            if ( n < 0 && errno == EINTR ) {
                continue;   /* interrupted before writing anything: retry */
            }
            if ( n <= 0 ) {
                perror("write");
                return;
            }
            done += (size_t)n;
        }
    }
}
/*
 * Word generator entry point.
 *
 * argv[1]: word length parameter (stored in the global word_len)
 * argv[2]: number of words to generate
 * argv[3]: letter range (stored in the global letter_size; 5 means a-e / A-E)
 * argv[4]: case option (0: lower case, 1: upper case, other: both)
 *
 * The words are written to "word.txt" in the current directory; the file is
 * created if missing and truncated otherwise.
 *
 * Returns 0 on success and EXIT_FAILURE on invalid arguments or I/O errors.
 */
int main(
    int argc,
    char **argv )
{
    srand((unsigned int)time(NULL));
    if ( argc != 5 ) {
        printf("please input "
                "word's length & "
                "words' number & "
                "word's range & "
                "gen_case(0:lowwer case,1:upper case,other:both\n");
        exit( 0 );
    }

    word_len = atoi( argv[1] );
    int word_num = atoi( argv[2] );
    letter_size = atoi( argv[3] );
    int opt = atoi( argv[4] );

    /*
     * word_len is used as a modulus (0 would divide by zero) and words may be
     * up to word_len + 2 letters long; together with the newline they must
     * fit the 20-byte line buffer used by gen_word().
     */
    if ( word_len < 1 || word_len > 17 ) {
        printf("word's length must be between 1 and 17\n");
        return EXIT_FAILURE;
    }
    if ( word_num < 0 ) {
        printf("words' number must not be negative\n");
        return EXIT_FAILURE;
    }
    /* letter_size is a modulus too, and only 26 letters exist per case. */
    if ( letter_size < 1 || letter_size > 26 ) {
        printf("word's range must be between 1 and 26\n");
        return EXIT_FAILURE;
    }

    /* O_CREAT is required for the mode argument to have any effect. */
    int fd = open("word.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);
    if ( fd < 0 ) {
        perror("open word.txt");
        return EXIT_FAILURE;
    }

    gen_word( fd, word_num, opt );

    if ( close( fd ) < 0 ) {
        perror("close word.txt");
        return EXIT_FAILURE;
    }

    return 0;
}

trie樹

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

/* Child slots per trie node: 26 upper-case plus 26 lower-case letters. */
#define MAX_CHILD_NUM 52
/* ASCII code of 'A' (first upper-case letter). */
#define UPPER_LOW 65
/* ASCII code of 'Z' (last upper-case letter). */
#define UPPER_UP 90
/* ASCII code of 'a' (first lower-case letter). */
#define LOWER_LOW 97
/* ASCII code of 'z' (last lower-case letter). */
#define LOWER_UP 122

/*
 * Trace helper: prints "[<function>/<line>]:" followed by the printf-style
 * formatted message and a newline, all to stdout.
 * It uses the GNU named-variadic form (`arg...` together with `##arg`), so
 * it is not strictly standard C.
 */
#define PRINT(format, arg...) \
do { \
    printf("[%s/%d]:", __func__, __LINE__); \
    printf(format, ##arg); \
    printf("\n"); \
}while(0) 

/* One node of the trie; every node stands for a single letter. */
typedef struct trieTreeNode {
    /* Letter this node represents (main() creates the root with -1). */
    char letter;
    /* How many inserted words ended at this node. */
    int count;
    /* Non-zero when at least one inserted word ends at this node. */
    int is_word;
    /* Children, indexed by trans_letter_2_index(); NULL when absent. */
    struct trieTreeNode *next[MAX_CHILD_NUM];
} trieTreeNode;

/* Handle for a whole trie: just a pointer to its root node. */
typedef struct trieTree {
    /* Root node; its own letter is not part of any word. */
    trieTreeNode *root;
} trieTree;

/*
 * Node of the singly linked "top words" list built by list_insert().
 * The list is kept in descending order of `count`; the first node passed
 * around as `head` is a sentinel whose payload is not printed.
 */
typedef struct count_data {
    /*
     * Rank counter that counts downwards: the highest count gets top_num,
     * each lower distinct count gets one less, and nodes whose order would
     * drop below 1 are removed from the list.
     */
    int order;
    /* Number of occurrences of the word. */
    int count;
    /* NUL-terminated copy of the word (at most 19 letters). */
    char string[20];
    /* Next node, or NULL at the end of the list. */
    struct count_data *next;
} count_data;

/*
 * Map a letter to its child slot in a trie node.
 *
 * 'A'..'Z' map to 0..25 and 'a'..'z' map to 26..51.  Any other character is
 * reported through PRINT and terminates the process with exit( 0 ).
 */
int trans_letter_2_index(
    char letter )
{
    if ( UPPER_LOW <= letter && letter <= UPPER_UP )
        return letter - UPPER_LOW;

    if ( LOWER_LOW <= letter && letter <= LOWER_UP )
        return letter - LOWER_LOW + 26;

    PRINT("error letter input:%c", letter);
    exit( 0 );

    return -1;  /* not reached; keeps every path returning a value */
}
/*
 * Allocate a trie node for `letter`.
 *
 * The node comes from calloc(), so count and is_word start at 0 and every
 * child slot starts out empty.
 *
 * Returns the new node, or NULL when the allocation fails.  The caller owns
 * the returned node.
 */
trieTreeNode *create_node(
    char letter )
{
    trieTreeNode *node = calloc( 1, sizeof *node );
    if ( node == NULL )
        return NULL;

    node->letter = letter;

    /* The original fell off the end without returning the node. */
    return node;
}
/*
 * Insert `word` into the trie rooted at `root`.
 *
 * Walks one level per letter, creating missing nodes on the way.  The node
 * reached by the last letter gets its count incremented and is flagged as
 * the end of a word.  An empty word changes nothing.
 *
 * Letters are mapped with trans_letter_2_index(), which terminates the
 * process on anything outside A-Z / a-z.
 *
 * If a node cannot be allocated the word is dropped; nodes already created
 * for its prefix stay in the trie but are not flagged as words.
 */
void insert(
    trieTreeNode *root,
    char *word )
{
    if ( root == NULL ) {
        PRINT("root node is null.");
        return;
    }
    if ( word == NULL )
        return;

    trieTreeNode *cur = root;

    for ( int i = 0; word[i] != '\0'; i++ ) {
        int next_index = trans_letter_2_index( word[i] );

        if ( cur->next[next_index] == NULL ) {
            cur->next[next_index] = create_node( word[i] );
            if ( cur->next[next_index] == NULL ) {
                PRINT("out of memory, word dropped:%s", word);
                return;
            }
        }
        cur = cur->next[next_index];
    }

    /* cur == root means the word was empty: nothing to mark. */
    if ( cur != root ) {
        cur->count += 1;
        cur->is_word = 1;
    }
}
// Free every node of the list that starts at `node` (NULL is accepted).
void delete_list_all_node(
    count_data *node )
{
    for ( count_data *following; node != NULL; node = following ) {
        // Read the link before the node holding it is released.
        following = node->next;
        free( node );
    }
}
/*
 * Debug dump of the statistics list.  `node` is the sentinel head: it is
 * skipped and every node after it is printed as order, count and word.
 * The output is framed by one blank line before and after.
 */
void print_list_all_node(
    count_data *node )
{
    printf("\n");

    for ( count_data *entry = node->next; entry != NULL; entry = entry->next ) {
        printf("[%d],count:%d\tword:%s\n", 
            entry->order, entry->count, entry->string);
    }

    printf("\n");
}
/*
 * Re-rank the nodes that follow a freshly inserted node.
 *
 * `insert_node` must already be linked into the list and carry its final
 * order.  Nothing happens when it is the last node.  Otherwise:
 *   - order == 1: the node holds the last rank, so everything after it is
 *     freed;
 *   - order  < 1: treated as a fatal inconsistency (PRINT, then exit);
 *   - otherwise each following node gets the order of its predecessor when
 *     the counts are equal, or one less when its count is smaller.  The
 *     first node whose order drops below 1 is freed together with the rest
 *     of the list.  A following node with a larger count than its
 *     predecessor is a fatal inconsistency.
 */
void update_insert_node(
    count_data *insert_node )
{
    if ( insert_node->next == NULL )
        return;

    if ( insert_node->order < 1 ) {
        PRINT("ERROR!!!!!");
        exit( 0 );
    }

    if ( insert_node->order == 1 ) {
        delete_list_all_node( insert_node->next );
        insert_node->next = NULL;
        return;
    }

    count_data *prev = insert_node;
    for ( count_data *cur = prev->next; cur != NULL; cur = cur->next ) {
        if ( cur->count > prev->count ) {
            PRINT("ERROR!!!cur->count:%d, pre->count:%d", 
                cur->count, prev->count);
            exit( 0 );
        }

        cur->order = ( cur->count < prev->count ) ? prev->order - 1
                                                  : prev->order;

        if ( cur->order < 1 ) {
            // This node and everything after it fell out of the ranking.
            delete_list_all_node( cur );
            prev->next = NULL;
            break;
        }
        prev = cur;
    }
}
/*
 * Insert one word into the descending "top words" list.
 *
 * tmp_word:  buffer holding the word's letters; tmp_word[tail] is
 *            overwritten with '\0', so the buffer needs tail + 1 bytes.
 * cur_count: number of occurrences of the word.
 * tail:      number of letters in tmp_word.
 * head:      sentinel node of the list (its payload is unused).
 * top_num:   how many distinct counts to keep; values below 1 keep nothing.
 *
 * The list is ordered by count, highest first, and a word with an equal
 * count goes after the existing ones.  The first node carries order
 * top_num; a node with the same count as its predecessor shares its order,
 * a node with a smaller count gets one less.  A word whose order would be
 * below 1 is not stored.  After linking, update_insert_node() re-ranks
 * (and possibly drops) the nodes behind the new one.
 *
 * Words longer than the string field are truncated to fit.
 */
void list_insert( 
    char *tmp_word,
    int cur_count,
    int tail,
    count_data *head,
    int top_num )
{
    if ( tmp_word == NULL || head == NULL || tail < 0 || top_num < 1 )
        return;

    tmp_word[tail] = '\0';

    /* Last node whose count is >= cur_count, or the sentinel itself. */
    count_data *prev = head;
    while ( prev->next != NULL && prev->next->count >= cur_count )
        prev = prev->next;

    int order;
    if ( prev == head ) {
        order = top_num;            /* new highest count */
    } else if ( prev->count == cur_count ) {
        order = prev->order;        /* ties share a rank */
    } else {
        /*
         * Strictly smaller than the predecessor: one rank lower.  The
         * original reused the predecessor's order here when inserting in
         * the middle of the list, which corrupted the ranking.
         */
        order = prev->order - 1;
    }

    if ( order < 1 )
        return;                     /* outside the requested top ranks */

    count_data *new_data = malloc( sizeof *new_data );
    if ( new_data == NULL ) {
        PRINT("out of memory, word dropped:%s", tmp_word);
        return;
    }

    /* Never copy more than the string field can hold. */
    size_t len = (size_t)tail;
    if ( len > sizeof new_data->string - 1 )
        len = sizeof new_data->string - 1;
    memcpy( new_data->string, tmp_word, len );
    new_data->string[len] = '\0';

    new_data->count = cur_count;
    new_data->order = order;
    new_data->next = prev->next;
    prev->next = new_data;

    update_insert_node( new_data );
}
/*
 * Recursive helper for find_top_count(): depth-first walk of the subtree
 * rooted at `root`.
 *
 * tmp_word holds the letters on the path from the top of the trie and
 * `tail` is the number of letters stored so far.  This node's letter is
 * appended, and if a word ends here it is offered to list_insert() together
 * with its count.  All children are then visited in slot order.
 */
void find_top_count1(
    trieTreeNode *root,
    char *tmp_word,
    int tail,
    count_data *head, 
    int top_num )
{
    if ( root == NULL )
        return;

    tmp_word[tail++] = root->letter;

    if ( root->is_word != 0 )
        list_insert( tmp_word, root->count, tail, head, top_num );

    for ( int child = 0; child < MAX_CHILD_NUM; ++child )
        find_top_count1( root->next[child], tmp_word, tail, head, top_num );
}
/*
 * Print the words with the `top_num` highest occurrence counts.
 *
 * Walks every branch below `root`, collecting words into a temporary linked
 * list through find_top_count1() / list_insert(), prints the list and then
 * frees it.  Nothing is done for a NULL root or for top_num below 1.
 */
void find_top_count(
    trieTreeNode *root,
    int top_num )
{
    if ( !root )
        return;

    if ( top_num < 1 ) {
        PRINT("top number must be at least 1, got %d", top_num);
        return;
    }

    count_data *head = ( count_data * )malloc( sizeof(count_data) );
    if ( head == NULL ) {
        PRINT("out of memory.");
        return;
    }
    /*
     * The sentinel must start with an empty list: list_insert() tests
     * head->next, which malloc() leaves uninitialised.
     */
    head->order = 0;
    head->count = 0;
    head->string[0] = '\0';
    head->next = NULL;

    for ( int i = 0; i < MAX_CHILD_NUM; i++ ) {
        char tmp_word[20] = {0};
        find_top_count1( root->next[i], tmp_word, 0, head, top_num );
    }

    printf("出現次數最大前%d次的單詞:\n", top_num);
    count_data *p = head->next;
    count_data *free_p = NULL;
    while ( p != NULL ) {
        free_p = p;
        printf("前%d,count:%d\t%s\n", p->order, p->count, p->string);
        p = p->next;
        free( free_p );
    }
    free( head );
}

/*
 * Recursive helper for tree_walk(): depth-first walk of the subtree rooted
 * at `root`.
 *
 * tmp_word holds the letters on the path so far and `tail` is how many of
 * them are valid.  This node's letter is appended; when a word ends here a
 * line "count:<n>\t<word>" is printed.  Children are visited in slot order.
 */
void tree_walk1(
    trieTreeNode *root,
    char *tmp_word,
    int tail )
{
    if ( root == NULL )
        return;

    tmp_word[tail++] = root->letter;

    if ( root->is_word ) {
        printf("count:%d\t", root->count);
        for ( int pos = 0; pos < tail; ++pos )
            putchar( tmp_word[pos] );
        printf("\n");
    }

    for ( int child = 0; child < MAX_CHILD_NUM; ++child )
        tree_walk1( root->next[child], tmp_word, tail );
}
/*
 * Print every word stored in the trie together with its count.
 *
 * The root carries no letter of its own, so the walk starts at each of its
 * children, every branch getting a fresh zeroed path buffer.
 */
void tree_walk(
    trieTreeNode *root )
{
    if ( root == NULL )
        return;

    for ( int branch = 0; branch < MAX_CHILD_NUM; ++branch ) {
        char path[20];

        memset( path, 0, sizeof path );
        tree_walk1( root->next[branch], path, 0 );
    }
}
/*
 * Trie word counter entry point.
 *
 * argv[1]: path of a text file with one word per line (letters A-Z / a-z)
 * argv[2]: how many top ranks to report (must be at least 1)
 *
 * Every word of the file is inserted into a trie, all words are printed
 * with their counts, and finally the most frequent ones are listed.
 *
 * Words longer than 19 letters do not fit the fixed 20-byte word buffers
 * used throughout this program; they are skipped with a message instead of
 * overflowing the buffer.  A last word without a trailing newline is still
 * counted.
 *
 * Returns 0 on success and EXIT_FAILURE on bad arguments or I/O errors.
 */
int main(
    int argc,
    char **argv )
{
    if ( argc != 3 ) {
        PRINT("USAGE: please input words file & top number");
        exit( 0 );
    }

    char *file_name = argv[1];
    int top_num = atoi( argv[2] );
    if ( top_num < 1 ) {
        PRINT("top number must be a positive integer, got:%s", argv[2]);
        return EXIT_FAILURE;
    }

    trieTree *tree = ( trieTree * )malloc( sizeof(trieTree) );
    if ( tree == NULL ) {
        PRINT("out of memory.");
        return EXIT_FAILURE;
    }

    tree->root = create_node( -1 );
    if ( tree->root == NULL ) {
        PRINT("out of memory.");
        return EXIT_FAILURE;
    }

    int fd = open(file_name, O_RDONLY);
    if ( fd < 0 ) {
        PRINT("OPEN FILE %s ERROR!!!(%s)", file_name, (char *)strerror(errno));
        return EXIT_FAILURE;
    }

    // Chunk buffer for read().
    char buf[1024];

    // Word currently being assembled from the text between two '\n'.
    char tmp_word[20] = {0};

    // Write position in tmp_word.  It survives across read() calls so a
    // word split over two chunks is stitched back together.
    int tmp_index = 0;

    // Set once the current line has outgrown tmp_word; the rest of that
    // line is then ignored.
    int too_long = 0;

    ssize_t read_bytes = 0;

    while ( ( read_bytes = read( fd, buf, sizeof buf ) ) > 0 ) {
        for ( ssize_t cur = 0; cur < read_bytes; cur++ ) {
            if ( buf[cur] == '\n' ) {
                if ( too_long ) {
                    PRINT("word longer than %d letters skipped.",
                        (int)sizeof tmp_word - 1);
                } else if ( tmp_index > 0 ) {
                    tmp_word[tmp_index] = '\0';
                    insert( tree->root, tmp_word );
                }
                tmp_index = 0;
                too_long = 0;
            } else if ( tmp_index < (int)sizeof tmp_word - 1 ) {
                tmp_word[tmp_index] = buf[cur];
                tmp_index++;
            } else {
                too_long = 1;
            }
        }
    }

    int read_failed = 0;
    if ( read_bytes < 0 ) {
        PRINT("READ FILE %s ERROR!!!(%s)", file_name, (char *)strerror(errno));
        read_failed = 1;
    }

    close( fd );

    if ( read_failed )
        return EXIT_FAILURE;

    // The file ended without a final '\n': count the pending word too.
    if ( too_long ) {
        PRINT("word longer than %d letters skipped.",
            (int)sizeof tmp_word - 1);
    } else if ( tmp_index > 0 ) {
        tmp_word[tmp_index] = '\0';
        insert( tree->root, tmp_word );
    }

    printf("\n========================================\n");
    tree_walk( tree->root );

    find_top_count( tree->root, top_num );

    return 0;
}

trie樹的代碼使用:./xxx word.txt 10即統計出現次數前10的單詞,並打印單詞和次數

例如對生成了10000個單詞的word.txt文件,統計前5:
./xxx word.txt 5
這裏寫圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章