實驗3-huffman編解碼

一、實驗原理
哈夫曼編碼(Huffman Coding)，又稱霍夫曼編碼或最佳碼，是可變字長編碼(VLC)的一種，屬於無損壓縮。該方法完全依據字符出現概率來構造碼字：出現概率大的符號碼長短，出現概率小的符號碼長長，從而有效地減小平均碼長。對於概率分佈相差大的信源壓縮效率高，而對於接近等概分佈的信源壓縮效率低。
實際實現中常用二叉樹來表示編碼過程。每個節點需要表示的信息有：它的概率（出現次數）；它是否爲葉子節點——若是葉子節點則保存一個符號，若不是則爲中間節點，保存左右兩個子節點；以及指向父節點的指針，用於建立碼樹。由於huffman碼爲變長碼，不能事先預留固定空間，所以碼字結構用指針來表示碼字的比特序列，並記錄它所用的比特位數；爲了後續輸出碼錶，還添加了符號的出現次數。
每個節點的數據結構爲：

/* One node of the Huffman tree. A node is either a leaf, which carries a
 * symbol, or an internal node, which carries two child pointers; the two
 * cases share storage through the anonymous union, so isLeaf must be
 * checked before touching symbol or zero/one. */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;// non-zero when this node is a leaf
    unsigned long count;// number of occurrences of this symbol
    struct huffman_node_tag *parent;// pointer to the parent node

    union
    {
        struct
        {
            struct huffman_node_tag *zero, *one;// child nodes (internal nodes only)
        };
        unsigned char symbol;// the symbol (leaf nodes only)
    };
} huffman_node;

每個碼字的數據結構爲：

/* The code word assigned to one symbol. */
typedef struct huffman_code_tag
{
    // added by zhn
    int count;// occurrence count of the symbol (used when printing the code table)
    // end of addition
    /* The length of this code in bits. */
    unsigned long numbits;// number of bits in the code
    unsigned char *bits;// the code bits, packed into bytes
} huffman_code;

它的編碼步驟爲
1)統計各個符號出現的次數，並按照出現概率從小到大依次排列。
2)每次取概率最小的兩個節點，合併概率，生成父節點，用父節點代替這兩個子節點重新排序，直到根結點。
3)分配碼字，二叉樹的左節點爲0，右節點爲1，從根到葉子結點遍歷得到碼字。
二、實驗步驟
1.huffman編碼流程

1)讀入文件

    /* Command-line state. This excerpt only parses the options into the
     * variables below; opening the files happens outside the excerpt. */
    char memory = 0; // 1 selects in-memory encoding (per the original author's note)
    char compress = 1;// 1 = compress, 0 = decompress
    int opt;
    // added by zhn
    const char *file_in = NULL, *file_out = NULL;
    const char *file_table=NULL;
    FILE *in = stdin;// defaults to standard input
    FILE *out = stdout;// defaults to standard output
    // added by zhn
    FILE *table;// stream for the code-table output (not initialised in this excerpt)
    while((opt = getopt(argc, argv, "i:o:t:cdhvm")) != -1)// parse argc/argv; an option letter followed by ':' requires an argument
    {
        switch(opt)
        {
        case 'i':
            file_in = optarg;
            break;
        case 'o':
            file_out = optarg;
            break;
        // added by zhn
        case 't':
            file_table = optarg;
            break;
        case 'c':
            compress = 1;
            break;
        case 'd':
            compress = 0;
            break;
        case 'h':
            usage(stdout);
            return 0;
        case 'v':
            version(stdout);
            return 0;
        case 'm':
            memory = 1;
            break;
        default:
            usage(stderr);
            return 1;
        }
    }

2）統計各個字符出現的概率

#define MAX_SYMBOLS 256// number of distinct symbols; the tables below are indexed by an unsigned char value
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];
/* Clear the frequency table: every slot of *pSF is zero-filled, which the
 * rest of the code treats as "no node for this symbol yet". */
static void
init_frequencies(SymbolFrequencies *pSF)
{
    memset(pSF, 0, sizeof *pSF);
}
/*
 * Allocate a leaf node for `symbol`.
 *
 * The new node has count 0 and no parent. The caller owns the node; within
 * a tree it is released by free_huffman_tree().
 *
 * Returns the new node, or NULL if the allocation fails (the original code
 * wrote through the unchecked malloc() result).
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = (huffman_node*)malloc(sizeof *p);

    if(p == NULL)
        return NULL;

    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;
    p->parent = NULL;
    return p;
}
/*
 * Read `in` up to end-of-file and tally how often each byte value occurs.
 *
 * The table *pSF is cleared first. A leaf node is created for a byte value
 * the first time it is seen, and its count is incremented on every
 * occurrence. The result of new_leaf_node() is used without a NULL check.
 *
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int seen = 0;
    int ch;

    /* Start from an empty table: no symbol has a node yet. */
    init_frequencies(pSF);

    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        huffman_node **slot = &(*pSF)[(unsigned char)ch];

        /* First occurrence of this byte value: give it a leaf node. */
        if(*slot == NULL)
            *slot = new_leaf_node((unsigned char)ch);

        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}

3）建立huffman樹

/*
 * qsort() comparator for arrays of huffman_node pointers.
 *
 * Orders nodes by ascending count. NULL entries (symbols that never
 * occurred) compare greater than any node, so they collect at the end of
 * the array. A positive result places the second element before the
 * first; a negative result keeps the first element first.
 */
static int
SFComp(const void *p1, const void *p2)
{
    const huffman_node *lhs = *(const huffman_node *const *)p1;
    const huffman_node *rhs = *(const huffman_node *const *)p2;

    /* At least one NULL: NULL sorts last, two NULLs are equal. */
    if(lhs == NULL || rhs == NULL)
        return (lhs == NULL) - (rhs == NULL);

    /* Both present: three-way comparison of the counts. */
    return (lhs->count > rhs->count) - (lhs->count < rhs->count);
}
/*
 * Walk the tree rooted at `subtree` and fill the encoder table *pSF.
 *
 * Internal nodes are descended zero-child first, then one-child. Each leaf
 * reached gets a code from new_code(), stored at the index of its symbol.
 * A NULL subtree is ignored.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(!subtree)
        return;

    if(!subtree->isLeaf)
    {
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: this is where a code word is assigned. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}
/*
 * Build the Huffman tree from the frequency table and derive the encoder.
 *
 * On entry *pSF holds one leaf per symbol that occurred (NULL elsewhere).
 * The table is sorted by ascending count, then the two least frequent
 * entries are repeatedly merged under a new internal node until one root
 * remains in (*pSF)[0]. The array is reordered and modified in place.
 *
 * Returns a malloc()ed SymbolEncoder (release it with free_encoder()), or
 * NULL if that allocation fails. For empty input (no symbols) the returned
 * encoder has every entry NULL.
 *
 * NOTE(review): with exactly one distinct symbol the root is itself a leaf;
 * which code new_code() assigns in that case is not visible here - confirm.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

    /* Ascending sort by count; NULL slots end up at the tail. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

    /* Count the symbols that actually occurred. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /* n symbols need n - 1 merges. The bound is written as i + 1 < n
     * because n is unsigned: with n == 0 (empty input) "n - 1" would wrap
     * around and the loop would dereference NULL entries. */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

4）將碼錶和其他信息寫入輸出文件

/*
 * Write the file header and code table to `out`.
 *
 * Layout: the number of table entries as an unsigned int in network byte
 * order, then symbol_count (the number of bytes that will be encoded) as an
 * unsigned int in network byte order, then for every symbol that has a
 * code: one byte with the symbol, one byte with the code length in bits,
 * and the code bits packed into numbytes_from_numbits() bytes.
 *
 * The entry count is written through an unsigned int so that its size
 * matches the unsigned int that read_code_table() reads back; writing it
 * as unsigned long produced a different header size on platforms where
 * long is wider than int.
 *
 * Returns 0 on success, 1 if any write fails.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int net_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    net_count = htonl(count);
    if(fwrite(&net_count, sizeof(net_count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;

            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;

            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

常見的x86計算機在內存中採用小端字節序（低位字節在前，高位字節在後），而網絡字節序爲大端字節序（高位字節在前）。htonl函數把主機字節序轉換爲網絡字節序，在小端機器上即把高低字節交換後再寫入文件；解碼時用ntohl函數轉換回主機字節序。
5）對文件進行編碼

/*
 * Return bit number `i` (0 or 1) of the packed bit array `bits`.
 *
 * Bits are numbered from the least significant bit of bits[0]: bit i lives
 * in byte i / 8 at position i % 8.
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    const unsigned long byte_index = i >> 3;
    const unsigned int shift = (unsigned int)(i & 7UL);

    return (unsigned char)((bits[byte_index] >> shift) & 1u);
}
/*
 * Encode `in` to `out` using the code table `se`.
 *
 * Every input byte is replaced by its code; code bits are packed into
 * output bytes starting at the least significant bit. A final, partially
 * filled byte is written with its unused high bits left at zero.
 *
 * Returns 0 on success. Returns 1 if an input byte has no entry in `se`
 * (previously a NULL dereference) or if writing to `out` fails (previously
 * ignored).
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;
    unsigned char curbit = 0;
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* The table was built from an earlier pass over the input; a
         * byte without a code means the input does not match the table. */
        if(code == NULL)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= get_bit(code->bits, i) << curbit;

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

6）釋放碼樹

/*
 * Release every node of the tree rooted at `subtree`.
 *
 * Children are freed before their parent. Only internal nodes are
 * descended, because a leaf stores its symbol in the space the child
 * pointers would occupy. A NULL subtree is a no-op.
 */
static void
free_huffman_tree(huffman_node *subtree)
{
    if(subtree != NULL)
    {
        if(subtree->isLeaf == 0)
        {
            free_huffman_tree(subtree->zero);
            free_huffman_tree(subtree->one);
        }

        free(subtree);
    }
}
/*
 * Release an encoder table: free_code() is called on every non-NULL entry,
 * then the table itself is freed.
 */
static void
free_encoder(SymbolEncoder *pSE)
{
    huffman_code **slot = *pSE;
    huffman_code **const end = slot + MAX_SYMBOLS;

    for(; slot != end; ++slot)
    {
        if(*slot != NULL)
            free_code(*slot);
    }

    free(pSE);
}

2.huffman解碼流程

1）讀入解碼文件
2）讀取碼樹

static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);//建立根節點
    unsigned int count;
    if(fread(&count, sizeof(count), 1, in) != 1)//讀取符號數，如果讀取失敗則不解碼直接返回
    {
        free_huffman_tree(root);
        return NULL;
    }

    count = ntohl(count);
    if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)//讀取總的解碼出來的文件字節數
    {
        free_huffman_tree(root);
        return NULL;
    }
    *pDataBytes = ntohl(*pDataBytes);
    /* Read the entries. */
    while(count-- > 0)//文件指向碼錶，第一項爲信源符號，第二項爲碼字長度，碼字
    {
        int c;
        unsigned int curbit;
        unsigned char symbol;
        unsigned char numbits;
        unsigned char numbytes;
        unsigned char *bytes;
        huffman_node *p = root;

        if((c = fgetc(in)) == EOF)//信源符號
        {
            free_huffman_tree(root);
            return NULL;
        }
        symbol = (unsigned char)c;

        if((c = fgetc(in)) == EOF)//碼長
        {
            free_huffman_tree(root);
            return NULL;
        }

        numbits = (unsigned char)c;
        numbytes = (unsigned char)numbytes_from_numbits(numbits);//碼長轉化爲字節大小
        bytes = (unsigned char*)malloc(numbytes);
        if(fread(bytes, 1, numbytes, in) != numbytes)
        {
            free(bytes);
            free_huffman_tree(root);
            return NULL;
        }
        for(curbit = 0; curbit < numbits; ++curbit)
        {
            if(get_bit(bytes, curbit))//如果當前碼字爲1，則建立一個右節點
            {
                if(p->one == NULL)//如果右節點不存在，則新建右節點
                {
                    p->one = curbit == (unsigned char)(numbits - 1)//如果是當前的比特位到達了最後則建立一個葉子節點，沒有則建立一箇中間節點
                        ? new_leaf_node(symbol)
                        : new_nonleaf_node(0, NULL, NULL);
                    p->one->parent = p;
                }
                p = p->one;//把當前節點當成中間節點，以便建立後面節點
            }
            else//當前碼字爲0，則建立左節點
            {
                if(p->zero == NULL)
                {
                    p->zero = curbit == (unsigned char)(numbits - 1)
                        ? new_leaf_node(symbol)
                        : new_nonleaf_node(0, NULL, NULL);
                    p->zero->parent = p;
                }
                p = p->zero;
            }
        }

        free(bytes);
    }

    return root;
}

3）解碼文件

/*
 * Decode a Huffman-encoded stream from `in` and write the result to `out`.
 *
 * The code table is read first with read_code_table(), which also yields
 * the number of bytes to produce. The data bits are then consumed from the
 * least significant bit of each input byte, walking the tree from the root
 * and emitting a symbol each time a leaf is reached.
 *
 * Returns 0 when decoding stops normally (all expected bytes were produced
 * or the input ended). Returns 1 if the code table cannot be read, if the
 * bit stream follows a branch that does not exist in the tree (previously a
 * NULL dereference), or if writing to `out` fails (previously ignored).
 */
int
huffman_decode_file(FILE *in, FILE *out)
{
    huffman_node *root, *p;
    int c;
    unsigned int data_count;

    /* Read the Huffman code table. */
    root = read_code_table(in, &data_count);
    if(!root)
        return 1;

    /* Decode the file. */
    p = root;
    while(data_count > 0 && (c = fgetc(in)) != EOF)
    {
        unsigned char byte = (unsigned char)c;
        unsigned char mask = 1;/* becomes 0 after all eight bits are used */
        while(data_count > 0 && mask)
        {
            /* Follow the branch selected by the current bit. */
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* No such branch: the data does not match the code table. */
            if(p == NULL)
            {
                free_huffman_tree(root);
                return 1;
            }

            /* A leaf completes one symbol; emit it and restart at the root. */
            if(p->isLeaf)
            {
                if(fputc(p->symbol, out) == EOF)
                {
                    free_huffman_tree(root);
                    return 1;
                }
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    return 0;
}

三、實驗結果
輸出碼錶的程序爲：

/*
 * Print a human-readable code table to `out`.
 *
 * After a header line, one line is written per symbol that has a code:
 * the symbol value, its probability (count / symbol_count), the code
 * length in bits, and the code as a string of 0/1 digits, separated by
 * tabs.
 *
 * numbits is an unsigned long, so it is printed with %lu and the bit loop
 * uses an unsigned long index; the original passed it to %d, which is a
 * format/argument type mismatch. A symbol_count of 0 yields a probability
 * of 0 instead of dividing by zero.
 *
 * Always returns 0.
 */
int write_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    int i;
    unsigned long j;

    fprintf(out,"字符\t概率\t長度\t碼字\n");/* header line */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            /* probability of this symbol */
            float chance = symbol_count ? p->count/(double)symbol_count : 0;
            fprintf(out,"%d\t",i);/* symbol value */
            fprintf(out,"%f\t",chance);/* probability */
            fprintf(out,"%lu\t",p->numbits);/* code length */
            for(j=0;j<p->numbits;j++)/* code word, one bit per digit */
            {
                unsigned char c=get_bit(p->bits,j);
                fprintf(out,"%d",c);
            }
            fprintf(out,"\t\n");
        }
    }
    return 0;
}

test1的碼錶爲：

第一列表示信源符號，第二列爲概率，第三列爲碼長，第四列爲碼字
各種文件格式的編碼結果爲

四、結論
huffman編碼對於概率分佈不均勻、數值相差大的信源壓縮效果好，同時對於大量數據的壓縮效果也好；而對於符號分佈接近等概或數據量較少的信源，壓縮效果則比較差，甚至可能出現壓縮後文件比壓縮前大的現象。