Huffman Coding 原理與C/C++代碼

Huffman編碼的代碼計劃一直躺在我的Evernote裏面。這幾天正好是論文初稿提交後的空窗期,就花了兩天把這項todolist幹掉。


Huffman Coding 原理

Huffman Coding(霍夫曼編碼)是通信專業必學的一個知識點,在研僧期間老蔡《信息論》的課上也是再次強調了數遍。Huffman Coding在數據編碼領域裏面相當重要,在諸如數據壓縮、音頻編碼、圖像編碼中都得到了廣泛的應用,例如,MPEG1音頻標準的LayerIII、H.263視頻編碼標準中都使用Huffman Coding來進行數據壓縮。

Huffman Coding是由Huffman在1951年提出的。當時,Huffman和他在MIT信息論的同學需要選擇是完成學期報告還是期末考試。導師Fano給他們的學期報告題目是,查找最有效的二進制編碼。由於無法證明哪個已有編碼是最有效的,Huffman放棄了對已有編碼的研究,轉向新的探索,最終發現了基於有序頻率二叉樹編碼的想法,並很快證明了這個方法的有效性。Huffman Coding的具體文獻如[1]所示,如果你想要深入研究Huffman Coding,那麼最好研讀一番。

這個方法完全依據字符出現的概率來構造平均長度最短的碼字。具體過程如下:

  1. 先對各個字符出現的概率進行統計;
  2. 然後按照各個字符出現概率的大小排列,把最小的兩個概率相加,作爲新的概率和剩餘的概率重新排隊;
  3. 再把最小的兩個概率相加,再重新排隊,直到最後概率之和變成1。每次相加時都把“0”和“1”賦給相加的兩個概率;讀出時由該符號開始一直走到最後概率爲“1”的根節點,把沿途遇到的“0”和“1”按從低位到高位的順序排列,就得到該符號的Huffman碼字。

Huffman Coding的過程

Pseudo Code

begin
     count frequencies of each single character
     sort them to non-decreasing sequence
     create a leaf node (character, frequency c, left son = NULL, right son = NULL) 
     of the tree for each character and put nodes into queue F
     while (|F|>=2) do 
      begin
        pop the first two nodes (u1, u2) with the lowest 
          frequencies from sorted queue
        create a node evaluated with sum of the chosen units, 
          successors are chosen units (eps, c(u1)+c(u2), u1, u2)
        insert new node into queue
      end
     assign a code to each leaf by walking from the root to it (left son 0, right son 1)
     create output from coded input characters
end

C Code

#include<stdio.h>
#include<stdlib.h>
#include<conio.h>

/* Length of the int buffer that HuffmanCoding() hands to printCodes() to hold
 * the bits of the code currently being built (one int per tree level). */
#define MAX_TREE_HT 100

/*
 * One node of the Huffman tree.
 *
 * newNode() creates nodes with both children set to NULL; buildHuffmanTree()
 * links two existing nodes under a freshly created parent.
 */
typedef struct tagNode HNode;
struct tagNode
{
    char character;         /* symbol carried by the node */
    unsigned frequency;     /* weight used to order nodes in the heap */
    struct tagNode *left;   /* left child, or NULL */
    struct tagNode *right;  /* right child, or NULL */
};

/*
 * Array-backed binary min-heap of tree-node pointers, ordered by
 * HNode::frequency (see HHeapify()).
 */
typedef struct tagHeap HHeap;
struct tagHeap
{
    unsigned size;   /* number of array slots currently in use */
    unsigned space;  /* number of slots requested from createHHeap() */
    HNode **array;   /* heap storage; children of slot i are 2i+1 and 2i+2 */
};

HNode* newNode(char character, unsigned frequency)
{
    HNode* temp = (HNode*)malloc(sizeof(HNode));
    temp->left = NULL;
    temp->right = NULL;
    temp->character = character;
    temp->frequency = frequency;
    return temp;
}

HHeap* createHHeap(unsigned space)
{
    HHeap* HHeapX = (HHeap*)malloc(sizeof(HHeap));
    HHeapX->size = 0;
    HHeapX->space = space;
    HHeapX->array = (HNode**)malloc(HHeapX->space * sizeof(HNode*));
    return HHeapX;
}

/* Exchange the two node pointers that `a` and `b` point at. */
void swapHNode(HNode** a, HNode** b)
{
    HNode* const first = *a;
    HNode* const second = *b;

    *a = second;
    *b = first;
}

/*
 * Sift the node at slot `idx` down until neither child has a smaller
 * frequency. Both subtrees below `idx` are expected to be min-heaps already.
 */
void HHeapify(HHeap* HHeapX, int idx)
{
    for (;;)
    {
        int leftChild = 2*idx + 1;
        int rightChild = 2*idx + 2;
        int lowest = idx;

        /* Pick whichever of parent / left / right carries the lowest weight. */
        if ((leftChild < HHeapX->size) && (HHeapX->array[leftChild]->frequency < HHeapX->array[lowest]->frequency))
        {
            lowest = leftChild;
        }
        if ((rightChild < HHeapX->size) && (HHeapX->array[rightChild]->frequency < HHeapX->array[lowest]->frequency))
        {
            lowest = rightChild;
        }

        if (lowest == idx)
        {
            return;     /* parent is already the smallest: done */
        }

        swapHNode(&HHeapX->array[lowest], &HHeapX->array[idx]);
        idx = lowest;   /* continue from the slot the node moved to */
    }
}

/* Return 1 when the heap holds exactly one node, 0 otherwise. */
int isSizeOne(HHeap* HHeapX)
{
    if (HHeapX->size == 1)
    {
        return 1;
    }
    return 0;
}

/*
 * Remove and return the node with the lowest frequency (the heap root).
 *
 * Returns NULL when the heap is empty; previously that case wrapped
 * `size - 1` around and read an unset slot. Ownership of the returned node
 * passes to the caller.
 */
HNode* extractMin(HHeap* HHeapX)
{
    if (HHeapX->size == 0)
    {
        return NULL;
    }

    HNode* smallest = HHeapX->array[0];

    /* Move the last node to the root, shrink, then restore heap order. */
    --HHeapX->size;
    HHeapX->array[0] = HHeapX->array[HHeapX->size];
    HHeapify(HHeapX, 0);
    return smallest;
}

/*
 * Insert a node into the min-heap.
 *
 * The node starts in the first free slot and is sifted up while its
 * frequency is lower than its parent's. The loop must run down to the
 * children of the root (i > 0): stopping at i > 1 left a node at slot 1
 * uncompared with the root and broke the heap invariant.
 *
 * Precondition: the heap has a free slot (size < space); this function does
 * not grow the array.
 */
void insertHHeap(HHeap* HHeapX, HNode* HNodeX)
{
    int i = HHeapX->size;   /* first free slot */
    ++HHeapX->size;

    while ((i > 0) && HNodeX->frequency < HHeapX->array[(i-1)/2]->frequency)
    {
        /* Pull the heavier parent down and continue from its slot. */
        HHeapX->array[i] = HHeapX->array[(i-1)/2];
        i = (i-1)/2;
    }
    HHeapX->array[i] = HNodeX;
}

/*
 * Turn the first `size` slots of the array into a min-heap by sifting down
 * every slot from the parent of the last element back to the root.
 */
void buildHHeap(HHeap* HHeapX)
{
    int lastIndex = HHeapX->size - 1;
    int slot = (lastIndex - 1) / 2;

    while (slot >= 0)
    {
        HHeapify(HHeapX, slot);
        --slot;
    }
}

/* Print the first `n` entries of `arr` back to back, followed by a newline. */
void printArr(int arr[],int n)
{
    const int *cursor = arr;
    const int *end = arr + (n > 0 ? n : 0);

    while (cursor < end)
    {
        printf("%d", *cursor);
        ++cursor;
    }
    printf("\n");
}

/* Return 1 when the node has neither a left nor a right child, 0 otherwise. */
int isLeaf(HNode* root)
{
    if (root->left != NULL)
    {
        return 0;
    }
    return root->right == NULL;
}

/*
 * Create a min-heap holding one leaf node per (character, frequency) pair.
 *
 * character / frequency  parallel arrays with `size` entries each
 *
 * Returns the heap after buildHHeap() has ordered it; the caller owns it.
 */
HHeap* createAndBuildHHeap(char character[], int frequency[], int size)
{
    HHeap* heap = createHHeap(size);
    int k = 0;

    /* Fill the slots in input order first; ordering happens afterwards. */
    while (k < size)
    {
        heap->array[k] = newNode(character[k], frequency[k]);
        ++k;
    }
    heap->size = size;

    buildHHeap(heap);
    return heap;
}

HNode* buildHuffmanTree(char character[], int frequency[], int size)
{
    HNode *l, *r, *top;
    HHeap* HHeap = createAndBuildHHeap(character, frequency, size);

    while (!isSizeOne(HHeap))
    {
        l = extractMin(HHeap);
        r = extractMin(HHeap);
        top = newNode('$', l->frequency + r->frequency);
        top->left = l;
        top->right = r;
        insertHHeap(HHeap, top);
    }
    return extractMin(HHeap);
}

/*
 * Print "<character>: <bits>" for every leaf below `root`.
 *
 * arr  scratch buffer holding the bits of the path taken so far
 * top  number of valid entries in `arr`, i.e. the depth of `root`
 *
 * Going left records a 0 and going right records a 1.
 */
void printCodes(HNode* root, int arr[], int top)
{
    /* A leaf has no children to descend into, so it can be handled first. */
    if (isLeaf(root))
    {
        printf("%c: ", root->character);
        printArr(arr, top);
        return;
    }

    if (root->left != NULL)
    {
        arr[top] = 0;
        printCodes(root->left, arr, top + 1);
    }

    if (root->right != NULL)
    {
        arr[top] = 1;
        printCodes(root->right, arr, top + 1);
    }
}

/* Release a Huffman tree, children before parent. Accepts NULL. */
static void freeHuffmanTree(HNode* node)
{
    if (node == NULL)
    {
        return;
    }
    freeHuffmanTree(node->left);
    freeHuffmanTree(node->right);
    free(node);
}

/*
 * Build the Huffman tree for the given symbols and print every symbol's code.
 *
 * character / frequency  parallel arrays with `size` entries each
 *
 * Nothing is printed when no tree could be built. The tree is freed before
 * returning; it was leaked before.
 */
void HuffmanCoding(char character[], int frequency[], int size)
{
    HNode* root = buildHuffmanTree(character, frequency, size);
    int arr[MAX_TREE_HT], top = 0;

    if (root == NULL)
    {
        return;
    }

    printCodes(root, arr, top);
    freeHuffmanTree(root);
}

/*
 * Count how often each ASCII character occurs in a string.
 *
 * s          NUL-terminated input text
 * character  output: the distinct characters found, in ascending code order
 * frequency  output: frequency[k] is the number of occurrences of character[k]
 *
 * Both output arrays must have room for 128 entries. Bytes outside the
 * 7-bit ASCII range are skipped: with a signed `char` they used to index the
 * table with a negative value.
 *
 * Returns the number of distinct characters written.
 */
int countStrFreq(const char *s, char character[], int frequency[])
{
    /* Occurrence table indexed by character code. */
    int freq[128] = {0};

    for (; *s != '\0'; ++s)
    {
        unsigned char byte = (unsigned char)*s;
        if (byte < 128)
        {
            freq[byte]++;
        }
    }

    /* Compact the non-zero entries into the caller's parallel arrays. */
    int c = 0;
    for (int i = 0; i < 128; i++)
    {
        if (freq[i] != 0)
        {
            character[c] = (char)i;
            frequency[c] = freq[i];
            c++;
        }
    }
    return c;
}

/*
 * Demo entry point: prints the Huffman code of every character in a fixed
 * sample string. Returns 0.
 */
int main(void)
{
    /* Input string */
    const char *str = "this is an example for huffman encoding";

    /* ASCII has 128 characters, so both tables are sized 128 */
    char cha[128];
    int freq[128]={0};

    /* Count how often each character occurs in the string */
    int val = countStrFreq(str,cha,freq);

    /* Run the Huffman coding and print the codes */
    HuffmanCoding(cha, freq, val);

    /* Runs the shell command "pause" (a Windows cmd built-in) */
    system("pause");
    return 0;
}

C++ Code

在改寫C++代碼的時候遇到了兩個bug。
bug1:
在C++文件流處理中,當利用file.eof()函數來判斷文件末尾的時候,會出現文件末尾重複的現象,即原始爲abc,會變成abcc。這裏的解決方案是在while循環中加入if(file.eof()) break 來提前退出;
bug2:
當文件指針到達eof後,seekg()函數會失效,這個時候需要使用file.clear()函數來恢復流的狀態。

/****************************************************************
*   Huffman coding algorithm Version 1.0
*   Author: Sergey Tikhonov
*   Modifier: Jeremy Lin
*   Email: [email protected]
*   Date: 2015.03.14 pm HQU
*   More detail: http://blog.csdn.net/linj_m
****************************************************************/

#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace std;

/// One entry of the symbol table: a character and its relative frequency.
struct cNode
{
    /// the character
    char ch;
    /// its probability
    float pro;
};

/// Node of the Huffman tree: a table entry plus two child links and the code
/// character written on the edge to each child.
struct treeNode : cNode
{
    /// code character of the edge to the left child
    char lcode;
    /// code character of the edge to the right child
    char rcode;
    /// left child
    treeNode *left;
    /// right child
    treeNode *right;
};

static int nodeCompare(const void *elem1, const void *elem2)
{
    const cNode a = *(cNode*)elem1;
    const cNode b = *(cNode*)elem2;

    if (a.pro < b.pro)
        return 1;
    else if(a.pro > b.pro)
        return -1;
    else
        return 0;
}

class HCode
{
private:
    int tsize; // table size (number of chars)
    cNode *ptable; // table of probabilities
    map<char, string> codes; // codeword for each char

public:
    void enCode(const char* inputFilepath, const char* outputFilepath)
    {
        map<char, int> freqs; // frequency for each char from input text
        int i;

        // Opening input file
        //
        ifstream inputFile;
        inputFile.open(inputFilepath, ifstream::in);
        if (!inputFile)
        {
            cerr<<"error: unable to open input file: " << inputFilepath <<endl;
        }

        // Counting chars
        //
        // bug 1, use eof() to judge the end of file will bring error。
        char ch; //char
        unsigned total = 0;
        while (true)
        {
            inputFile.get(ch);
            if(inputFile.eof()) 
                break;
            freqs[ch]++;
            total++;
        }
        tsize = (int)freqs.size();


        // Building decreasing freqs table
        //
        ptable =new cNode[tsize];
        //assert(ptable);
        float ftot = float(total);
        map<char, int>::iterator fi;
        for (fi = freqs.begin(), i = 0; fi != freqs.end();  ++fi, ++i)
        {
            ptable[i].ch = (*fi).first;
            ptable[i].pro = float((*fi).second)/ftot;
        }
        qsort(ptable, tsize, sizeof(cNode), nodeCompare);

        // Encoding
        //
        EncHuffman();

        // Opening output file
        //
        ofstream outputFile;
        outputFile.open(outputFilepath, ofstream::out);
        if (!outputFile)
        {
            cerr<<"error: unable to open output file: " << outputFilepath <<endl;
        }

        // Outputing ptable and codes
        //
        std::cout<<endl<<tsize<<endl;
        outputFile<<tsize<<endl;
        for (int i = 0; i < tsize; i++)
        {
            std::cout <<ptable[i].ch<<"\t"<<ptable[i].pro<<"\t"<<codes[ptable[i].ch].c_str()<<endl;
            outputFile<<ptable[i].ch<<"\t"<<ptable[i].pro<<"\t"<<codes[ptable[i].ch].c_str()<<endl;
        }


        // Outputing encoded text
        //
        // bug 2, if inputfile's eofbit is ture,the seekg()function will out of work. 
        //so you have to use clear() to reset inputfile‘s state.
        inputFile.clear();  
        inputFile.seekg(0,inputFile.beg);
        std::cout<<endl;
        outputFile<<endl;
        while (true)
        {
            inputFile.get(ch);
            if (inputFile.eof())
                break;
            std::cout<<codes[ch].c_str();
            outputFile<<codes[ch].c_str();
        }
        std::cout<<endl;

        // Cleaning
        //
        codes.clear();
        delete[] ptable;

        // Closing files
        //
        outputFile.close();
        outputFile.clear();
        inputFile.close();
        inputFile.clear();
    }

    void Decode(const char* inputFilename, const char* outputFilename)
    {
        // Opening input file
        ifstream inputFile;
        inputFile.open(inputFilename);
        if (!inputFile)
        {
            cerr<<"error: unable to open input file: " << inputFilename <<endl;
        }

        // Loading codes
        //
        inputFile>>tsize;
        char ch, code[128];
        float p;
        int i;
        inputFile.get();
        for (i = 0; i < tsize; i++)
        {
            inputFile.get(ch);
            inputFile>>p>>code;
            codes[ch] = code;
            inputFile.get();
        }
        inputFile.get();

        // Opening output file
        //
        ofstream outputFile;
        outputFile.open(outputFilename);
        if (!outputFile)
        {
            cerr<<"error: unable to open output file: "<<outputFilename<<endl;
        }


        // Decoding and outputing to file
        //
        string accum = "";
        map<char, string>::iterator ci;
        while (true)
        {
            inputFile.get(ch);
            if(inputFile.eof())
                break;
            accum += ch;
            for (ci = codes.begin(); ci != codes.end(); ++ci)
            {
                if (!strcmp((*ci).second.c_str(), accum.c_str()))
                {
                    accum = "";
                    std::cout<<(*ci).first;
                    outputFile<<(*ci).first;
                }
            }
        }
        std::cout<<endl;


        // Cleaning
        //
        outputFile.close();
        outputFile.clear();
        inputFile.close();
        inputFile.clear();
    }

private:
    void EncHuffman()
    {
        // Creating leaves (initial top-nodes)
        //
        treeNode *n;
        vector<treeNode*> tops; // top-nodes
        int i, numtop = tsize;
        for (i = 0; i < numtop; i++)
        {
            n = new treeNode;
            //assert(n);
            n->ch = ptable[i].ch;
            n->pro = ptable[i].pro;
            n->left = NULL;
            n->right = NULL;
            tops.push_back(n);
        }

        // Building binary tree.
        // Combining last two nodes, replacing them by new node
        // without invalidating sort
        //
        while (numtop > 1)
        {
            n = new treeNode;
            //assert(n);
            n->pro = tops[numtop - 2]->pro + tops[numtop - 1]->pro;
            n->left = tops[numtop - 2];
            n->right = tops[numtop - 1];

            if ( n->left->pro < n->right->pro)
            {
                n->lcode = '0';
                n->rcode = '1';
            }
            else
            {
                n->lcode = '1';
                n->rcode = '0';
            }
            tops.pop_back();
            tops.pop_back();

            bool isins = false;
            std::vector<treeNode*>::iterator ti;
            for ( ti = tops.begin(); ti != tops.end(); ++ti)
            {
                if ( (*ti)->pro < n->pro)
                {
                    tops.insert(ti, n);
                    isins = true;
                    break;
                }
            }
            if ( !isins) 
                tops.push_back(n);
            numtop--;
        }

        // Building codes
        //
        treeNode *root = tops[0];
        GenerateCode(root);

        // Cleaning 
        // 
        DestroyNode(root);
        tops.clear();
    }

    void GenerateCode( treeNode *node ) // for outside call: node is root
    {
        static string sequence = "";
        if( node->left )
        {
            sequence += node->lcode;
            GenerateCode( node->left );
        }

        if( node->right )
        {
            sequence += node->rcode;
            GenerateCode( node->right );
        }

        if( !node->left && !node->right )
            codes[node->ch] = sequence;

        int l = (int)sequence.length();
        if( l > 1 ) 
            sequence = sequence.substr( 0, l-1 );
        else 
            sequence = "";
    }

    void DestroyNode( treeNode *node) // for outside call: node is root
    {
        if (node->left)
        {
            DestroyNode(node->left);
            delete node->left;
            node->left = NULL;
        }

        if (node->right)
        {
            DestroyNode(node->right);
            delete node->right;
            node->right = NULL;
        }
    }
};

/// Print the program banner and command-line help to standard output, then
/// terminate the process with exit status 0.
int show_usage()
{
    static const char *const lines[] = {
        "Huffman Coding Algorithm Version 1.0",
        "  Modifier:Jeremy Lin 2015-03-14 @HQU",
        "  Email:[email protected]",
        "",
        "Usage:",
        " huffman [OPTIONS] input [output]",
        " The defaul action is to encode the input file.",
        " -d\tDecode file.",
        "",
        "Examples:",
        " huffman input.txt",
        " huffman input.txt encoded.txt",
        " huffman -d encoded.txt"
    };
    const unsigned lineCount = sizeof(lines) / sizeof(lines[0]);

    for (unsigned k = 0; k < lineCount; ++k)
    {
        std::cout<<lines[k]<<std::endl;
    }
    exit(0);
}

int main(int argc, char **argv)
{
    int i = 1;
    bool decFlag = false;   // decode flag
    char inputFilename[128];  
    char outputFilename[128];

    if (argc < 2)
    {
        show_usage();
    }

    if (strcmp(argv[i],"-d") == 0)
    {
        decFlag = true;
        ++i;
        if (i == argc)
        {
            show_usage();
        }
    }

    strcpy(inputFilename, argv[i]);
    ++i;

    if (i < argc)
    {
        strcpy(outputFilename, argv[i]);
    }
    else
    {
        if (decFlag) strcpy(outputFilename, "decoded.txt");
        else         strcpy(outputFilename, "encoded.txt");
    }

    // Calling encoding or decoding subroutine
    // 
    HCode *pCoder;
    pCoder = new HCode;
    if (!pCoder)
    {
        cerr<<"error: unable to create a pointer to HCode"<<endl;
    }

    if (!decFlag)
    {
        pCoder->enCode(inputFilename, outputFilename);
    }
    else
    {
        pCoder->Decode(inputFilename, outputFilename);
    }
    delete pCoder;

    return 0;
}

這裏寫圖片描述

本文地址:http://blog.csdn.net/linj_m/article/details/44241543
更多資源 請關注博客: LinJM-機器視覺 微博:林建民-機器視覺

[1] Huffman, D.A., A method for the construction of minimum redundancy codes. Proceedings of the IRE, 1952. 40(9): p. 1098-1101.
[2] http://scanftree.com/Data_Structure/huffman-code

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章