Huffman編碼的代碼計劃一直躺在我的Evernote裏面。這幾天正好是論文初稿的提交後的空窗期,就花兩天把這項todolist幹掉。
Huffman Coding 原理
Huffman Coding(霍夫曼編碼)是通信專業必學的一個知識點,在研僧期間老蔡《信息論》的課上也是再次強調了數遍。Huffman Coding在數據編碼領域裏面相當重要,在諸如數據壓縮、音頻編碼、圖像編碼中都得到了廣泛的應用,例如,MPEG1音頻標準的LayerIII、H.263視頻編碼標準中都使用Huffman Coding來進行數據壓縮。
Huffman Coding是由Huffman在1951年提出的。當時,Huffman和他在MIT信息論的同學需要選擇是完成學期報告還是期末考試。導師Fano給他們的學期報告題目是,查找最有效的二進制編碼。由於無法證明哪個已有編碼是最有效的,Huffman放棄了對已有編碼的研究,轉向新的探索,最終發現了基於有序頻率二叉樹編碼的想法,並很快證明了這個方法的有效性。Huffman Coding的具體文獻如[1]所示,如果你想要深入研究Huffman Coding,那麼最好研讀一番。
這個方法完全依據字符出現的概率來構造平均長度最短的碼字。具體過程如下:
- 先對各個字符出現的概率進行統計;
- 然後按照各個字符出現概率的大小排列,把最小的兩個概率相加,作爲新的概率和剩餘的概率重新排隊;
- 再把最小的兩個概率相加,再重新排隊,直到最後變成1。每次相加時都把“0”和“1”賦給相加的兩個概率,讀出時由該符號開始一直到最後的“1”。
Pseudo Code
begin
count frequencies of each single characters
sort them to non-decreasing sequence
create a leaf node (character, frequency c, left son = NULL, right son = NULL)
of the tree for each character and put nodes into queue F
while (|F|>=2) do
begin
pop the first two nodes (u1, u2) with the lowest
frequencies from sorted queue
create a node evaluated with sum of the chosen units,
successors are chosen units (eps, c(u1)+c(u2), u1, u2)
insert new node into queue
end
node evaluate with way from root to leaf node (left son 0, right son 1)
create output from coded input characters
end
C Code
#include<stdio.h>
#include<stdlib.h>
#include<conio.h>
#define MAX_TREE_HT 100
// One node of the Huffman tree. Leaves carry an input character; the
// internal nodes created by buildHuffmanTree carry the placeholder '$'.
typedef struct tagNode
{
char character; // symbol stored in this node
unsigned frequency; // weight of the node; the heap is ordered by this field
struct tagNode *left, *right; // children; newNode initialises both to NULL
}HNode;
// Array-backed binary min-heap of HNode pointers, ordered by frequency.
typedef struct tagHeap
{
unsigned size; // number of nodes currently stored in array
unsigned space; // number of slots allocated for array (set by createHHeap)
HNode **array; // heap storage; the children of index i are 2*i+1 and 2*i+2
}HHeap;
// Allocates a tree node holding `character` with weight `frequency`.
// Both child pointers start out NULL, so the node is a leaf until linked.
// The caller owns the returned node and releases it with free().
// Terminates the program with an error message if memory is exhausted,
// instead of dereferencing a NULL pointer.
HNode* newNode(char character, unsigned frequency)
{
    // The cast is kept so the file also builds with a C++ compiler.
    HNode* temp = (HNode*)malloc(sizeof *temp);
    if (temp == NULL)
    {
        fprintf(stderr, "newNode: out of memory\n");
        exit(EXIT_FAILURE);
    }
    temp->left = NULL;
    temp->right = NULL;
    temp->character = character;
    temp->frequency = frequency;
    return temp;
}
// Allocates an empty min-heap with room for `space` node pointers.
// The caller owns both the returned struct and its `array` member and
// releases each with free().
// Terminates the program with an error message if memory is exhausted,
// instead of dereferencing a NULL pointer.
HHeap* createHHeap(unsigned space)
{
    // The casts are kept so the file also builds with a C++ compiler.
    HHeap* HHeapX = (HHeap*)malloc(sizeof *HHeapX);
    if (HHeapX == NULL)
    {
        fprintf(stderr, "createHHeap: out of memory\n");
        exit(EXIT_FAILURE);
    }
    HHeapX->size = 0;
    HHeapX->space = space;
    HHeapX->array = (HNode**)malloc(HHeapX->space * sizeof *HHeapX->array);
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // that comes back NULL counts as a failure.
    if (HHeapX->array == NULL && space != 0)
    {
        fprintf(stderr, "createHHeap: out of memory\n");
        free(HHeapX);
        exit(EXIT_FAILURE);
    }
    return HHeapX;
}
// Exchanges the two node pointers that `a` and `b` point at.
void swapHNode(HNode** a,HNode** b)
{
    HNode* held = *b;
    *b = *a;
    *a = held;
}
// Sifts the node at `idx` down until neither child has a smaller
// frequency, restoring the min-heap property for the subtree rooted there.
void HHeapify(HHeap* HHeapX, int idx)
{
    for (;;)
    {
        int lchild = 2*idx + 1;
        int rchild = 2*idx + 2;
        int best = idx;

        // Pick the smallest of the node and its (existing) children.
        if ((lchild < HHeapX->size) && (HHeapX->array[lchild]->frequency < HHeapX->array[best]->frequency))
        {
            best = lchild;
        }
        if ((rchild < HHeapX->size) && (HHeapX->array[rchild]->frequency < HHeapX->array[best]->frequency))
        {
            best = rchild;
        }

        if (best == idx)
        {
            return; // already in place
        }

        swapHNode(&HHeapX->array[best], &HHeapX->array[idx]);
        idx = best; // continue from the slot the node moved into
    }
}
// Returns 1 when the heap holds exactly one node, 0 otherwise.
int isSizeOne(HHeap* HHeapX)
{
    if (HHeapX->size == 1)
    {
        return 1;
    }
    return 0;
}
// Removes and returns the node with the smallest frequency (the heap root).
// Returns NULL when the heap is empty; previously that case read an
// unset slot and wrapped the unsigned size counter around.
HNode* extractMin(HHeap* HHeapX)
{
    if (HHeapX->size == 0)
    {
        return NULL;
    }
    HNode* temp = HHeapX->array[0];
    // Move the last node to the root, shrink the heap, then sift it down.
    HHeapX->array[0] = HHeapX->array[HHeapX->size - 1];
    --HHeapX->size;
    HHeapify(HHeapX,0);
    return temp;
}
// Inserts `HNodeX` into the min-heap, keeping it ordered by frequency.
// The caller must make sure the heap still has a free slot
// (size < space); the function writes into array[size].
void insertHHeap(HHeap* HHeapX, HNode* HNodeX)
{
    // The new node conceptually starts in the first free slot.
    unsigned i = HHeapX->size;
    ++HHeapX->size;
    // Sift up: pull larger parents down until the right slot is found.
    // The loop must keep going while i > 0; the former `i > 1` test
    // stopped at index 1, so a node smaller than the root could never
    // reach index 0.
    while (i > 0)
    {
        unsigned parent = (i - 1) / 2;
        if (!(HNodeX->frequency < HHeapX->array[parent]->frequency))
        {
            break;
        }
        HHeapX->array[i] = HHeapX->array[parent];
        i = parent;
    }
    HHeapX->array[i] = HNodeX;
}
// Turns the unordered contents of the heap array into a valid min-heap by
// sifting down every node from the parent of the last element to the root.
void buildHHeap(HHeap* HHeapX)
{
    int last = HHeapX->size - 1;
    int i = (last-1)/2;
    while (i >= 0)
    {
        HHeapify(HHeapX, i);
        --i;
    }
}
// Prints the first `n` entries of `arr` back to back, followed by a newline.
void printArr(int arr[],int n)
{
    int k = 0;
    while (k < n)
    {
        printf("%d", arr[k]);
        ++k;
    }
    printf("\n");
}
// Returns non-zero when `root` has neither a left nor a right child.
int isLeaf(HNode* root)
{
    if (root->left)
    {
        return 0;
    }
    return !(root->right);
}
// Creates a heap with `size` slots, fills it with one leaf node per
// (character[i], frequency[i]) pair and orders it as a min-heap.
HHeap* createAndBuildHHeap(char character[], int frequency[], int size)
{
    HHeap* heap = createHHeap(size);
    int k = 0;
    while (k < size)
    {
        heap->array[k] = newNode(character[k], frequency[k]);
        ++k;
    }
    heap->size = size;
    buildHHeap(heap);
    return heap;
}
HNode* buildHuffmanTree(char character[], int frequency[], int size)
{
HNode *l, *r, *top;
HHeap* HHeap = createAndBuildHHeap(character, frequency, size);
while (!isSizeOne(HHeap))
{
l = extractMin(HHeap);
r = extractMin(HHeap);
top = newNode('$', l->frequency + r->frequency);
top->left = l;
top->right = r;
insertHHeap(HHeap, top);
}
return extractMin(HHeap);
}
// Walks the tree below `root` and prints "<character>: <code>" for every
// leaf. `arr` holds the bits of the path taken so far and `top` is the
// number of valid entries in it (0 for the initial call); a step to the
// left child records 0, a step to the right child records 1.
void printCodes(HNode* root, int arr[], int top)
{
    if (isLeaf(root))
    {
        printf("%c: ", root->character);
        printArr(arr, top);
        return;
    }
    if (root->left)
    {
        arr[top] = 0;
        printCodes(root->left, arr, top + 1);
    }
    if (root->right)
    {
        arr[top] = 1;
        printCodes(root->right, arr, top + 1);
    }
}
// Releases a Huffman tree (all nodes reachable from `node`).
static void freeHuffmanTree(HNode* node)
{
    if (node == NULL)
    {
        return;
    }
    freeHuffmanTree(node->left);
    freeHuffmanTree(node->right);
    free(node);
}
// Builds the Huffman tree for the given symbols and prints the code of
// every symbol to stdout.
//   character[i] / frequency[i] : the i-th symbol and its occurrence count
//   size                        : number of symbols; nothing is printed
//                                 when it is <= 0
void HuffmanCoding(char character[], int frequency[], int size)
{
    if (size <= 0)
    {
        return;
    }
    HNode* root = buildHuffmanTree(character, frequency, size);
    if (root == NULL)
    {
        return;
    }
    // Scratch buffer for the path bits; printCodes writes one entry per
    // tree level and does not check against MAX_TREE_HT.
    int arr[MAX_TREE_HT];
    printCodes(root, arr, 0);
    // The tree is not reachable by anyone else, so free it here.
    freeHuffmanTree(root);
}
// Counts how often each ASCII character occurs in the string `s`.
//   character[] / frequency[] : receive the distinct characters (in
//       ascending code order) and their counts; each needs room for up
//       to 128 entries
// Returns the number of distinct characters written. Bytes outside the
// 7-bit ASCII range are ignored, because the table (and the callers'
// buffers) only cover 128 symbols. A NULL string yields 0.
int countStrFreq(const char *s, char character[], int frequency[])
{
    // Use a table to count how often each character appears.
    int freq[128] = {0};
    if (s == NULL)
    {
        return 0;
    }
    for (; *s != '\0'; ++s)
    {
        // Go through unsigned char: plain char may be signed, and a
        // negative value must never be used as an array index.
        unsigned char byte = (unsigned char)*s;
        if (byte < 128)
        {
            freq[byte]++;
        }
    }
    int c = 0;
    for (int i = 0; i < 128; i++)
    {
        if (freq[i] != 0)
        {
            character[c] = (char)i;
            frequency[c] = freq[i];
            c++;
        }
    }
    return c;
}
// Demo driver: prints the Huffman code of every character of a fixed
// sample string.
int main(void)
{
    // Input string
    const char *str = "this is an example for huffman encoding";
    // ASCII has 128 characters, so the tables are sized to 128
    char cha[128];
    int freq[128]={0};
    // Count how often each character occurs in the string
    int val = countStrFreq(str,cha,freq);
    // Run the Huffman coding
    HuffmanCoding(cha, freq, val);
    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
    return 0;
}
C++ Code
在改寫C++代碼的時候遇到了兩個bug。
bug1:
在C++文件流處理中,當利用file.eof()函數來判斷文件末尾的時候,會出現文件末尾重複的現象,即原始爲abc,會變成abcc。這裏的解決方案是在while循環中加入if(file.eof()) break 來提前退出;
bug2:
當文件指針到達eof後,seekg()函數會失效,這個時候需要使用file.clear()函數來恢復流的狀態。
/****************************************************************
* Huffman coding algorithm Version 1.0
* Author: Sergey Tikhonov
* Modifier: Jeremy Lin
* Email: [email protected]
* Date: 2015.03.14 pm HQU
* More detail: http://blog.csdn.net/linj_m
****************************************************************/
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace std;
// A character together with its probability of occurrence in the input.
struct cNode
{
char ch; // character
float pro; // probability
};
// Node of the Huffman tree; the character and probability come from cNode.
struct treeNode: public cNode
{
char lcode; // code character appended when descending to the left child
char rcode; // code character appended when descending to the right child
treeNode *left; // left child
treeNode *right; // right child
};
static int nodeCompare(const void *elem1, const void *elem2)
{
const cNode a = *(cNode*)elem1;
const cNode b = *(cNode*)elem2;
if (a.pro < b.pro)
return 1;
else if(a.pro > b.pro)
return -1;
else
return 0;
}
class HCode
{
private:
int tsize; // table size (number of chars)
cNode *ptable; // table of probabilities
map<char, string> codes; // codeword for each char
public:
void enCode(const char* inputFilepath, const char* outputFilepath)
{
map<char, int> freqs; // frequency for each char from input text
int i;
// Opening input file
//
ifstream inputFile;
inputFile.open(inputFilepath, ifstream::in);
if (!inputFile)
{
cerr<<"error: unable to open input file: " << inputFilepath <<endl;
}
// Counting chars
//
// bug 1, use eof() to judge the end of file will bring error。
char ch; //char
unsigned total = 0;
while (true)
{
inputFile.get(ch);
if(inputFile.eof())
break;
freqs[ch]++;
total++;
}
tsize = (int)freqs.size();
// Building decreasing freqs table
//
ptable =new cNode[tsize];
//assert(ptable);
float ftot = float(total);
map<char, int>::iterator fi;
for (fi = freqs.begin(), i = 0; fi != freqs.end(); ++fi, ++i)
{
ptable[i].ch = (*fi).first;
ptable[i].pro = float((*fi).second)/ftot;
}
qsort(ptable, tsize, sizeof(cNode), nodeCompare);
// Encoding
//
EncHuffman();
// Opening output file
//
ofstream outputFile;
outputFile.open(outputFilepath, ofstream::out);
if (!outputFile)
{
cerr<<"error: unable to open output file: " << outputFilepath <<endl;
}
// Outputing ptable and codes
//
std::cout<<endl<<tsize<<endl;
outputFile<<tsize<<endl;
for (int i = 0; i < tsize; i++)
{
std::cout <<ptable[i].ch<<"\t"<<ptable[i].pro<<"\t"<<codes[ptable[i].ch].c_str()<<endl;
outputFile<<ptable[i].ch<<"\t"<<ptable[i].pro<<"\t"<<codes[ptable[i].ch].c_str()<<endl;
}
// Outputing encoded text
//
// bug 2, if inputfile's eofbit is ture,the seekg()function will out of work.
//so you have to use clear() to reset inputfile‘s state.
inputFile.clear();
inputFile.seekg(0,inputFile.beg);
std::cout<<endl;
outputFile<<endl;
while (true)
{
inputFile.get(ch);
if (inputFile.eof())
break;
std::cout<<codes[ch].c_str();
outputFile<<codes[ch].c_str();
}
std::cout<<endl;
// Cleaning
//
codes.clear();
delete[] ptable;
// Closing files
//
outputFile.close();
outputFile.clear();
inputFile.close();
inputFile.clear();
}
void Decode(const char* inputFilename, const char* outputFilename)
{
// Opening input file
ifstream inputFile;
inputFile.open(inputFilename);
if (!inputFile)
{
cerr<<"error: unable to open input file: " << inputFilename <<endl;
}
// Loading codes
//
inputFile>>tsize;
char ch, code[128];
float p;
int i;
inputFile.get();
for (i = 0; i < tsize; i++)
{
inputFile.get(ch);
inputFile>>p>>code;
codes[ch] = code;
inputFile.get();
}
inputFile.get();
// Opening output file
//
ofstream outputFile;
outputFile.open(outputFilename);
if (!outputFile)
{
cerr<<"error: unable to open output file: "<<outputFilename<<endl;
}
// Decoding and outputing to file
//
string accum = "";
map<char, string>::iterator ci;
while (true)
{
inputFile.get(ch);
if(inputFile.eof())
break;
accum += ch;
for (ci = codes.begin(); ci != codes.end(); ++ci)
{
if (!strcmp((*ci).second.c_str(), accum.c_str()))
{
accum = "";
std::cout<<(*ci).first;
outputFile<<(*ci).first;
}
}
}
std::cout<<endl;
// Cleaning
//
outputFile.close();
outputFile.clear();
inputFile.close();
inputFile.clear();
}
private:
void EncHuffman()
{
// Creating leaves (initial top-nodes)
//
treeNode *n;
vector<treeNode*> tops; // top-nodes
int i, numtop = tsize;
for (i = 0; i < numtop; i++)
{
n = new treeNode;
//assert(n);
n->ch = ptable[i].ch;
n->pro = ptable[i].pro;
n->left = NULL;
n->right = NULL;
tops.push_back(n);
}
// Building binary tree.
// Combining last two nodes, replacing them by new node
// without invalidating sort
//
while (numtop > 1)
{
n = new treeNode;
//assert(n);
n->pro = tops[numtop - 2]->pro + tops[numtop - 1]->pro;
n->left = tops[numtop - 2];
n->right = tops[numtop - 1];
if ( n->left->pro < n->right->pro)
{
n->lcode = '0';
n->rcode = '1';
}
else
{
n->lcode = '1';
n->rcode = '0';
}
tops.pop_back();
tops.pop_back();
bool isins = false;
std::vector<treeNode*>::iterator ti;
for ( ti = tops.begin(); ti != tops.end(); ++ti)
{
if ( (*ti)->pro < n->pro)
{
tops.insert(ti, n);
isins = true;
break;
}
}
if ( !isins)
tops.push_back(n);
numtop--;
}
// Building codes
//
treeNode *root = tops[0];
GenerateCode(root);
// Cleaning
//
DestroyNode(root);
tops.clear();
}
void GenerateCode( treeNode *node ) // for outside call: node is root
{
static string sequence = "";
if( node->left )
{
sequence += node->lcode;
GenerateCode( node->left );
}
if( node->right )
{
sequence += node->rcode;
GenerateCode( node->right );
}
if( !node->left && !node->right )
codes[node->ch] = sequence;
int l = (int)sequence.length();
if( l > 1 )
sequence = sequence.substr( 0, l-1 );
else
sequence = "";
}
void DestroyNode( treeNode *node) // for outside call: node is root
{
if (node->left)
{
DestroyNode(node->left);
delete node->left;
node->left = NULL;
}
if (node->right)
{
DestroyNode(node->right);
delete node->right;
node->right = NULL;
}
}
};
// Prints the program banner and usage help to std::cout, then terminates
// the process with exit status 0 (the function never returns).
int show_usage()
{
    static const char *const help[] = {
        "Huffman Coding Algorithm Version 1.0",
        " Modifier:Jeremy Lin 2015-03-14 @HQU",
        " Email:[email protected]",
        "",
        "Usage:",
        " huffman [OPTIONS] input [output]",
        " The defaul action is to encode the input file.",
        " -d\tDecode file.",
        "",
        "Examples:",
        " huffman input.txt",
        " huffman input.txt encoded.txt",
        " huffman -d encoded.txt"
    };
    const int lineCount = (int)(sizeof(help) / sizeof(help[0]));
    for (int k = 0; k < lineCount; ++k)
    {
        cout<<help[k]<<endl;
    }
    exit(0);
}
int main(int argc, char **argv)
{
int i = 1;
bool decFlag = false; // decode flag
char inputFilename[128];
char outputFilename[128];
if (argc < 2)
{
show_usage();
}
if (strcmp(argv[i],"-d") == 0)
{
decFlag = true;
++i;
if (i == argc)
{
show_usage();
}
}
strcpy(inputFilename, argv[i]);
++i;
if (i < argc)
{
strcpy(outputFilename, argv[i]);
}
else
{
if (decFlag) strcpy(outputFilename, "decoded.txt");
else strcpy(outputFilename, "encoded.txt");
}
// Calling encoding or decoding subroutine
//
HCode *pCoder;
pCoder = new HCode;
if (!pCoder)
{
cerr<<"error: unable to create a pointer to HCode"<<endl;
}
if (!decFlag)
{
pCoder->enCode(inputFilename, outputFilename);
}
else
{
pCoder->Decode(inputFilename, outputFilename);
}
delete pCoder;
return 0;
}
本文地址:http://blog.csdn.net/linj_m/article/details/44241543
更多資源 請關注博客: LinJM-機器視覺 微博:林建民-機器視覺
[1] Huffman, D.A., A method for the construction of minimum redundancy codes. Proceedings of the IRE, 1952. 40(9): p. 1098-1101.
[2] http://scanftree.com/Data_Structure/huffman-code