一、實驗原理
哈夫曼編碼(Huffman Coding),又稱霍夫曼編碼或最佳碼,是可變字長編碼(VLC)的一種,屬於無損壓縮。該方法完全依據字符出現概率來構造碼字:出現概率大的符號碼長短,出現概率小的符號碼長長,從而有效地減小平均碼長。對於概率分佈相差大的信源壓縮效率高,而對於接近等概分佈的信源壓縮效率低。
實際實現中常用二叉樹來表示編碼過程。節點需要表示的信息有:它的概率;它是否爲葉子節點——不是葉子節點則表示它是一個中間節點,有左右子節點,是葉子節點則有一個符號;它的父節點,用於建立碼樹。由於huffman碼爲變長碼,不能事先預留空間,所以用指針來表示它的碼字序列,還需指出它所用的比特位數,爲了後續輸出碼表還添加了概率。
每個節點的數據結構爲:
// One node of the Huffman tree. A node is either a leaf, which carries a
// symbol, or an internal node, which carries two children; the two
// alternatives share storage through the anonymous union.
typedef struct huffman_node_tag
{
unsigned char isLeaf;// non-zero when this node is a leaf
unsigned long count;// number of occurrences of the symbol
struct huffman_node_tag *parent;// pointer to the parent node
union
{
struct
{
struct huffman_node_tag *zero, *one;// children followed for a 0 bit / a 1 bit
};
unsigned char symbol;// the symbol (leaf nodes)
};
} huffman_node;
每個碼字的數據結構爲:
// The code word assigned to one symbol.
typedef struct huffman_code_tag
{
//add by zhn
int count;// occurrence count of the symbol; write_table divides it by the total to get a probability
//end add
/* The length of this code in bits. */
unsigned long numbits;// number of bits in the code
unsigned char *bits;// the code bits; get_bit reads bit i from bits[i / 8] at position i % 8
} huffman_code;
它的編碼步驟爲
1)統計各符號出現的次數,並按照出現概率從大到小依次排列。
2)每次取概率最小的兩個節點,合併概率,生成父節點,用父節點代替這兩個子節點重新排序,直到根結點。
3)分配碼字,二叉樹的左節點爲0,右節點爲1,從根到葉子結點遍歷得到碼字。
二、實驗步驟
1.huffman編碼流程
1)讀入文件
char memory = 0; // 1 means the coding is done on memory
char compress = 1;// 1 means compress, 0 means decompress
int opt;
//add by zhn
const char *file_in = NULL, *file_out = NULL;
const char *file_table=NULL;
FILE *in = stdin;// standard input
FILE *out = stdout;// standard output
//add by zhn
FILE *table;// stream for the code-table output
while((opt = getopt(argc, argv, "i:o:t:cdhvm")) != -1)// parse argc/argv; a letter followed by a colon requires an argument
{
switch(opt)
{
case 'i':
file_in = optarg;
break;
case 'o':
file_out = optarg;
break;
//add by zhn
case 't':
file_table = optarg;
break;
case 'c':
compress = 1;
break;
case 'd':
compress = 0;
break;
case 'h':
usage(stdout);
return 0;
case 'v':
version(stdout);
return 0;
case 'm':
memory = 1;
break;
default:
usage(stderr);
return 1;
}
}
2)統計各個字符出現的概率
#define MAX_SYMBOLS 256// there are 256 symbols in total
// Array of MAX_SYMBOLS node pointers; indexed by symbol value while
// get_symbol_frequencies counts, and later sorted by count.
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
// Array of MAX_SYMBOLS code pointers, indexed by symbol value.
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];
/* Reset every slot of the frequency table to "no node yet". */
static void
init_frequencies(SymbolFrequencies *pSF)
{
    unsigned int slot;

    for (slot = 0; slot < MAX_SYMBOLS; ++slot)
        (*pSF)[slot] = NULL;
}
/*
 * Allocate a leaf node for `symbol` with a count of zero and no parent.
 * The caller owns the returned node.
 * NOTE(review): the malloc result is used without a NULL check.
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *leaf = (huffman_node*)malloc(sizeof *leaf);

    leaf->parent = NULL;
    leaf->count = 0;
    leaf->symbol = symbol;
    leaf->isLeaf = 1;
    return leaf;
}
/*
 * Read `in` to EOF and count how often each byte value occurs.
 *
 * The table is cleared first; a leaf node is created the first time a
 * byte value is seen and its count is incremented on every occurrence.
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int total_count = 0;
    int ch;

    init_frequencies(pSF);

    for (ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = (unsigned char)ch;
        huffman_node *leaf = (*pSF)[sym];

        /* First occurrence of this byte value: create its leaf. */
        if (leaf == NULL)
        {
            leaf = new_leaf_node(sym);
            (*pSF)[sym] = leaf;
        }
        leaf->count += 1;
        total_count += 1;
    }
    return total_count;
}
3)建立huffman樹
/*
 * qsort comparator over an array of huffman_node pointers.
 *
 * Orders nodes by ascending count; NULL entries compare greater than any
 * node, so they collect at the end of the sorted array.
 */
static int
SFComp(const void *p1, const void *p2)
{
    const huffman_node *lhs = *(const huffman_node * const *)p1;
    const huffman_node *rhs = *(const huffman_node * const *)p2;

    /* At least one NULL: the NULL side sorts last, two NULLs are equal. */
    if (lhs == NULL || rhs == NULL)
        return (lhs == NULL) - (rhs == NULL);

    return (lhs->count > rhs->count) - (lhs->count < rhs->count);
}
/*
 * Walk `subtree` and store a code (built by new_code) for every leaf in
 * the encoder table, indexed by the leaf's symbol.
 *
 * Internal nodes are traversed zero-child first, then one-child; a NULL
 * subtree is ignored.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if (subtree == NULL)
        return;

    if (!subtree->isLeaf)
    {
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    (*pSF)[subtree->symbol] = new_code(subtree);
}
/*
 * Build the Huffman tree from the frequency table and derive the code of
 * every symbol.
 *
 * On return (*pSF)[0] is the root of the tree (NULL when no symbol was
 * counted) and all other slots among the first n are NULL.
 *
 * Returns a newly allocated SymbolEncoder that the caller must release,
 * or NULL if the allocation fails. With an empty frequency table the
 * returned encoder has no entries.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

    /* Sort ascending by count; NULL slots end up at the back. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

    /* Count the symbols that actually occur. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * n symbols need n - 1 merges. The bound is written as i + 1 < n
     * because n is unsigned: "i < n - 1" wraps around when n == 0 and
     * would dereference the NULL slots of an empty table.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Re-sort so the merged node lands at its count position and
         * the vacated slot moves to the end. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)calloc(1, sizeof *pSE);
    if(pSE == NULL)
        return NULL;
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}
4)將碼錶和其他信息寫入輸出文件
/*
 * Write the code-table header and entries to `out`.
 *
 * Layout: entry count, then the number of encoded bytes (both converted
 * with htonl and written as unsigned int, the width read_code_table
 * reads back), then for each symbol that has a code: one symbol byte,
 * one code-length byte, and the code bytes.
 *
 * Returns 0 on success, 1 on any write error.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned int i;
    unsigned int count = 0;
    unsigned int count_be;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /*
     * Write the number of entries in network byte order. The value is
     * held in an unsigned int: writing an unsigned long here would emit
     * 8 bytes where long is 64-bit, while the reader consumes only
     * sizeof(unsigned int).
     */
    count_be = htonl(count);
    if(fwrite(&count_be, sizeof(count_be), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* Write the 1 byte code bit length. */
            if(fputc((int)p->numbits, out) == EOF)
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }
    return 0;
}
多數計算機(如x86)存儲多字節整數時採用小端字節序,即低位字節在前、高位字節在後;而htonl函數把主機字節序轉換爲網絡字節序(大端,高位字節在前),轉換後的數據再寫入文件,解碼時用ntohl轉換回主機字節序。
5)對文件進行編碼
/*
 * Return bit number `i` (0 or 1) of the bit array `bits`.
 * Bit i lives in byte i / 8, at bit position i % 8 counted from the
 * least significant bit.
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    const unsigned long byte_index = i >> 3;
    const unsigned int shift = (unsigned int)(i & 7u);

    return (unsigned char)((bits[byte_index] >> shift) & 1u);
}
/*
 * Encode `in` to `out`: for every input byte, append the bits of its
 * code to an output byte that is filled from the least significant bit
 * upwards and written once it holds 8 bits. A final partially filled
 * byte is written as is. Always returns 0.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char pending = 0;   /* output byte being assembled */
    unsigned char filled = 0;    /* number of bits already in pending */
    int ch;

    for (ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        const huffman_code *code = (*se)[(unsigned char)ch];
        unsigned long bit;

        for (bit = 0; bit < code->numbits; ++bit)
        {
            if (get_bit(code->bits, bit))
                pending |= (unsigned char)(1u << filled);
            ++filled;

            /* Byte complete: emit it and start a fresh one. */
            if (filled == 8)
            {
                fputc(pending, out);
                pending = 0;
                filled = 0;
            }
        }
    }

    /* Flush the bits of a last code that did not end on a byte boundary. */
    if (filled != 0)
        fputc(pending, out);

    return 0;
}
6)釋放碼樹
/*
 * Free `subtree` and every node below it. Children are only followed
 * for non-leaf nodes; a NULL subtree is a no-op.
 */
static void
free_huffman_tree(huffman_node *subtree)
{
    if (subtree != NULL)
    {
        if (subtree->isLeaf == 0)
        {
            free_huffman_tree(subtree->zero);
            free_huffman_tree(subtree->one);
        }
        free(subtree);
    }
}
/*
 * Release every code stored in the encoder table (via free_code) and
 * then the table itself.
 */
static void
free_encoder(SymbolEncoder *pSE)
{
    huffman_code **slot = *pSE;
    huffman_code **const end = slot + MAX_SYMBOLS;

    for (; slot != end; ++slot)
    {
        if (*slot != NULL)
            free_code(*slot);
    }
    free(pSE);
}
2.huffman解碼流程
1)讀入解碼文件
2)讀取碼樹
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
huffman_node *root = new_nonleaf_node(0, NULL, NULL);//建立根節點
unsigned int count;
if(fread(&count, sizeof(count), 1, in) != 1)//讀取符號數,如果讀取失敗則不解碼直接返回
{
free_huffman_tree(root);
return NULL;
}
count = ntohl(count);
if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)//讀取總的解碼出來的文件字節數
{
free_huffman_tree(root);
return NULL;
}
*pDataBytes = ntohl(*pDataBytes);
/* Read the entries. */
while(count-- > 0)//文件指向碼錶,第一項爲信源符號,第二項爲碼字長度,碼字
{
int c;
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if((c = fgetc(in)) == EOF)//信源符號
{
free_huffman_tree(root);
return NULL;
}
symbol = (unsigned char)c;
if((c = fgetc(in)) == EOF)//碼長
{
free_huffman_tree(root);
return NULL;
}
numbits = (unsigned char)c;
numbytes = (unsigned char)numbytes_from_numbits(numbits);//碼長轉化爲字節大小
bytes = (unsigned char*)malloc(numbytes);
if(fread(bytes, 1, numbytes, in) != numbytes)
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
for(curbit = 0; curbit < numbits; ++curbit)
{
if(get_bit(bytes, curbit))//如果當前碼字爲1,則建立一個右節點
{
if(p->one == NULL)//如果右節點不存在,則新建右節點
{
p->one = curbit == (unsigned char)(numbits - 1)//如果是當前的比特位到達了最後則建立一個葉子節點,沒有則建立一箇中間節點
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->one->parent = p;
}
p = p->one;//把當前節點當成中間節點,以便建立後面節點
}
else//當前碼字爲0,則建立左節點
{
if(p->zero == NULL)
{
p->zero = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root;
}
3)解碼文件
/*
 * Decode the Huffman-coded stream `in` and write the symbols to `out`.
 *
 * The code table is read first; each following byte is then consumed
 * from its least significant bit upwards, stepping through the tree and
 * emitting a symbol whenever a leaf is reached, until the recorded
 * number of symbols has been written or the input ends.
 *
 * Returns 0 on success, 1 if the code table cannot be read or the data
 * contains a bit path that does not exist in the tree.
 * NOTE(review): input that ends before all symbols are decoded still
 * returns 0, as before.
 */
int
huffman_decode_file(FILE *in, FILE *out)
{
    huffman_node *root, *p;
    int c;
    int rc = 0;
    unsigned int data_count;

    /* Read the Huffman code table. */
    root = read_code_table(in, &data_count);
    if(!root)
        return 1;

    /* Decode the file. */
    p = root;
    while(rc == 0 && data_count > 0 && (c = fgetc(in)) != EOF)
    {
        unsigned char byte = (unsigned char)c;
        unsigned char mask = 1;   /* becomes 0 after the eighth shift */

        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* No node for this bit: the stream does not match the
             * table, so stop instead of dereferencing NULL. */
            if(p == NULL)
            {
                rc = 1;
                break;
            }

            /* Leaf reached: emit its symbol and restart at the root. */
            if(p->isLeaf)
            {
                fputc(p->symbol, out);
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    return rc;
}
三、實驗結果
輸出碼錶的程序爲:
/*
 * Write a human-readable code table to `out`: a header line, then one
 * tab-separated line per symbol that has a code, holding the symbol
 * value, its probability (count / symbol_count), the code length in
 * bits and the code as a string of 0/1 digits. Always returns 0.
 */
int write_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    int i;
    unsigned long j;   /* same type as numbits: no signed/unsigned compare */

    fprintf(out,"字符\t概率\t長度\t碼字\n"); /* header line */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            float chance = p->count/(double)symbol_count; /* probability */

            fprintf(out,"%d\t",i);           /* symbol value */
            fprintf(out,"%f\t",chance);      /* probability */
            /* numbits is unsigned long, so it must be printed with %lu;
             * %d with an unsigned long argument is undefined behaviour. */
            fprintf(out,"%lu\t",p->numbits); /* code length */
            for(j = 0; j < p->numbits; j++)  /* code, one bit per digit */
            {
                unsigned char c = get_bit(p->bits,j);
                fprintf(out,"%d",c);
            }
            fprintf(out,"\t\n");
        }
    }
    return 0;
}
test1的碼錶爲:
第一列表示信源符號,第二列爲概率,第三列爲碼長,第四列爲碼字
各種文件格式的編碼結果爲
四、結論
huffman編碼對於概率分佈不均勻、數值相差大的信源壓縮效果好,同時對於大量數據的壓縮效果好;而對於符號分佈接近等概或數據較少的情況,壓縮效果則比較差,甚至還可能出現壓縮後文件比壓縮前大的現象。