哈夫曼壓縮&解壓縮
Ⅰ 前言
在之前的文章裏,我先介紹了如何構造哈夫曼樹及實現哈夫曼編碼,並用程序完成了這個部分。
【C語言->數據結構與算法】->樹與二叉樹概念&哈夫曼樹的構造
【C語言->數據結構與算法】->哈夫曼壓縮&解壓縮->第一階段->哈夫曼編碼&解碼的實現
這個程序的框架已經構架完成,可以完成最終的部分了。在第一階段中,我們完成了對任意字符串的編碼和解碼,現在要做的是,如何把這個字符串變成文件的內容。
我先把第一階段的函數部分代碼放在這裏。
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include "tyz.h"
#include "hufmanTree.h"
/*
 * Decode a text string of '0'/'1' characters back into the original
 * characters by walking the Huffman tree from the root.
 *
 * hufCode        - NUL-terminated string of '0' and '1' characters.
 * characterCount - number of leaves; leaves are nodes 0..characterCount-1
 *                  and the root is node 2*characterCount-2.
 * hufmanTreeNode - the node array built by creatHufmanTree().
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free, or NULL on invalid arguments / allocation failure.
 */
u8 *decoding(u8 *hufCode, u32 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u8 *decode = NULL;
    u32 i;
    u32 index = 0;
    u32 sum = 0;
    u32 root;
    u32 current;

    if (NULL == hufCode || NULL == hufmanTreeNode || 0 == characterCount) {
        return NULL;
    }
    root = 2 * characterCount - 2;

    /* The decoded length equals the sum of all leaf frequencies. */
    for (i = 0; i < characterCount; i++) {
        sum += hufmanTreeNode[i].attribute.frequency;
    }
    /* One extra zeroed byte keeps the result printable with %s. */
    decode = (u8 *) calloc((size_t) sum + 1, sizeof(u8));
    if (NULL == decode) {
        return NULL;
    }

    /* Only one distinct symbol: the root is a leaf and every code is empty. */
    if ((u32) -1 == hufmanTreeNode[root].leftChild) {
        memset(decode, hufmanTreeNode[root].attribute.character, sum);
        return decode;
    }

    current = root;
    for (i = 0; hufCode[i] && index < sum; i++) {
        /* '0' descends left, anything else descends right. */
        current = ('0' == hufCode[i])
                ? hufmanTreeNode[current].leftChild
                : hufmanTreeNode[current].rightChild;
        /* Emit a symbol only when a real leaf is reached, on either side. */
        if ((u32) -1 == hufmanTreeNode[current].leftChild) {
            decode[index++] = hufmanTreeNode[current].attribute.character;
            current = root;
        }
    }
    return decode;
}
/*
 * Release a buffer returned by coding() or decoding().
 * Passing NULL is allowed; free(NULL) is a no-op.
 */
void destoryCode(u8 *hufCode) {
    free(hufCode);
}
/*
 * Encode a NUL-terminated string as a text string of '0'/'1' characters.
 *
 * str            - input string.
 * orientate      - table mapping a byte value to its leaf index.
 * characterCount - number of leaves in hufmanTreeNode.
 * hufmanTreeNode - node array whose leaves already carry their codes.
 *
 * Returns a newly allocated, NUL-terminated code string that the caller
 * must free, or NULL on allocation failure.
 */
u8 *coding(u8 *str, u32 *orientate, u32 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u8 *code = NULL;
    u32 i;
    size_t total = 0;
    size_t used = 0;

    for (i = 0; i < characterCount; i++) {
        total += (size_t) hufmanTreeNode[i].attribute.frequency
                * strlen((const char *) hufmanTreeNode[i].hufmanCode);
    }
    /* +1 for the terminating NUL, which the original size did not include. */
    code = (u8 *) calloc(total + 1, sizeof(u8));
    if (NULL == code) {
        return NULL;
    }
    for (i = 0; str[i]; i++) {
        const u8 *symbolCode = hufmanTreeNode[orientate[str[i]]].hufmanCode;
        size_t length = strlen((const char *) symbolCode);

        /* Defensive: never write past the size computed from the frequencies. */
        if (used + length > total) {
            break;
        }
        /* Append at a tracked offset instead of rescanning with strcat. */
        memcpy(code + used, symbolCode, length);
        used += length;
    }
    return code;
}
/*
 * Recursively assign a code string to every leaf below `root`.
 * `code` is a scratch buffer holding the path taken so far ('0' = left,
 * '1' = right) and `index` is the current depth. On reaching a leaf the
 * path is terminated and copied into that leaf's hufmanCode buffer.
 */
void creatHufmanCode(u8 *code, u32 index, u32 root, HUFMAN_TREE_NODE *hufmanTreeNode) {
    HUFMAN_TREE_NODE *node = &hufmanTreeNode[root];

    if (-1 != node->leftChild) {
        /* Internal node: explore the left subtree, then the right one. */
        code[index] = '0';
        creatHufmanCode(code, index + 1, node->leftChild, hufmanTreeNode);
        code[index] = '1';
        creatHufmanCode(code, index + 1, node->rightChild, hufmanTreeNode);
        return;
    }
    /* Leaf: the path accumulated so far is this symbol's code. */
    code[index] = 0;
    strcpy(node->hufmanCode, code);
}
/*
 * Find the not-yet-visited node with the lowest frequency among the first
 * `count` nodes, mark it visited and return its index.
 * Ties keep the lowest index. Returns (u32)-1 when every node is already
 * visited; in that case nothing is modified.
 */
u32 searchMinimumNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    const u32 none = (u32) -1;
    u32 i;
    u32 minIndex = none;

    for (i = 0; i < count; i++) {
        if (FALSE != hufmanTreeNode[i].visited) {
            continue;
        }
        if (none == minIndex
                || hufmanTreeNode[minIndex].attribute.frequency > hufmanTreeNode[i].attribute.frequency) {
            minIndex = i;
        }
    }
    /* Guard the write: indexing with (u32)-1 would be out of bounds. */
    if (none != minIndex) {
        hufmanTreeNode[minIndex].visited = TRUE;
    }
    return minIndex;
}
/*
 * Build the internal nodes of the Huffman tree.
 * Leaves occupy indices 0..characterCount-1; each pass merges the two
 * lowest-frequency unvisited nodes into a new parent stored at index
 * characterCount+i, so the root ends up at 2*characterCount-2.
 * With zero or one leaf there is nothing to merge.
 */
void creatHufmanTree(u32 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 i;
    u32 leftChild;
    u32 rightChild;
    u32 count = characterCount;

    if (NULL == hufmanTreeNode) {
        return;
    }
    /* `i + 1 < count` cannot underflow when count is 0, unlike `i < count - 1`. */
    for (i = 0; i + 1 < count; i++) {
        leftChild = searchMinimumNode(count + i, hufmanTreeNode);
        rightChild = searchMinimumNode(count + i, hufmanTreeNode);
        hufmanTreeNode[count + i].visited = FALSE;
        hufmanTreeNode[count + i].hufmanCode = NULL;
        hufmanTreeNode[count + i].leftChild = leftChild;
        hufmanTreeNode[count + i].rightChild = rightChild;
        /* '@' is only a placeholder character for internal nodes. */
        hufmanTreeNode[count + i].attribute.character = '@';
        hufmanTreeNode[count + i].attribute.frequency =
                hufmanTreeNode[leftChild].attribute.frequency +
                hufmanTreeNode[rightChild].attribute.frequency;
    }
}
/*
 * Print the first `characterCount` entries of the node array as a table:
 * character, frequency, left child, right child and code (or "NULL").
 */
void showHufmanTreeNode(u32 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 row = 0;

    printf("字符 頻度 左孩子 右孩子 編碼\n");
    while (row < characterCount) {
        HUFMAN_TREE_NODE *node = &hufmanTreeNode[row];

        printf("%-5c %-5d %-7d %-7d %-10s\n",
                node->attribute.character,
                node->attribute.frequency,
                node->leftChild,
                node->rightChild,
                node->hufmanCode == NULL ? "NULL" : node->hufmanCode);
        row++;
    }
}
/*
 * Free the code buffers of the first `count` nodes and then the node array.
 * A NULL array is ignored.
 */
void destoryHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    if (hufmanTreeNode != NULL) {
        u32 slot = 0;

        while (slot < count) {
            free(hufmanTreeNode[slot].hufmanCode);
            slot++;
        }
        free(hufmanTreeNode);
    }
}
/*
 * Allocate the node array (2*characterCount-1 nodes) and initialise the
 * leaves from the frequency table.
 *
 * Each leaf gets a code buffer of characterCount bytes and child indices
 * of -1. orientate[c] is set to the leaf index of byte value c.
 *
 * Returns the array (free with destoryHufmanTreeNode), or NULL when
 * characterCount is 0, attributeList is NULL, or an allocation fails.
 */
HUFMAN_TREE_NODE *initHufmanTreeNode(u32 characterCount, u32 *orientate, ATTRIBUTE *attributeList) {
    u32 i;
    u32 nodeCount;
    HUFMAN_TREE_NODE *hufmanTreeNode;

    /* 0 * 2 - 1 would wrap around to a huge unsigned node count. */
    if (0 == characterCount || NULL == attributeList) {
        return NULL;
    }
    nodeCount = characterCount * 2 - 1;
    hufmanTreeNode = (HUFMAN_TREE_NODE *) calloc(nodeCount, sizeof(HUFMAN_TREE_NODE));
    if (NULL == hufmanTreeNode) {
        return NULL;
    }
    for (i = 0; i < characterCount; i++) {
        hufmanTreeNode[i].visited = FALSE;
        hufmanTreeNode[i].hufmanCode = (u8 *) calloc(characterCount, sizeof(u8));
        if (NULL == hufmanTreeNode[i].hufmanCode) {
            /* Undo the partial construction before reporting failure. */
            while (i > 0) {
                free(hufmanTreeNode[--i].hufmanCode);
            }
            free(hufmanTreeNode);
            return NULL;
        }
        hufmanTreeNode[i].leftChild = hufmanTreeNode[i].rightChild = -1;
        hufmanTreeNode[i].attribute = attributeList[i];
        orientate[attributeList[i].character] = i;
    }
    return hufmanTreeNode;
}
/* Print every (frequency, character) pair of the frequency table. */
void showAttributeList(u32 characterCount, ATTRIBUTE *attributeList) {
    u32 row = 0;

    while (row < characterCount) {
        ATTRIBUTE *entry = &attributeList[row];

        printf("頻度:%d 符號:%c\n", entry->frequency, entry->character);
        row++;
    }
}
/*
 * Release the frequency table.
 * Passing NULL is allowed; free(NULL) is a no-op.
 */
void destoryAttributeList(ATTRIBUTE *attributeList) {
    free(attributeList);
}
/*
 * Build the frequency table for a NUL-terminated string.
 *
 * ascii          - 256-entry counter array; occurrences are added to it.
 * characterCount - receives the number of byte values with a non-zero count.
 *
 * Returns a calloc'ed array with one entry per such byte value, in
 * ascending byte order.
 */
ATTRIBUTE *initAttributeList(u8 *str, u32 *ascii, u32 *characterCount) {
    ATTRIBUTE *attributeList;
    ATTRIBUTE *slot;
    const u8 *cursor;
    u32 value;
    u32 distinct = 0;

    /* Pass 1: count the occurrences of each byte. */
    for (cursor = str; *cursor; cursor++) {
        ascii[*cursor] += 1;
    }
    /* Pass 2: count how many distinct bytes occur. */
    for (value = 0; value < 256; value++) {
        if (ascii[value] != 0) {
            distinct++;
        }
    }
    *characterCount = distinct;
    attributeList = (ATTRIBUTE *) calloc(sizeof(ATTRIBUTE), distinct);

    /* Pass 3: emit one entry per byte that occurs. */
    slot = attributeList;
    for (value = 0; value < 256; value++) {
        if (0 == ascii[value]) {
            continue;
        }
        slot->character = (u8) value;
        slot->frequency = ascii[value];
        slot++;
    }
    return attributeList;
}
/*
 * Stage-1 demo: read one line, build its Huffman tree, print the tree,
 * the '0'/'1' code string and the string decoded back from it.
 * Returns 0 on success, -1 when no usable input was read.
 */
int main() {
    u8 str[128];
    u8 code[256];
    u8 *hufCode = NULL;
    u8 *decode = NULL;
    u32 ascii[256] = {0};
    u32 orientate[256] = {0};
    u32 characterCount;
    ATTRIBUTE *attributeList = NULL;
    HUFMAN_TREE_NODE *hufmanTreeNode = NULL;

    printf("請輸入字符串:\n");
    /* fgets bounds the read; gets() could overflow str and was removed in C11. */
    if (NULL == fgets((char *) str, sizeof(str), stdin)) {
        return -1;
    }
    /* fgets keeps the line terminator; strip it. */
    str[strcspn((char *) str, "\r\n")] = 0;
    if (0 == str[0]) {
        printf("empty input, nothing to encode\n");
        return -1;
    }

    attributeList = initAttributeList(str, ascii, &characterCount);
    showAttributeList(characterCount, attributeList);
    hufmanTreeNode = initHufmanTreeNode(characterCount, orientate, attributeList);
    creatHufmanTree(characterCount, hufmanTreeNode);
    creatHufmanCode(code, 0, 2*characterCount-2, hufmanTreeNode);
    printf("Hufman Tree Below\n");
    showHufmanTreeNode(2*characterCount-1, hufmanTreeNode);

    hufCode = coding(str, orientate, characterCount, hufmanTreeNode);
    printf("Hufman Code Below\n");
    printf("%s\n", NULL == hufCode ? "" : (char *) hufCode);

    decode = decoding(hufCode, characterCount, hufmanTreeNode);
    printf("Hufman Decode Below\n");
    printf("%s\n", NULL == decode ? "" : (char *) decode);

    destoryCode(hufCode);
    destoryCode(decode);
    destoryAttributeList(attributeList);
    destoryHufmanTreeNode(characterCount, hufmanTreeNode);
    return 0;
}
我們現在需要思考的是,在對文件進行編碼壓縮解壓縮的過程中,和對字符串進行這個操作的過程中,哪些部分是共通的?哪些部分是需要改變的?
Ⅱ 需求分析&主函數帶參的應用
A. 需求分析
在用戶要壓縮文件時,我們需要用戶輸入一個TA需要壓縮的文件名,要生成的文件名可輸入可不輸入。
在用戶要解壓縮文件時,我們需要用戶輸入TA需要解壓縮的文件,並輸入要生成的文件名,因爲我們不知道用戶的文件到底是什麼類型的,所以需要用戶自己輸入要生成的文件名包括擴展名,以此來生成他需要的文件類型。
根據這兩個需求,我們如何讓用戶做到這一點呢?答案便是主函數帶參。
若對這個知識點有疑問的同學可以看我下面這篇文章,可以大概瞭解這裏的內容。
【C語言->數據結構與算法】->關於主函數帶參
B. 壓縮部分
如果用戶沒有輸入要生成的文件名,按照其他的壓縮軟件的規則,會生成一個和源文件名字相同但是擴展名不同的文件。
比如我要壓縮下面這個文件👇
則會彈出以下窗口,如果我沒有改變文件名,默認名字就是源文件名+壓縮軟件獨有的擴展名。
所以我們就需要一個自己的擴展名。我用宏定義了一個我的擴展名👇
#define TYZ_COP_EXTENSION ".TyzHuf"
在用戶輸入一個需要壓縮的文件名後,我們需要先檢測這個文件是否存在,由於這個函數可能會比較常用,所以我把它放在我常用的頭文件對應的.c文件tyz.c中👇
/*
 * Report whether `filename` can be opened for reading.
 * Returns TRUE when fopen succeeds (the stream is closed again), FALSE otherwise.
 */
boolean isFileExist(const char *filename) {
    FILE *probe = fopen(filename, "r");

    if (probe != NULL) {
        fclose(probe);
        return TRUE;
    }
    return FALSE;
}
如果源文件存在,接下來就是對要生成的文件進行操作。若用戶沒有輸入要生成的文件,則我們需要一個源文件名+我們自己的擴展名的文件名,若用戶輸入了一個要生成的文件名,則我們需要將用戶輸入的文件名的擴展名改爲自己的擴展名。
函數如下,我仍將其放在tyz.c中👇
/*
 * Build a target file name by replacing the extension of sourceFilename
 * with extensionName.
 *
 * - The extension is everything from the LAST dot of the final path
 *   component; dots in directory names and a leading dot of the file name
 *   (e.g. ".bashrc") are not treated as extension separators.
 * - If there is no extension, extensionName is appended.
 * - A non-empty extensionName without a leading '.' gets one added.
 *
 * targetFilename must be large enough for the result; it is returned.
 */
char *creatFilename(const char *sourceFilename, const char *extensionName, char *targetFilename) {
    size_t index;
    size_t nameStart = 0;
    size_t dotIndex = 0;
    size_t stemLength;
    int hasDot = 0;

    for (index = 0; sourceFilename[index]; index++) {
        char c = sourceFilename[index];

        if ('/' == c || '\\' == c) {
            /* New path component: forget dots seen in directory names. */
            nameStart = index + 1;
            hasDot = 0;
        } else if ('.' == c && index > nameStart) {
            dotIndex = index;
            hasDot = 1;
        }
    }
    /* After the loop `index` is the full length of sourceFilename. */
    stemLength = hasDot ? dotIndex : index;

    /* memmove tolerates targetFilename aliasing sourceFilename. */
    memmove(targetFilename, sourceFilename, stemLength);
    targetFilename[stemLength] = 0;

    if (0 != extensionName[0] && '.' != extensionName[0]) {
        strcat(targetFilename, ".");
    }
    strcat(targetFilename, extensionName);
    return targetFilename;
}
由於一個文件名中可以有很多點,所以我們只取最後一個點,將它後面的內容視爲擴展名並替換掉。
所以壓縮部分的主函數代碼如下👇
/*
 * Entry point of the compressor.
 * Usage: hufCompress <source> [target]
 * Without a target name, the source name with its extension replaced by
 * TYZ_COP_EXTENSION is used.
 * Returns 0 on success, -1 on bad usage, -2 if the source cannot be
 * opened, -3 if a file name does not fit the local buffers.
 */
int main(int argc, char const *argv[]) {
    char sourceFilename[200];
    char targetFilename[200];

    if (argc != 2 && argc != 3) {
        printf("用法:hufCompress 源文件 目標文件\n");
        return -1;
    }
    /* argv is user input: check the length before copying into a fixed buffer. */
    if (strlen(argv[1]) >= sizeof(sourceFilename)) {
        printf("file name too long\n");
        return -3;
    }
    strcpy(sourceFilename, argv[1]);
    if (!isFileExist(sourceFilename)) {
        printf("source file[%s] did not exist\n", sourceFilename);
        return -2;
    }
    if (argc == 3) {
        if (strlen(argv[2]) >= sizeof(targetFilename)) {
            printf("file name too long\n");
            return -3;
        }
        strcpy(targetFilename, argv[2]);
    } else {
        /* The derived name is at most source + '.' + extension long. */
        if (strlen(sourceFilename) + strlen(TYZ_COP_EXTENSION) + 1 >= sizeof(targetFilename)) {
            printf("file name too long\n");
            return -3;
        }
        creatFilename(sourceFilename, TYZ_COP_EXTENSION, targetFilename);
    }
    compress(sourceFilename, targetFilename);
    return 0;
}
通過主函數的參數,我們就可以判斷用戶有沒有輸入要生成的文件名。如果格式不符合要求,我們會給用戶一個提示。
接下來是解壓縮部分。
C. 解壓縮部分
如我們分析的,我們只需要判斷用戶輸入的命令參數個數即可。
/*
 * Entry point of the decompressor.
 * Usage: hufDecompress <source> <restored-file>
 * Returns 0 on success, -1 on bad usage, -2 if the source cannot be
 * opened, -3 if a file name does not fit the local buffers.
 */
int main(int argc, char const *argv[]) {
    char sourceFilename[200];
    char targetFilename[200];

    if (argc != 3) {
        printf("用法:hufDecompress 源文件 還原文件\n");
        return -1;
    }
    /* argv is user input: check the lengths before copying into fixed buffers. */
    if (strlen(argv[1]) >= sizeof(sourceFilename)
            || strlen(argv[2]) >= sizeof(targetFilename)) {
        printf("file name too long\n");
        return -3;
    }
    strcpy(sourceFilename, argv[1]);
    if (!isFileExist(sourceFilename)) {
        printf("source file[%s] did not exist\n", sourceFilename);
        return -2;
    }
    strcpy(targetFilename, argv[2]);
    decompress(sourceFilename, targetFilename);
    return 0;
}
Ⅲ 哈夫曼壓縮
A. 代碼分析
要壓縮一個文件,我們需要改變的只是從鍵盤輸入讀取字符串,變成從文件中讀取字符。文件的本質還是由數據構成的,所以我們要做的就是將數據從文件中讀出來,然後和字符串操作一樣,做同樣的事,根據其頻度得到頻度表,然後構建哈夫曼樹,最後生成編碼。
所以和第一階段的代碼對比,我們需要改變的只有兩個部分,一個是從用戶輸入的字符串統計其頻度,改成從文件中讀取數據並統計其頻度;一個是編碼,這個階段我們需要用位運算,將編碼寫入新生成的壓縮文件中。
所以接下來,我們就來完成這兩個部分。
B. 從文件中讀取內容生成頻度表
這一部分沒有什麼複雜的地方,對於文件操作有疑問的同學可以看我下面這篇文章。
【C語言基礎】->文件操作詳解->一篇文章讀懂關於文件的龐雜函數使用
/*
 * Build the frequency table of a file by counting every byte in it.
 *
 * ascii          - 256-entry counter array; occurrences are added to it.
 * characterCount - receives the number of distinct byte values found
 *                  (0 when the file is empty or cannot be read).
 * sourceFilename - file to scan; opened in binary mode so that no byte is
 *                  translated or treated as end-of-file by the C runtime.
 *
 * Returns a calloc'ed array with one entry per distinct byte in ascending
 * order (free with destoryAttributeList), or NULL when the file cannot be
 * opened, is empty, or allocation fails.
 */
ATTRIBUTE *initAttributeList(u32 *ascii, u16 *characterCount, const char *sourceFilename) {
    u8 ch;
    u32 i;
    u32 index = 0;
    u16 count = 0;
    FILE *fpIn;
    ATTRIBUTE *attributeList;

    *characterCount = 0;
    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        return NULL;
    }
    /* Read first, test feof afterwards: feof only turns true after a failed read. */
    ch = fgetc(fpIn);
    while (!feof(fpIn)) {
        /* A read error returns EOF without setting the end-of-file flag. */
        if (ferror(fpIn)) {
            break;
        }
        ascii[ch]++;
        ch = fgetc(fpIn);
    }
    fclose(fpIn);

    for (i = 0; i < 256; i++) {
        count += (ascii[i] != 0);
    }
    if (0 == count) {
        return NULL;
    }
    attributeList = (ATTRIBUTE *) calloc(count, sizeof(ATTRIBUTE));
    if (NULL == attributeList) {
        return NULL;
    }
    *characterCount = count;
    for (i = 0; i < 256; i++) {
        if (ascii[i] != 0) {
            attributeList[index].character = (u8) i;
            attributeList[index++].frequency = ascii[i];
        }
    }
    return attributeList;
}
通過fgetc()函數從文件中一個字節一個字節讀取,生成頻度表。需要注意的是while循環
ch = fgetc(fpIn);
while (!feof(fpIn)) {
ascii[ch]++;
ch = fgetc(fpIn);
}
我在文件操作的博文裏講過,feof()函數是一個動作標識函數,不是狀態標識符,所以要注意要把讀取操作放在循環的最後。
C. 將編碼寫入文件
我通過一個文件來分析這裏的內容。我打開一個bmp文件,比如下面這個👇
我用二進制編輯器將這個文件打開,會得到這樣的內容👇
這就是這個bmp文件在計算機中的內容,其中大量FF FF FF是白色的意思。可以看到文件的頭部有這樣的內容👇
文件開頭的兩個字節是BM,也就是BMP文件的識別碼,這是識別一個文件的方式,後面的數據標記了這個文件的各種數據,用以識別。
所以我們壓縮文件的時候,也要參照這種方式。首先是識別碼,然後是文件的字符種類數量,然後是哈夫曼編碼的總位數。
如果要進行解碼,在第一階段做的,我們是根據哈夫曼樹進行解碼的,所以爲了得到哈夫曼樹,我們必須得到頻度表,因此我們壓縮時,也需要將頻度表寫入文件中。
所以這四個部分,識別碼,文件的字符數量,哈夫曼編碼的位數,以及頻度表,就是我們要先寫入文件中的內容。
/*
 * Header written at the very start of a compressed file, immediately
 * followed by the frequency table (characterCount ATTRIBUTE records).
 */
typedef struct FILE_HEAD {
/* Identification bytes; coding() stores 'T', 'Y', 'Z' here. */
u8 flag[3];
/* Number of entries in the frequency table that follows the header. */
u16 characterCount;
/* Total number of Huffman code bits stored after the frequency table. */
u32 bitCount;
/* Padding; the members add up to 16 bytes when the struct is packed to 1 byte. */
u8 unused[7];
}FILE_HEAD;
這個結構體就是要寫入文件頭部的內容,包括識別碼,文件的字符數量和哈夫曼編碼的位數。最後一個成員 u8 unused[7]; 是爲了將這個結構體的大小湊成2的整數次方(在 #pragma pack(1) 下爲 3+2+4+7=16 字節),這樣存儲進去就是完整的,方便查找後面的內容。
現在我們看寫入部分。
首先是得到哈夫曼編碼的總位數。
/*
 * Total number of code bits needed for the whole input: the sum over all
 * leaves of frequency * code length.
 */
u32 getBitCount(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 total = 0;
    u16 leaf = 0;

    while (leaf < characterCount) {
        HUFMAN_TREE_NODE *node = &hufmanTreeNode[leaf];

        total += node->attribute.frequency * strlen(node->hufmanCode);
        leaf++;
    }
    return total;
}
然後是編碼,也就是將哈夫曼編碼寫入壓縮的文件中的部分。👇
/*
 * Write the compressed file: header, frequency table, then the Huffman
 * codes of every source byte packed eight bits per output byte.
 *
 * sourceFilename - file to compress (read in binary mode).
 * targetFilename - compressed file to create.
 * hufmanTreeNode - node array whose leaves carry their code strings.
 * orientate      - table mapping a byte value to its leaf index.
 * characterCount - number of entries in attributeList.
 * attributeList  - frequency table stored right after the header.
 *
 * If either file cannot be opened a message is printed and nothing is written.
 */
void coding(const char *sourceFilename, const char *targetFilename, HUFMAN_TREE_NODE *hufmanTreeNode,
        u32 *orientate, u16 characterCount, ATTRIBUTE *attributeList) {
    u32 ch;
    u8 *hufCode;
    u32 temp = 0;   /* byte being assembled; starts cleared so padding bits are 0 */
    u32 bitIndex = 0;
    u32 codeIndex;
    FILE_HEAD fileHead = {0};
    FILE *fpIn;
    FILE *fpOut;

    /* Binary mode: text mode may translate or truncate arbitrary data. */
    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        printf("cannot open source file[%s]\n", sourceFilename);
        return;
    }
    fpOut = fopen(targetFilename, "wb");
    if (NULL == fpOut) {
        printf("cannot create target file[%s]\n", targetFilename);
        fclose(fpIn);
        return;
    }

    fileHead.flag[0] = 'T';
    fileHead.flag[1] = 'Y';
    fileHead.flag[2] = 'Z';
    fileHead.characterCount = characterCount;
    fileHead.bitCount = getBitCount(characterCount, hufmanTreeNode);
    printf("characterCount = %d\nbitCount = %d\n", characterCount, fileHead.bitCount);
    fwrite(&fileHead, sizeof(FILE_HEAD), 1, fpOut);
    fwrite(attributeList, sizeof(ATTRIBUTE), characterCount, fpOut);

    ch = fgetc(fpIn);
    while (!feof(fpIn)) {
        /* A read error yields EOF without the end-of-file flag; stop instead of indexing with it. */
        if (ferror(fpIn)) {
            break;
        }
        hufCode = hufmanTreeNode[orientate[ch]].hufmanCode;
        for (codeIndex = 0; hufCode[codeIndex]; codeIndex++) {
            if ('1' == hufCode[codeIndex]) {
                SET(temp, bitIndex);
            } else {
                CLEAR(temp, bitIndex);
            }
            if (++bitIndex >= 8) {
                fputc(temp, fpOut);
                bitIndex = 0;
                temp = 0;
            }
        }
        ch = fgetc(fpIn);
    }
    /* Flush the final, partially filled byte. */
    if (bitIndex > 0) {
        fputc(temp, fpOut);
    }
    fclose(fpIn);
    fclose(fpOut);
}
通過hufCode,暫時存儲得到的ch所對應的哈夫曼編碼,然後遍歷這個編碼,如果是0就通過位運算的清位,將temp的對應位變成0,就這樣通過位運算的清位和置位,將temp的一個字節寫完,然後將其寫入文件中。
這裏有一個地方我犯了錯,看了很久才發現這個漏洞。
if (bitIndex > 0) {
fputc(temp, fpOut);
}
最後一步,其他的八個位的信息都寫入了,但是如果哈夫曼編碼的總位數不是八的倍數,就會有單獨幾位多出來,這個也是要寫進文件的,但是這時候temp的大於bitIndex的位上是上一個字節寫入的信息,由於新寫入的最後幾位不足八位,所以沒有完全覆蓋掉,這時候就會有問題。
所以,我在上一步,每次寫入一個字節的信息後,多加了一步,將temp歸0,這樣就可以避免這種錯誤的發生了。
if (++bitIndex >= 8) {
fputc(temp, fpOut);
bitIndex = 0;
temp = 0;
}
這就是編碼的內容。由於這一階段,我們的主函數做了改變,所以我們還需要一個函數,來充當第一階段主函數的作用。👇
/*
 * Compress sourceFilename into targetFilename:
 * frequency table -> Huffman tree -> per-symbol codes -> packed output.
 * An empty or unreadable source is reported and nothing is written.
 */
void compress(const char *sourceFilename, const char *targetFilename) {
    u8 code[256];
    u16 characterCount = 0;
    u32 ascii[256] = {0};
    u32 orientate[256] = {0};
    ATTRIBUTE *attributeList = NULL;
    HUFMAN_TREE_NODE *hufmanTreeNode = NULL;

    attributeList = initAttributeList(ascii, &characterCount, sourceFilename);
    if (NULL == attributeList || 0 == characterCount) {
        printf("source file[%s] is empty or unreadable\n", sourceFilename);
        destoryAttributeList(attributeList);
        return;
    }
    /* Debug aid: showAttributeList(characterCount, attributeList) prints the table. */

    /* Build the tree exactly once; a second call would leak the first tree. */
    hufmanTreeNode = initHufmanTreeNode(characterCount, orientate, attributeList);
    if (NULL == hufmanTreeNode) {
        destoryAttributeList(attributeList);
        return;
    }
    creatHufmanTree(characterCount, hufmanTreeNode);
    creatHufmanCode(code, 0, 2*characterCount-2, hufmanTreeNode);
    /* Debug aid: showHufmanTreeNode(2*characterCount-1, hufmanTreeNode) prints the codes. */

    coding(sourceFilename, targetFilename, hufmanTreeNode, orientate, characterCount, attributeList);

    destoryAttributeList(attributeList);
    /* Only the leaves own a code buffer, so characterCount is the right count. */
    destoryHufmanTreeNode(characterCount, hufmanTreeNode);
}
D. 哈夫曼壓縮完整代碼
首先是tyz.h的內容👇
/* Common helper types, constants and bit macros shared by the project. */
#ifndef _TYZ_H_
#define _TYZ_H_
#define TRUE 1
#define FALSE 0
#define NOT_FOUND (-1)
/*
 * Bit helpers for a single byte. Bit index 0 is the most significant bit
 * (index ^ 7 maps 0..7 to 7..0). Arguments are fully parenthesized so that
 * expressions can be passed safely.
 */
#define SET(value, index) ((value) |= 1 << ((index) ^ 7))
#define CLEAR(value, index) ((value) &= ~(1 << ((index) ^ 7)))
#define GET(value, index) (((value) & (1 << ((index) ^ 7))) != 0)
typedef unsigned char boolean;
typedef unsigned int u32;
typedef unsigned short u16;
typedef boolean u8;
/* Returns the index of the first non-whitespace character of str. */
int skipBlank(const char *str);
/* Returns non-zero if ch is a digit, '+' or '-'. */
boolean isRealStart(int ch);
/* Returns TRUE if filename can be opened for reading. */
boolean isFileExist(const char *filename);
/* Writes sourceFilename with its extension replaced by extensionName into targetFilename. */
char *creatFilename(const char *sourceFilename, const char *extensionName, char *targetFilename);
#endif
接着是tyz.c的部分👇
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "tyz.h"
/* Returns 1 when ch is a decimal digit, '+' or '-', otherwise 0. */
boolean isRealStart(int ch) {
    if (isdigit(ch)) {
        return 1;
    }
    return '+' == ch || '-' == ch;
}
/*
 * Return the index of the first character of str that is not whitespace
 * (the string length if it is all whitespace).
 */
int skipBlank(const char *str) {
    int index = 0;

    /* isspace() requires an unsigned char value; plain char may be negative. */
    while (str[index] && isspace((unsigned char) str[index])) {
        index++;
    }
    return index;
}
/*
 * Report whether `filename` can be opened for reading.
 * Returns TRUE when fopen succeeds (the stream is closed again), FALSE otherwise.
 */
boolean isFileExist(const char *filename) {
    FILE *probe = fopen(filename, "r");

    if (probe != NULL) {
        fclose(probe);
        return TRUE;
    }
    return FALSE;
}
/*
 * Build a target file name by replacing the extension of sourceFilename
 * with extensionName.
 *
 * - The extension is everything from the LAST dot of the final path
 *   component; dots in directory names and a leading dot of the file name
 *   (e.g. ".bashrc") are not treated as extension separators.
 * - If there is no extension, extensionName is appended.
 * - A non-empty extensionName without a leading '.' gets one added.
 *
 * targetFilename must be large enough for the result; it is returned.
 */
char *creatFilename(const char *sourceFilename, const char *extensionName, char *targetFilename) {
    size_t index;
    size_t nameStart = 0;
    size_t dotIndex = 0;
    size_t stemLength;
    int hasDot = 0;

    for (index = 0; sourceFilename[index]; index++) {
        char c = sourceFilename[index];

        if ('/' == c || '\\' == c) {
            /* New path component: forget dots seen in directory names. */
            nameStart = index + 1;
            hasDot = 0;
        } else if ('.' == c && index > nameStart) {
            dotIndex = index;
            hasDot = 1;
        }
    }
    /* After the loop `index` is the full length of sourceFilename. */
    stemLength = hasDot ? dotIndex : index;

    /* memmove tolerates targetFilename aliasing sourceFilename. */
    memmove(targetFilename, sourceFilename, stemLength);
    targetFilename[stemLength] = 0;

    if (0 != extensionName[0] && '.' != extensionName[0]) {
        strcat(targetFilename, ".");
    }
    strcat(targetFilename, extensionName);
    return targetFilename;
}
接着是壓縮部分的頭文件hufCompress.h的內容👇
/* Types and function declarations of the Huffman compressor. */
#ifndef _TYZ_HUF_COMPRESS_H_
#define _TYZ_HUF_COMPRESS_H_
#include "tyz.h"
/* One entry of the frequency table: a byte value and how often it occurs. */
typedef struct ATTRIBUTE {
u8 character;
u32 frequency;
}ATTRIBUTE;
/*
 * One node of the Huffman tree, stored in a flat array.
 * Leaves have leftChild == rightChild == -1 and own a hufmanCode buffer;
 * internal nodes have hufmanCode == NULL.
 */
typedef struct HUFMAN_TREE_NODE {
/* TRUE once the node has been picked as a child during tree construction. */
boolean visited;
/* Code string of '0'/'1' characters (leaves only). */
u8 *hufmanCode;
/* Array indices of the children. */
u32 leftChild;
u32 rightChild;
ATTRIBUTE attribute;
}HUFMAN_TREE_NODE;
/* Header written at the start of a compressed file. */
typedef struct FILE_HEAD {
/* Identification bytes 'T', 'Y', 'Z'. */
u8 flag[3];
/* Number of entries in the frequency table that follows the header. */
u16 characterCount;
/* Total number of Huffman code bits in the file. */
u32 bitCount;
/* Padding. */
u8 unused[7];
}FILE_HEAD;
/* Compress sourceFilename into targetFilename. */
void compress(const char *sourceFilename, const char *targetFilename);
/* Count the bytes of sourceFilename and return the frequency table. */
ATTRIBUTE *initAttributeList(u32 *ascii, u16 *characterCount, const char *sourceFilename);
/* Free a frequency table. */
void destoryAttributeList(ATTRIBUTE *attributeList);
/* Print a frequency table. */
void showAttributeList(u16 characterCount, ATTRIBUTE *attributeList);
/* Allocate the node array and initialise its leaves from the frequency table. */
HUFMAN_TREE_NODE *initHufmanTreeNode(u16 characterCount, u32 *orientate, ATTRIBUTE *attributeList);
/* Free the code buffers of the first `count` nodes and the node array. */
void destoryHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Print the first `count` nodes. */
void showHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Create the internal nodes of the tree. */
void creatHufmanTree(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Pick, mark visited and return the lowest-frequency unvisited node. */
u32 searchMinimumNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Recursively assign code strings to the leaves below `root`. */
void creatHufmanCode(u8 *code, u32 index, u32 root, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Write header, frequency table and packed codes to targetFilename. */
void coding(const char *sourceFilename, const char *targetFilename, HUFMAN_TREE_NODE *hufmanTreeNode,
u32 *orientate, u16 characterCount, ATTRIBUTE *attributeList);
/* Sum of frequency * code length over all leaves. */
u32 getBitCount(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode);
#endif
最後是主體部分,hufCompress.c👇
#pragma pack(push)
#pragma pack(1)
#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include "tyz.h"
#include "hufCompress.h"
#define TYZ_COP_EXTENSION ".TyzHuf"
/*
 * Total number of code bits needed for the whole input: the sum over all
 * leaves of frequency * code length.
 */
u32 getBitCount(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 total = 0;
    u16 leaf = 0;

    while (leaf < characterCount) {
        HUFMAN_TREE_NODE *node = &hufmanTreeNode[leaf];

        total += node->attribute.frequency * strlen(node->hufmanCode);
        leaf++;
    }
    return total;
}
/*
 * Write the compressed file: header, frequency table, then the Huffman
 * codes of every source byte packed eight bits per output byte.
 *
 * sourceFilename - file to compress (read in binary mode).
 * targetFilename - compressed file to create.
 * hufmanTreeNode - node array whose leaves carry their code strings.
 * orientate      - table mapping a byte value to its leaf index.
 * characterCount - number of entries in attributeList.
 * attributeList  - frequency table stored right after the header.
 *
 * If either file cannot be opened a message is printed and nothing is written.
 */
void coding(const char *sourceFilename, const char *targetFilename, HUFMAN_TREE_NODE *hufmanTreeNode,
        u32 *orientate, u16 characterCount, ATTRIBUTE *attributeList) {
    u32 ch;
    u8 *hufCode;
    u32 temp = 0;   /* byte being assembled; starts cleared so padding bits are 0 */
    u32 bitIndex = 0;
    u32 codeIndex;
    FILE_HEAD fileHead = {0};
    FILE *fpIn;
    FILE *fpOut;

    /* Binary mode: text mode may translate or truncate arbitrary data. */
    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        printf("cannot open source file[%s]\n", sourceFilename);
        return;
    }
    fpOut = fopen(targetFilename, "wb");
    if (NULL == fpOut) {
        printf("cannot create target file[%s]\n", targetFilename);
        fclose(fpIn);
        return;
    }

    fileHead.flag[0] = 'T';
    fileHead.flag[1] = 'Y';
    fileHead.flag[2] = 'Z';
    fileHead.characterCount = characterCount;
    fileHead.bitCount = getBitCount(characterCount, hufmanTreeNode);
    printf("characterCount = %d\nbitCount = %d\n", characterCount, fileHead.bitCount);
    fwrite(&fileHead, sizeof(FILE_HEAD), 1, fpOut);
    fwrite(attributeList, sizeof(ATTRIBUTE), characterCount, fpOut);

    ch = fgetc(fpIn);
    while (!feof(fpIn)) {
        /* A read error yields EOF without the end-of-file flag; stop instead of indexing with it. */
        if (ferror(fpIn)) {
            break;
        }
        hufCode = hufmanTreeNode[orientate[ch]].hufmanCode;
        for (codeIndex = 0; hufCode[codeIndex]; codeIndex++) {
            if ('1' == hufCode[codeIndex]) {
                SET(temp, bitIndex);
            } else {
                CLEAR(temp, bitIndex);
            }
            if (++bitIndex >= 8) {
                fputc(temp, fpOut);
                bitIndex = 0;
                temp = 0;
            }
        }
        ch = fgetc(fpIn);
    }
    /* Flush the final, partially filled byte. */
    if (bitIndex > 0) {
        fputc(temp, fpOut);
    }
    fclose(fpIn);
    fclose(fpOut);
}
/*
 * Recursively assign a code string to every leaf below `root`.
 * `code` is a scratch buffer holding the path taken so far ('0' = left,
 * '1' = right) and `index` is the current depth. On reaching a leaf the
 * path is terminated and copied into that leaf's hufmanCode buffer.
 */
void creatHufmanCode(u8 *code, u32 index, u32 root, HUFMAN_TREE_NODE *hufmanTreeNode) {
    HUFMAN_TREE_NODE *node = &hufmanTreeNode[root];

    if (-1 != node->leftChild) {
        /* Internal node: explore the left subtree, then the right one. */
        code[index] = '0';
        creatHufmanCode(code, index + 1, node->leftChild, hufmanTreeNode);
        code[index] = '1';
        creatHufmanCode(code, index + 1, node->rightChild, hufmanTreeNode);
        return;
    }
    /* Leaf: the path accumulated so far is this symbol's code. */
    code[index] = 0;
    strcpy(node->hufmanCode, code);
}
/*
 * Find the not-yet-visited node with the lowest frequency among the first
 * `count` nodes, mark it visited and return its index.
 * Ties keep the lowest index. Returns (u32)-1 when every node is already
 * visited; in that case nothing is modified.
 */
u32 searchMinimumNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    const u32 none = (u32) -1;
    u32 i;
    u32 minIndex = none;

    for (i = 0; i < count; i++) {
        if (FALSE != hufmanTreeNode[i].visited) {
            continue;
        }
        if (none == minIndex
                || hufmanTreeNode[minIndex].attribute.frequency > hufmanTreeNode[i].attribute.frequency) {
            minIndex = i;
        }
    }
    /* Guard the write: indexing with (u32)-1 would be out of bounds. */
    if (none != minIndex) {
        hufmanTreeNode[minIndex].visited = TRUE;
    }
    return minIndex;
}
/*
 * Build the internal nodes of the Huffman tree.
 * Leaves occupy indices 0..characterCount-1; each pass merges the two
 * lowest-frequency unvisited nodes into a new parent stored at index
 * characterCount+i, so the root ends up at 2*characterCount-2.
 * With zero or one leaf there is nothing to merge.
 */
void creatHufmanTree(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 i;
    u32 leftChild;
    u32 rightChild;
    u16 count = characterCount;

    if (NULL == hufmanTreeNode) {
        return;
    }
    /* `i + 1 < count` stays correct for count == 0, unlike `i < count - 1`. */
    for (i = 0; i + 1 < count; i++) {
        leftChild = searchMinimumNode(count + i, hufmanTreeNode);
        rightChild = searchMinimumNode(count + i, hufmanTreeNode);
        hufmanTreeNode[count + i].visited = FALSE;
        hufmanTreeNode[count + i].hufmanCode = NULL;
        hufmanTreeNode[count + i].leftChild = leftChild;
        hufmanTreeNode[count + i].rightChild = rightChild;
        /* '@' is only a placeholder character for internal nodes. */
        hufmanTreeNode[count + i].attribute.character = '@';
        hufmanTreeNode[count + i].attribute.frequency =
                hufmanTreeNode[leftChild].attribute.frequency +
                hufmanTreeNode[rightChild].attribute.frequency;
    }
}
/*
 * Print the first `count` entries of the node array as a table:
 * character, frequency, left child, right child and code (or "NULL").
 */
void showHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 row = 0;

    printf("字符 頻度 左孩子 右孩子 編碼\n");
    while (row < count) {
        HUFMAN_TREE_NODE *node = &hufmanTreeNode[row];

        printf("%-5c %-5d %-7d %-7d %-10s\n",
                node->attribute.character,
                node->attribute.frequency,
                node->leftChild,
                node->rightChild,
                node->hufmanCode == NULL ? "NULL" : node->hufmanCode);
        row++;
    }
}
/*
 * Free the code buffers of the first `count` nodes and then the node array.
 * A NULL array is ignored.
 */
void destoryHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    if (hufmanTreeNode != NULL) {
        u32 slot = 0;

        while (slot < count) {
            free(hufmanTreeNode[slot].hufmanCode);
            slot++;
        }
        free(hufmanTreeNode);
    }
}
/*
 * Allocate the node array (2*characterCount-1 nodes) and initialise the
 * leaves from the frequency table.
 *
 * Each leaf gets a code buffer of characterCount bytes and child indices
 * of -1. orientate[c] is set to the leaf index of byte value c.
 *
 * Returns the array (free with destoryHufmanTreeNode), or NULL when
 * characterCount is 0, attributeList is NULL, or an allocation fails.
 */
HUFMAN_TREE_NODE *initHufmanTreeNode(u16 characterCount, u32 *orientate, ATTRIBUTE *attributeList) {
    u32 i;
    u32 nodeCount;
    HUFMAN_TREE_NODE *hufmanTreeNode;

    /* 0 * 2 - 1 would turn into a huge unsigned node count. */
    if (0 == characterCount || NULL == attributeList) {
        return NULL;
    }
    nodeCount = characterCount * 2 - 1;
    hufmanTreeNode = (HUFMAN_TREE_NODE *) calloc(nodeCount, sizeof(HUFMAN_TREE_NODE));
    if (NULL == hufmanTreeNode) {
        return NULL;
    }
    for (i = 0; i < characterCount; i++) {
        hufmanTreeNode[i].visited = FALSE;
        hufmanTreeNode[i].hufmanCode = (u8 *) calloc(characterCount, sizeof(u8));
        if (NULL == hufmanTreeNode[i].hufmanCode) {
            /* Undo the partial construction before reporting failure. */
            while (i > 0) {
                free(hufmanTreeNode[--i].hufmanCode);
            }
            free(hufmanTreeNode);
            return NULL;
        }
        hufmanTreeNode[i].leftChild = hufmanTreeNode[i].rightChild = -1;
        hufmanTreeNode[i].attribute = attributeList[i];
        orientate[attributeList[i].character] = i;
    }
    return hufmanTreeNode;
}
/* Print every (frequency, character) pair of the frequency table. */
void showAttributeList(u16 characterCount, ATTRIBUTE *attributeList) {
    u16 row = 0;

    while (row < characterCount) {
        ATTRIBUTE *entry = &attributeList[row];

        printf("頻度:%d 符號:%c\n", entry->frequency, entry->character);
        row++;
    }
}
/*
 * Release the frequency table.
 * Passing NULL is allowed; free(NULL) is a no-op.
 */
void destoryAttributeList(ATTRIBUTE *attributeList) {
    free(attributeList);
}
/*
 * Build the frequency table of a file by counting every byte in it.
 *
 * ascii          - 256-entry counter array; occurrences are added to it.
 * characterCount - receives the number of distinct byte values found
 *                  (0 when the file is empty or cannot be read).
 * sourceFilename - file to scan; opened in binary mode so that no byte is
 *                  translated or treated as end-of-file by the C runtime.
 *
 * Returns a calloc'ed array with one entry per distinct byte in ascending
 * order (free with destoryAttributeList), or NULL when the file cannot be
 * opened, is empty, or allocation fails.
 */
ATTRIBUTE *initAttributeList(u32 *ascii, u16 *characterCount, const char *sourceFilename) {
    u8 ch;
    u32 i;
    u32 index = 0;
    u16 count = 0;
    FILE *fpIn;
    ATTRIBUTE *attributeList;

    *characterCount = 0;
    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        return NULL;
    }
    /* Read first, test feof afterwards: feof only turns true after a failed read. */
    ch = fgetc(fpIn);
    while (!feof(fpIn)) {
        /* A read error returns EOF without setting the end-of-file flag. */
        if (ferror(fpIn)) {
            break;
        }
        ascii[ch]++;
        ch = fgetc(fpIn);
    }
    fclose(fpIn);

    for (i = 0; i < 256; i++) {
        count += (ascii[i] != 0);
    }
    if (0 == count) {
        return NULL;
    }
    attributeList = (ATTRIBUTE *) calloc(count, sizeof(ATTRIBUTE));
    if (NULL == attributeList) {
        return NULL;
    }
    *characterCount = count;
    for (i = 0; i < 256; i++) {
        if (ascii[i] != 0) {
            attributeList[index].character = (u8) i;
            attributeList[index++].frequency = ascii[i];
        }
    }
    return attributeList;
}
/*
 * Compress sourceFilename into targetFilename:
 * frequency table -> Huffman tree -> per-symbol codes -> packed output.
 * An empty or unreadable source is reported and nothing is written.
 */
void compress(const char *sourceFilename, const char *targetFilename) {
    u8 code[256];
    u16 characterCount = 0;
    u32 ascii[256] = {0};
    u32 orientate[256] = {0};
    ATTRIBUTE *attributeList = NULL;
    HUFMAN_TREE_NODE *hufmanTreeNode = NULL;

    attributeList = initAttributeList(ascii, &characterCount, sourceFilename);
    if (NULL == attributeList || 0 == characterCount) {
        printf("source file[%s] is empty or unreadable\n", sourceFilename);
        destoryAttributeList(attributeList);
        return;
    }
    /* Debug aid: showAttributeList(characterCount, attributeList) prints the table. */

    /* Build the tree exactly once; a second call would leak the first tree. */
    hufmanTreeNode = initHufmanTreeNode(characterCount, orientate, attributeList);
    if (NULL == hufmanTreeNode) {
        destoryAttributeList(attributeList);
        return;
    }
    creatHufmanTree(characterCount, hufmanTreeNode);
    creatHufmanCode(code, 0, 2*characterCount-2, hufmanTreeNode);
    /* Debug aid: showHufmanTreeNode(2*characterCount-1, hufmanTreeNode) prints the codes. */

    coding(sourceFilename, targetFilename, hufmanTreeNode, orientate, characterCount, attributeList);

    destoryAttributeList(attributeList);
    /* Only the leaves own a code buffer, so characterCount is the right count. */
    destoryHufmanTreeNode(characterCount, hufmanTreeNode);
}
/*
 * Entry point of the compressor.
 * Usage: hufCompress <source> [target]
 * Without a target name, the source name with its extension replaced by
 * TYZ_COP_EXTENSION is used.
 * Returns 0 on success, -1 on bad usage, -2 if the source cannot be
 * opened, -3 if a file name does not fit the local buffers.
 */
int main(int argc, char const *argv[]) {
    char sourceFilename[200];
    char targetFilename[200];

    if (argc != 2 && argc != 3) {
        printf("用法:hufCompress 源文件 目標文件\n");
        return -1;
    }
    /* argv is user input: check the length before copying into a fixed buffer. */
    if (strlen(argv[1]) >= sizeof(sourceFilename)) {
        printf("file name too long\n");
        return -3;
    }
    strcpy(sourceFilename, argv[1]);
    if (!isFileExist(sourceFilename)) {
        printf("source file[%s] did not exist\n", sourceFilename);
        return -2;
    }
    if (argc == 3) {
        if (strlen(argv[2]) >= sizeof(targetFilename)) {
            printf("file name too long\n");
            return -3;
        }
        strcpy(targetFilename, argv[2]);
    } else {
        /* The derived name is at most source + '.' + extension long. */
        if (strlen(sourceFilename) + strlen(TYZ_COP_EXTENSION) + 1 >= sizeof(targetFilename)) {
            printf("file name too long\n");
            return -3;
        }
        creatFilename(sourceFilename, TYZ_COP_EXTENSION, targetFilename);
    }
    compress(sourceFilename, targetFilename);
    return 0;
}
#pragma pack(pop)
E. 運行結果
graph.TyzHuf就是新生成的壓縮文件。
其部分內容如下👇
可以清楚看到頭部內容就是TYZ三個標識符。
Ⅳ 哈夫曼解壓縮
A. 代碼分析
回憶第一階段完成的部分,我們需要根據哈夫曼樹才能完成解碼,因此第一步,我們仍是得到頻度表,只不過是從我們壓縮進去的文件中得到頻度表。壓縮的時候我們已經將必要的信息寫入文件了。
接下來其他部分是和第一階段保持不變的,根據頻度表我們可以生成哈夫曼樹,然後就可以根據哈夫曼樹來進行對壓縮文件中的數據進行解碼了。
B. 從壓縮文件中讀取頻度表
/*
 * Read the header and the frequency table from a compressed file.
 *
 * bitCount       - receives the number of code bits stored in the file.
 * characterCount - receives the number of frequency-table entries.
 * sourceFilename - compressed file; opened in binary mode.
 *
 * Both outputs are set to 0 on failure. Returns the calloc'ed frequency
 * table (free with destoryAttributeList), or NULL when the file cannot be
 * opened, does not start with "TYZ", is truncated, or allocation fails.
 */
ATTRIBUTE *initAttributeList(u32 *bitCount, u16 *characterCount, const char *sourceFilename) {
    FILE *fpIn;
    ATTRIBUTE *attributeList = NULL;
    FILE_HEAD fileHead = {0};

    *characterCount = 0;
    *bitCount = 0;

    /* The header is binary data, so text mode must not be used. */
    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        return NULL;
    }
    if (1 != fread(&fileHead, sizeof(FILE_HEAD), 1, fpIn)
            || fileHead.flag[0] != 'T'
            || fileHead.flag[1] != 'Y'
            || fileHead.flag[2] != 'Z') {
        printf("文件無法識別^-^\n");
        fclose(fpIn);
        return NULL;
    }
    if (0 == fileHead.characterCount) {
        fclose(fpIn);
        return NULL;
    }
    attributeList = (ATTRIBUTE *) calloc(fileHead.characterCount, sizeof(ATTRIBUTE));
    if (NULL == attributeList) {
        fclose(fpIn);
        return NULL;
    }
    /* A short read means the frequency table is truncated. */
    if (fileHead.characterCount
            != fread(attributeList, sizeof(ATTRIBUTE), fileHead.characterCount, fpIn)) {
        printf("文件無法識別^-^\n");
        free(attributeList);
        fclose(fpIn);
        return NULL;
    }
    *characterCount = fileHead.characterCount;
    *bitCount = fileHead.bitCount;
    fclose(fpIn);
    return attributeList;
}
這一部分也是很簡單的。在開始的時候我們需要判斷標識碼,如果一個文件開頭不是TYZ,那麼說明這不是我們的壓縮文件,無法解碼。
讀取了文件頭的結構體,我們就可以知道哈夫曼編碼的總位數bitCount和字符類型數characterCount。
接下來就是讀取我們存放的頻度表了。之後就是根據這個頻度表再生成哈夫曼樹,和第一階段的代碼是相同的。
C. 解碼
/*
 * Decode the bit stream of a compressed file into targetFilename.
 *
 * The bit stream starts right after the header and the frequency table.
 * Bits are consumed most-significant first (see GET); a 0 bit descends to
 * the left child, a 1 bit to the right child, and each leaf reached emits
 * its character and restarts from the root.
 *
 * characterCount - number of leaves; the root is node 2*characterCount-2.
 * bitCount       - number of valid code bits in the stream.
 */
void decoding(const char *sourceFilename, const char *targetFilename,
        HUFMAN_TREE_NODE *hufmanTreeNode, u16 characterCount, u32 bitCount) {
    int ch;
    u32 bitIndex = 0;
    u32 pointIndex;
    u32 top;
    u32 root;
    FILE *fpIn;
    FILE *fpOut;

    if (NULL == hufmanTreeNode || 0 == characterCount) {
        return;
    }
    top = 2 * characterCount - 2;
    root = top;

    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        printf("cannot open source file[%s]\n", sourceFilename);
        return;
    }
    fpOut = fopen(targetFilename, "wb");
    if (NULL == fpOut) {
        printf("cannot create target file[%s]\n", targetFilename);
        fclose(fpIn);
        return;
    }
    printf("characterCount = %d\nbitCount = %d\n", characterCount, bitCount);

    if ((u32) -1 == hufmanTreeNode[top].leftChild) {
        /*
         * Only one distinct byte: the root is a leaf, its code is empty and
         * no bits were stored. Rebuild the data from the recorded frequency.
         */
        u32 remaining = hufmanTreeNode[top].attribute.frequency;

        while (remaining-- > 0) {
            fputc((int) hufmanTreeNode[top].attribute.character, fpOut);
        }
    } else {
        fseek(fpIn, sizeof(FILE_HEAD) + sizeof(ATTRIBUTE) * characterCount, SEEK_SET);
        ch = fgetc(fpIn);
        /* Consume exactly bitCount bits; stop early if the file is truncated. */
        for (pointIndex = 0; pointIndex < bitCount && EOF != ch; pointIndex++) {
            root = GET(ch, bitIndex) == 0
                    ? hufmanTreeNode[root].leftChild
                    : hufmanTreeNode[root].rightChild;
            if ((u32) -1 == hufmanTreeNode[root].leftChild) {
                fputc((int) hufmanTreeNode[root].attribute.character, fpOut);
                root = top;
            }
            if (++bitIndex >= 8) {
                ch = fgetc(fpIn);
                bitIndex = 0;
            }
        }
    }
    fclose(fpIn);
    fclose(fpOut);
}
這個過程其實和第一階段的解碼是差不多的,只是多了幾步文件的操作。
通過位運算的取位,我們得到從壓縮文件中讀取的信息,這樣就完成了解碼。
D. 哈夫曼解壓縮完整代碼
tyz.c, tyz.h的內容和哈夫曼壓縮的內容一樣,我不再贅述。
以下是hufDecompress.h的內容👇
/* Types and function declarations of the Huffman decompressor. */
/* NOTE(review): this guard macro is the same one hufCompress.h uses. */
#ifndef _TYZ_HUF_COMPRESS_H_
#define _TYZ_HUF_COMPRESS_H_
#include "tyz.h"
/* One entry of the frequency table: a byte value and how often it occurs. */
typedef struct ATTRIBUTE {
u8 character;
u32 frequency;
}ATTRIBUTE;
/*
 * One node of the Huffman tree, stored in a flat array.
 * Leaves have leftChild == rightChild == -1 and own a hufmanCode buffer;
 * internal nodes have hufmanCode == NULL.
 */
typedef struct HUFMAN_TREE_NODE {
/* TRUE once the node has been picked as a child during tree construction. */
boolean visited;
/* Code string of '0'/'1' characters (leaves only). */
u8 *hufmanCode;
/* Array indices of the children. */
u32 leftChild;
u32 rightChild;
ATTRIBUTE attribute;
}HUFMAN_TREE_NODE;
/* Header found at the start of a compressed file. */
typedef struct FILE_HEAD {
/* Identification bytes 'T', 'Y', 'Z'. */
u8 flag[3];
/* Number of entries in the frequency table that follows the header. */
u16 characterCount;
/* Total number of Huffman code bits in the file. */
u32 bitCount;
/* Padding. */
u8 unused[7];
}FILE_HEAD;
/* Restore sourceFilename (a compressed file) into targetFilename. */
void decompress(const char *sourceFilename, const char *targetFilename);
/* Read the header and frequency table from the compressed file. */
ATTRIBUTE *initAttributeList(u32 *bitCount, u16 *characterCount, const char *sourceFilename);
/* Free a frequency table. */
void destoryAttributeList(ATTRIBUTE *attributeList);
/* Print a frequency table. */
void showAttributeList(u16 characterCount, ATTRIBUTE *attributeList);
/* Allocate the node array and initialise its leaves from the frequency table. */
HUFMAN_TREE_NODE *initHufmanTreeNode(u16 characterCount, u32 *orientate, ATTRIBUTE *attributeList);
/* Free the code buffers of the first `count` nodes and the node array. */
void destoryHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Print the first `count` nodes. */
void showHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Create the internal nodes of the tree. */
void creatHufmanTree(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Pick, mark visited and return the lowest-frequency unvisited node. */
u32 searchMinimumNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Recursively assign code strings to the leaves below `root`. */
void creatHufmanCode(u8 *code, u32 index, u32 root, HUFMAN_TREE_NODE *hufmanTreeNode);
/* Decode the stored bit stream into targetFilename. */
void decoding(const char *sourceFilename, const char *targetFilename,
HUFMAN_TREE_NODE *hufmanTreeNode, u16 characterCount, u32 bitCount);
#endif
最後是hufDecompress.c的內容👇
#pragma pack(push)
#pragma pack(1)
#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include "tyz.h"
#include "hufCompress.h"
/*
 * Decode the bit stream of a compressed file into targetFilename.
 *
 * The bit stream starts right after the header and the frequency table.
 * Bits are consumed most-significant first (see GET); a 0 bit descends to
 * the left child, a 1 bit to the right child, and each leaf reached emits
 * its character and restarts from the root.
 *
 * characterCount - number of leaves; the root is node 2*characterCount-2.
 * bitCount       - number of valid code bits in the stream.
 */
void decoding(const char *sourceFilename, const char *targetFilename,
        HUFMAN_TREE_NODE *hufmanTreeNode, u16 characterCount, u32 bitCount) {
    int ch;
    u32 bitIndex = 0;
    u32 pointIndex;
    u32 top;
    u32 root;
    FILE *fpIn;
    FILE *fpOut;

    if (NULL == hufmanTreeNode || 0 == characterCount) {
        return;
    }
    top = 2 * characterCount - 2;
    root = top;

    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        printf("cannot open source file[%s]\n", sourceFilename);
        return;
    }
    fpOut = fopen(targetFilename, "wb");
    if (NULL == fpOut) {
        printf("cannot create target file[%s]\n", targetFilename);
        fclose(fpIn);
        return;
    }
    printf("characterCount = %d\nbitCount = %d\n", characterCount, bitCount);

    if ((u32) -1 == hufmanTreeNode[top].leftChild) {
        /*
         * Only one distinct byte: the root is a leaf, its code is empty and
         * no bits were stored. Rebuild the data from the recorded frequency.
         */
        u32 remaining = hufmanTreeNode[top].attribute.frequency;

        while (remaining-- > 0) {
            fputc((int) hufmanTreeNode[top].attribute.character, fpOut);
        }
    } else {
        fseek(fpIn, sizeof(FILE_HEAD) + sizeof(ATTRIBUTE) * characterCount, SEEK_SET);
        ch = fgetc(fpIn);
        /* Consume exactly bitCount bits; stop early if the file is truncated. */
        for (pointIndex = 0; pointIndex < bitCount && EOF != ch; pointIndex++) {
            root = GET(ch, bitIndex) == 0
                    ? hufmanTreeNode[root].leftChild
                    : hufmanTreeNode[root].rightChild;
            if ((u32) -1 == hufmanTreeNode[root].leftChild) {
                fputc((int) hufmanTreeNode[root].attribute.character, fpOut);
                root = top;
            }
            if (++bitIndex >= 8) {
                ch = fgetc(fpIn);
                bitIndex = 0;
            }
        }
    }
    fclose(fpIn);
    fclose(fpOut);
}
/*
 * Recursively assign a code string to every leaf below `root`.
 * `code` is a scratch buffer holding the path taken so far ('0' = left,
 * '1' = right) and `index` is the current depth. On reaching a leaf the
 * path is terminated and copied into that leaf's hufmanCode buffer.
 */
void creatHufmanCode(u8 *code, u32 index, u32 root, HUFMAN_TREE_NODE *hufmanTreeNode) {
    HUFMAN_TREE_NODE *node = &hufmanTreeNode[root];

    if (-1 != node->leftChild) {
        /* Internal node: explore the left subtree, then the right one. */
        code[index] = '0';
        creatHufmanCode(code, index + 1, node->leftChild, hufmanTreeNode);
        code[index] = '1';
        creatHufmanCode(code, index + 1, node->rightChild, hufmanTreeNode);
        return;
    }
    /* Leaf: the path accumulated so far is this symbol's code. */
    code[index] = 0;
    strcpy(node->hufmanCode, code);
}
/*
 * Find the not-yet-visited node with the lowest frequency among the first
 * `count` nodes, mark it visited and return its index.
 * Ties keep the lowest index. Returns (u32)-1 when every node is already
 * visited; in that case nothing is modified.
 */
u32 searchMinimumNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    const u32 none = (u32) -1;
    u32 i;
    u32 minIndex = none;

    for (i = 0; i < count; i++) {
        if (FALSE != hufmanTreeNode[i].visited) {
            continue;
        }
        if (none == minIndex
                || hufmanTreeNode[minIndex].attribute.frequency > hufmanTreeNode[i].attribute.frequency) {
            minIndex = i;
        }
    }
    /* Guard the write: indexing with (u32)-1 would be out of bounds. */
    if (none != minIndex) {
        hufmanTreeNode[minIndex].visited = TRUE;
    }
    return minIndex;
}
/*
 * Build the internal nodes of the Huffman tree.
 * Leaves occupy indices 0..characterCount-1; each pass merges the two
 * lowest-frequency unvisited nodes into a new parent stored at index
 * characterCount+i, so the root ends up at 2*characterCount-2.
 * With zero or one leaf there is nothing to merge.
 */
void creatHufmanTree(u16 characterCount, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 i;
    u32 leftChild;
    u32 rightChild;
    u16 count = characterCount;

    if (NULL == hufmanTreeNode) {
        return;
    }
    /* `i + 1 < count` stays correct for count == 0, unlike `i < count - 1`. */
    for (i = 0; i + 1 < count; i++) {
        leftChild = searchMinimumNode(count + i, hufmanTreeNode);
        rightChild = searchMinimumNode(count + i, hufmanTreeNode);
        hufmanTreeNode[count + i].visited = FALSE;
        hufmanTreeNode[count + i].hufmanCode = NULL;
        hufmanTreeNode[count + i].leftChild = leftChild;
        hufmanTreeNode[count + i].rightChild = rightChild;
        /* '@' is only a placeholder character for internal nodes. */
        hufmanTreeNode[count + i].attribute.character = '@';
        hufmanTreeNode[count + i].attribute.frequency =
                hufmanTreeNode[leftChild].attribute.frequency +
                hufmanTreeNode[rightChild].attribute.frequency;
    }
}
/*
 * Print the first `count` entries of the node array as a table:
 * character, frequency, left child, right child and code (or "NULL").
 */
void showHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    u32 row = 0;

    printf("字符 頻度 左孩子 右孩子 編碼\n");
    while (row < count) {
        HUFMAN_TREE_NODE *node = &hufmanTreeNode[row];

        printf("%-5c %-5d %-7d %-7d %-10s\n",
                node->attribute.character,
                node->attribute.frequency,
                node->leftChild,
                node->rightChild,
                node->hufmanCode == NULL ? "NULL" : node->hufmanCode);
        row++;
    }
}
/*
 * Free the code buffers of the first `count` nodes and then the node array.
 * A NULL array is ignored.
 */
void destoryHufmanTreeNode(u32 count, HUFMAN_TREE_NODE *hufmanTreeNode) {
    if (hufmanTreeNode != NULL) {
        u32 slot = 0;

        while (slot < count) {
            free(hufmanTreeNode[slot].hufmanCode);
            slot++;
        }
        free(hufmanTreeNode);
    }
}
/*
 * Allocate the node array (2*characterCount-1 nodes) and initialise the
 * leaves from the frequency table.
 *
 * Each leaf gets a code buffer of characterCount bytes and child indices
 * of -1. orientate[c] is set to the leaf index of byte value c.
 *
 * Returns the array (free with destoryHufmanTreeNode), or NULL when
 * characterCount is 0, attributeList is NULL, or an allocation fails.
 */
HUFMAN_TREE_NODE *initHufmanTreeNode(u16 characterCount, u32 *orientate, ATTRIBUTE *attributeList) {
    u32 i;
    u32 nodeCount;
    HUFMAN_TREE_NODE *hufmanTreeNode;

    /* 0 * 2 - 1 would turn into a huge unsigned node count. */
    if (0 == characterCount || NULL == attributeList) {
        return NULL;
    }
    nodeCount = characterCount * 2 - 1;
    hufmanTreeNode = (HUFMAN_TREE_NODE *) calloc(nodeCount, sizeof(HUFMAN_TREE_NODE));
    if (NULL == hufmanTreeNode) {
        return NULL;
    }
    for (i = 0; i < characterCount; i++) {
        hufmanTreeNode[i].visited = FALSE;
        hufmanTreeNode[i].hufmanCode = (u8 *) calloc(characterCount, sizeof(u8));
        if (NULL == hufmanTreeNode[i].hufmanCode) {
            /* Undo the partial construction before reporting failure. */
            while (i > 0) {
                free(hufmanTreeNode[--i].hufmanCode);
            }
            free(hufmanTreeNode);
            return NULL;
        }
        hufmanTreeNode[i].leftChild = hufmanTreeNode[i].rightChild = -1;
        hufmanTreeNode[i].attribute = attributeList[i];
        orientate[attributeList[i].character] = i;
    }
    return hufmanTreeNode;
}
/* Print every (frequency, character) pair of the frequency table. */
void showAttributeList(u16 characterCount, ATTRIBUTE *attributeList) {
    u16 row = 0;

    while (row < characterCount) {
        ATTRIBUTE *entry = &attributeList[row];

        printf("頻度:%d 符號:%c\n", entry->frequency, entry->character);
        row++;
    }
}
/*
 * Release the frequency table.
 * Passing NULL is allowed; free(NULL) is a no-op.
 */
void destoryAttributeList(ATTRIBUTE *attributeList) {
    free(attributeList);
}
/*
 * Read the header and the frequency table from a compressed file.
 *
 * bitCount       - receives the number of code bits stored in the file.
 * characterCount - receives the number of frequency-table entries.
 * sourceFilename - compressed file; opened in binary mode.
 *
 * Both outputs are set to 0 on failure. Returns the calloc'ed frequency
 * table (free with destoryAttributeList), or NULL when the file cannot be
 * opened, does not start with "TYZ", is truncated, or allocation fails.
 */
ATTRIBUTE *initAttributeList(u32 *bitCount, u16 *characterCount, const char *sourceFilename) {
    FILE *fpIn;
    ATTRIBUTE *attributeList = NULL;
    FILE_HEAD fileHead = {0};

    *characterCount = 0;
    *bitCount = 0;

    /* The header is binary data, so text mode must not be used. */
    fpIn = fopen(sourceFilename, "rb");
    if (NULL == fpIn) {
        return NULL;
    }
    if (1 != fread(&fileHead, sizeof(FILE_HEAD), 1, fpIn)
            || fileHead.flag[0] != 'T'
            || fileHead.flag[1] != 'Y'
            || fileHead.flag[2] != 'Z') {
        printf("文件無法識別^-^\n");
        fclose(fpIn);
        return NULL;
    }
    if (0 == fileHead.characterCount) {
        fclose(fpIn);
        return NULL;
    }
    attributeList = (ATTRIBUTE *) calloc(fileHead.characterCount, sizeof(ATTRIBUTE));
    if (NULL == attributeList) {
        fclose(fpIn);
        return NULL;
    }
    /* A short read means the frequency table is truncated. */
    if (fileHead.characterCount
            != fread(attributeList, sizeof(ATTRIBUTE), fileHead.characterCount, fpIn)) {
        printf("文件無法識別^-^\n");
        free(attributeList);
        fclose(fpIn);
        return NULL;
    }
    *characterCount = fileHead.characterCount;
    *bitCount = fileHead.bitCount;
    fclose(fpIn);
    return attributeList;
}
/*
 * Restore a compressed file:
 * read the stored frequency table, rebuild the same Huffman tree the
 * compressor used, then decode the bit stream into targetFilename.
 * Nothing is written when the source is not a valid compressed file.
 */
void decompress(const char *sourceFilename, const char *targetFilename) {
    u8 code[256];
    u16 characterCount = 0;
    u32 bitCount = 0;
    u32 orientate[256] = {0};
    ATTRIBUTE *attributeList = NULL;
    HUFMAN_TREE_NODE *hufmanTreeNode = NULL;

    attributeList = initAttributeList(&bitCount, &characterCount, sourceFilename);
    /* NULL means the header was rejected; the counts must not be used then. */
    if (NULL == attributeList || 0 == characterCount) {
        destoryAttributeList(attributeList);
        return;
    }
    /* Debug aid: showAttributeList(characterCount, attributeList) prints the table. */

    /* Build the tree exactly once; a second call would leak the first tree. */
    hufmanTreeNode = initHufmanTreeNode(characterCount, orientate, attributeList);
    if (NULL == hufmanTreeNode) {
        destoryAttributeList(attributeList);
        return;
    }
    creatHufmanTree(characterCount, hufmanTreeNode);
    creatHufmanCode(code, 0, 2*characterCount-2, hufmanTreeNode);
    /* Debug aid: showHufmanTreeNode(2*characterCount-1, hufmanTreeNode) prints the codes. */

    decoding(sourceFilename, targetFilename, hufmanTreeNode, characterCount, bitCount);

    destoryAttributeList(attributeList);
    /* Only the leaves own a code buffer, so characterCount is the right count. */
    destoryHufmanTreeNode(characterCount, hufmanTreeNode);
}
/*
 * Entry point of the decompressor.
 * Usage: hufDecompress <source> <restored-file>
 * Returns 0 on success, -1 on bad usage, -2 if the source cannot be
 * opened, -3 if a file name does not fit the local buffers.
 */
int main(int argc, char const *argv[]) {
    char sourceFilename[200];
    char targetFilename[200];

    if (argc != 3) {
        printf("用法:hufDecompress 源文件 還原文件\n");
        return -1;
    }
    /* argv is user input: check the lengths before copying into fixed buffers. */
    if (strlen(argv[1]) >= sizeof(sourceFilename)
            || strlen(argv[2]) >= sizeof(targetFilename)) {
        printf("file name too long\n");
        return -3;
    }
    strcpy(sourceFilename, argv[1]);
    if (!isFileExist(sourceFilename)) {
        printf("source file[%s] did not exist\n", sourceFilename);
        return -2;
    }
    strcpy(targetFilename, argv[2]);
    decompress(sourceFilename, targetFilename);
    return 0;
}
#pragma pack(pop)
E. 運行結果
可以看到生成的gra.bmp和源文件是一模一樣大小的,解壓縮的十分完美。
Ⅴ 一些補充
有的同學可能會有疑惑,我在程序的開頭寫的僞指令
#pragma pack(push)
#pragma pack(1)
結尾也有一行。這個是爲了阻止內存對齊模式,使得壓縮達到更高的效率,如果你對內存對齊模式有疑問,可以看我的下面這篇文章👇
【C語言基礎】->內存對齊模式->爲什麼我的結構體大小我猜不透
關於這個項目,還牽扯到以下幾個知識點:
【C語言基礎】->位運算詳細解析->位運算的使用
【C語言基礎】->文件操作詳解->一篇文章讀懂關於文件的龐雜函數使用
【C語言基礎】->遞歸調用->八皇后問題
【C語言->數據結構與算法】->樹與二叉樹概念&哈夫曼樹的構造
【C語言->數據結構與算法】->哈夫曼壓縮&解壓縮->第一階段->哈夫曼編碼&解碼的實現
另,感謝朱洪先生的指導。