赫夫曼編碼\譯碼

赫夫曼編碼

通過赫夫曼編碼可以節省存儲空間,在計算機科學中有廣泛的應用。本文程序生成的文件也得到了有效的壓縮,其中應用了大量的位操作。這些操作用C語言寫多少有點不方便。
以下是hfmTree.h的內容,這是個公共的頭文件,其餘源文件都需要包含它。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * One node of the Huffman tree.
 *
 * The whole struct is written to and read back from the tree file as raw
 * bytes, so the order and types of the fields define the on-disk layout.
 */
struct Node {
	char ch;              /* symbol stored in the node */
	int weight;           /* weight (frequency) of the node */
	/*
	 * File offsets used only while the tree is saved/loaded. Keeping them
	 * in every in-memory node wastes space, but the author did not want to
	 * maintain a separate struct for them.
	 */
	int filepoint;        /* offset of this node in the tree file */
	int lf;               /* offset of the left child, 0 when absent */
	int rf;               /* offset of the right child, 0 when absent */
	struct Node *lchild;  /* left child, NULL when absent */
	struct Node *rchild;  /* right child, NULL when absent */
};

/*
 * One entry of the code table used while encoding: the bit pattern that
 * represents a single symbol. The table is stored in the tree file as raw
 * bytes, so the field order is part of the file format.
 */
struct Code {
	char ch;            /* symbol this entry encodes */
	unsigned int code;  /* code bits, held in the low `length` bits */
	int length;         /* number of bits in the code; 0 marks an unused entry */
};

以下是Init.c中的內容,這個文件主要是根據輸入的數據建立赫夫曼樹,並保存在hfmTree文件中,這個文件編碼和譯碼要用到
#include "hfmTree.h"

/* State shared by the functions of Init.c. */
int n=0;             /* number of symbols, read by input() */
int filepoint=0;     /* running byte offset used while laying out the tree file */
struct Code* code;   /* code table with n entries, allocated in main() */
struct Node* root;   /* root of the Huffman tree built by input() */

/* Building the tree and the code table. */
int input();
int toCode(struct Node* root,int ncode,int length);
int showCode();

/* Saving the tree and the code table. */
int writetofile(const char *filename);
static int makefilepoint(struct Node* root);
static int writefilepoint(struct Node* root,FILE* fout);

/* Cleanup. */
int deltree(struct Node* tree);

/*
 * Init.c entry point: read the symbols and weights, build the Huffman tree,
 * derive the code table, save both to the file "hfmTree" and print the codes.
 *
 * Returns 0 on success and 1 when the input, the allocation of the code
 * table or the write of the tree file fails.
 */
int main() {
	int rc=0;

	if(input()!=0||root==NULL||n<=0) {
		printf("invalid input!\n");
		return 1;
	}
	/* calloc leaves every length at 0, which toCode() treats as "slot unused". */
	code=calloc((size_t)n,sizeof *code);
	if(code==NULL) {
		printf("out of memory!\n");
		deltree(root);
		return 1;
	}
	/* The code table must be filled before writetofile() stores it. */
	toCode(root,0,0);
	if(writetofile("hfmTree")!=0)
		rc=1;
	showCode();
	free(code);
	deltree(root);
	return rc;
}


/*
 * Print every entry of the global code table as "<symbol>:<bits>", one entry
 * per line, most significant code bit first.
 *
 * Returns 0.
 */
int showCode() {
	int i;
	for(i=0; i<n; i++) {
		int j;
		printf("%c:",code[i].ch);
		for(j=code[i].length-1; j>=0; j--) {
			/* Unsigned mask: shifting a signed 1 into bit 31 is undefined. */
			if((code[i].code)&(1u<<j))
				printf("1");
			else
				printf("0");
		}
		printf("\n");
	}
	return 0;
}

/*
 * Walk the tree and fill the global code table.
 *
 * root   - subtree to process
 * ncode  - code bits accumulated on the way down (left appends 0, right 1)
 * length - number of bits accumulated so far
 *
 * A node with two children is an inner node; any other node is treated as a
 * leaf and stored in the first table entry whose length is still 0.
 *
 * Returns 0 on success, -1 when root is NULL or the table has no free entry.
 *
 * NOTE(review): a tree consisting of a single leaf gets a code of length 0,
 * so its entry still looks unused afterwards.
 */
int toCode(struct Node* root,int ncode,int length) {
	int i;

	if(root==NULL)
		return -1;

	if(root->lchild!=NULL&&root->rchild!=NULL) {
		/* Shift as unsigned: left-shifting a signed value can overflow. */
		int next=(int)((unsigned int)ncode<<1);
		int left=toCode(root->lchild,next,length+1);
		int right=toCode(root->rchild,next|1,length+1);
		return (left!=0||right!=0)?-1:0;
	}

	for(i=0; i<n; i++) {
		if(code[i].length==0)
			break;
	}
	if(i==n)
		return -1; /* no free slot: do not write past the end of the table */

	code[i].ch=root->ch;
	code[i].code=ncode;
	code[i].length=length;
	return 0;
}

/*
 * Save the tree and the code table to `filename`.
 *
 * File layout as written here:
 *   offset 0  int  n           number of symbols
 *   offset 4  int  table_off   offset of the code table
 *   offset 8  ...              tree nodes (raw struct Node), root first
 *   table_off ...              n raw struct Code entries
 *
 * The offsets 4 and 8 are kept as literals because the readers of this file
 * use the same literals.
 *
 * Returns 0 on success, -1 when the file cannot be opened or written.
 */
int writetofile(const char *filename) {
	int rc=0;
	FILE* fout=fopen(filename,"wb");
	if(fout==NULL) {
		printf("file open error!\n");
		return -1;
	}
	if(fwrite(&n,sizeof(int),1,fout)!=1)
		rc=-1;

	/* Assign a file offset to every node, then write each node there. */
	filepoint=8;
	makefilepoint(root);
	writefilepoint(root,fout);

	/* After the layout pass, filepoint is the first byte behind the nodes. */
	if(fseek(fout,4,SEEK_SET)!=0||fwrite(&filepoint,sizeof(int),1,fout)!=1)
		rc=-1;
	if(fseek(fout,filepoint,SEEK_SET)!=0)
		rc=-1;
	else if(n>0&&code!=NULL
	        &&fwrite(code,sizeof(struct Code),(size_t)n,fout)!=(size_t)n)
		rc=-1;

	/* Catches failures inside writefilepoint(), whose result is not reported. */
	if(ferror(fout))
		rc=-1;
	if(fclose(fout)!=0)
		rc=-1;
	if(rc!=0)
		printf("file write error!\n");
	return rc;
}
/*
 * Write `root` and all nodes below it to `fout`, each at the offset stored
 * in its filepoint field (node first, then left subtree, then right).
 * Always returns 0.
 */
static int writefilepoint(struct Node* root,FILE* fout){
	if(root==NULL)
		return 0;

	fseek(fout,root->filepoint,SEEK_SET);
	fwrite(root,sizeof(struct Node),1,fout);
	writefilepoint(root->lchild,fout);
	writefilepoint(root->rchild,fout);
	return 0;
}
/*
 * Assign file offsets to `root` and every node below it.
 *
 * Each node takes the current value of the global `filepoint`, which is then
 * advanced by sizeof(struct Node). lf/rf receive the offset of the left/right
 * child, or 0 when that child does not exist. Always returns 0.
 */
static int makefilepoint(struct Node* root) {
	struct Node* left;
	struct Node* right;

	if(root==NULL)
		return 0;

	left=root->lchild;
	right=root->rchild;

	root->filepoint=filepoint;
	filepoint+=sizeof(struct Node);

	/* The left subtree is laid out directly behind its parent. */
	root->lf=0;
	if(left!=NULL) {
		root->lf=filepoint;
		makefilepoint(left);
	}

	/* The right subtree follows everything the left subtree used. */
	root->rf=0;
	if(right!=NULL) {
		root->rf=filepoint;
		makefilepoint(right);
	}
	return 0;
}

/*
 * Free `tree` and every node below it (children before the parent).
 * A NULL tree is accepted. Always returns 0.
 */
int deltree(struct Node* tree) {
	struct Node* left;
	struct Node* right;

	if(tree==NULL)
		return 0;

	left=tree->lchild;
	right=tree->rchild;
	deltree(left);
	deltree(right);
	free(tree);
	return 0;
}
int deltree(struct Node* tree);

/* Discard the rest of the current input line (portable stand-in for fflush(stdin)). */
static void skip_rest_of_line(void) {
	int c;
	do {
		c=getchar();
	} while(c!='\n'&&c!=EOF);
}

/*
 * Return the index of the first non-NULL entry with the smallest weight,
 * ignoring index `skip`; -1 when no such entry exists.
 */
static int lightest_entry(struct Node** table,int count,int skip) {
	int best=-1;
	int i;
	for(i=0; i<count; i++) {
		if(table[i]==NULL||i==skip)
			continue;
		if(best<0||table[i]->weight<table[best]->weight)
			best=i;
	}
	return best;
}

/* Free every subtree still held in the work table, then the table itself. */
static void release_table(struct Node** table,int count) {
	int i;
	for(i=0; i<count; i++)
		deltree(table[i]);
	free(table);
}

/*
 * Read the symbol count and one "<char> <weight>" pair per line from stdin,
 * then build the Huffman tree by repeatedly merging the two lightest
 * subtrees. On success the global `root` points at the tree and `n` holds
 * the symbol count.
 *
 * Returns 0 on success. On malformed input or allocation failure it frees
 * everything it allocated, sets n to 0 and root to NULL, and returns -1.
 */
int input() {
	int i;
	int live;
	struct Node* temp;
	struct Node** table;

	printf("Input:n=");
	if(scanf("%d",&n)!=1||n<=0) {
		n=0;
		root=NULL;
		return -1;
	}
	/* calloc: unfilled slots stay NULL so cleanup can run at any point. */
	table=calloc((size_t)n,sizeof *table);
	if(table==NULL) {
		n=0;
		root=NULL;
		return -1;
	}
	printf("example(they are in different line):w 34\ne 56\n");
	for(i=0; i<n; i++) {
		/* Drop what is left of the previous line so %c reads the symbol. */
		skip_rest_of_line();
		temp=calloc(1,sizeof *temp);
		if(temp==NULL||scanf("%c %d",&(temp->ch),&(temp->weight))!=2) {
			free(temp);
			release_table(table,n);
			n=0;
			root=NULL;
			return -1;
		}
		table[i]=temp;
	}

	/* Each merge removes one entry, so n-1 merges leave exactly one tree. */
	for(live=n; live>1; live--) {
		int sm1=lightest_entry(table,n,-1);
		int sm2=lightest_entry(table,n,sm1);
		temp=calloc(1,sizeof *temp);
		if(temp==NULL) {
			release_table(table,n);
			n=0;
			root=NULL;
			return -1;
		}
		temp->weight=table[sm1]->weight+table[sm2]->weight;
		temp->lchild=table[sm1];
		temp->rchild=table[sm2];
		table[sm2]=NULL;
		table[sm1]=temp;
	}
	root=table[lightest_entry(table,n,-1)];
	free(table);
	return 0;
}
以下是Encod.c的內容,這個文件主要是根據建立的赫夫曼樹進行編碼,要讀的文件是A.txt,寫出的文件是B.dat。當數據量較大時,通過試驗10個阿拉伯數字的編碼,B.dat的大小約爲A.txt一半。
#include "hfmTree.h"

/* State shared by the functions of Encod.c. */
int n=0;             /* number of entries in the code table */
struct Code* code;   /* code table loaded by readcode() */

/* Loading and showing the code table. */
int readcode(const char* filename);
int showCode();

/* Encoding. */
int Encoding(char c);
int Encodtofile(const char* fnin,const char* fnout);

/* Declared here as well; no definition appears in this part of the program. */
int deltree(struct Node* tree);

/*
 * Encod.c entry point: load the code table from "hfmTree", print it, and
 * encode "A.txt" into "B.dat".
 *
 * Returns 0 on success and 1 when the table cannot be loaded or encoding fails.
 */
int main() {
	int rc=0;

	if(readcode("hfmTree")!=0)
		return 1;
	showCode();
	if(Encodtofile("A.txt","B.dat")!=0)
		rc=1;
	free(code);
	return rc;
}

/*
 * Look up `c` in the global code table.
 * Returns the index (0 .. n-1) of the first matching entry, or n when the
 * character is not in the table.
 */
int Encoding(char c) {
	int idx=0;
	while(idx<n&&code[idx].ch!=c)
		idx++;
	return idx;
}
/*
 * Encode the text file `fnin` with the global code table and write the bit
 * stream to `fnout`.
 *
 * Output layout: the first int holds the number of valid bits in the last
 * data word; 32-bit data words follow from offset 4. Codes are packed from
 * the most significant bit of each word. Words are written with fwrite in
 * host byte order, so on a little-endian machine the bytes of each word
 * appear reversed in a hex dump.
 *
 * Characters without a table entry are reported with "Unknown Code!" and
 * skipped.
 *
 * Returns 0 on success, -1 when a file cannot be opened or written.
 */
int Encodtofile(const char* fnin,const char* fnout) {
	FILE* fin;
	FILE* fout;
	unsigned int buf=0;  /* bits collected for the current word (low `length` bits) */
	int length=0;        /* number of bits currently held in buf */
	int ch;              /* int, so that EOF is distinct from every character */
	int rc=0;

	fin=fopen(fnin,"r");
	if(fin==NULL) {
		printf("file open error!\n");
		return -1;
	}
	fout=fopen(fnout,"wb");
	if(fout==NULL) {
		printf("file open error!\n");
		fclose(fin);
		return -1;
	}

	/* Leave room for the header, which is filled in at the end. */
	if(fseek(fout,4,SEEK_SET)!=0)
		rc=-1;

	/* Test the fgetc result itself; feof() only turns true after a failed read. */
	while((ch=fgetc(fin))!=EOF) {
		int i=Encoding((char)ch);
		unsigned int bits;
		int clen;
		if(i==n) {
			printf("Unknown Code!\n");
			continue;
		}
		bits=code[i].code;
		clen=code[i].length;
		if((length+clen)>32) {
			/* The code does not fit: its top bits complete the current
			 * word, the low `temp` bits start the next one. */
			int temp=clen-32+length;
			unsigned int high=(temp<32)?(bits>>temp):0u;
			unsigned int low=(temp<32)?(bits&((1u<<temp)-1u)):bits;
			buf=(buf<<(32-length))|high;
			if(fwrite(&buf,sizeof(int),1,fout)!=1)
				rc=-1;
			buf=low;
			length=temp;
			continue;
		}
		/* A shift by the full width is undefined, so handle clen==32 apart. */
		buf=(clen<32)?((buf<<clen)|bits):bits;
		length+=clen;
	}

	/* Left-align the last, possibly partial, word. */
	buf=(length>0)?(buf<<(32-length)):0u;
	if(fwrite(&buf,sizeof(int),1,fout)!=1)
		rc=-1;
	if(fseek(fout,0,SEEK_SET)!=0||fwrite(&length,sizeof(int),1,fout)!=1)
		rc=-1;

	if(ferror(fin))
		rc=-1;
	fclose(fin);
	if(fclose(fout)!=0)
		rc=-1;
	return rc;
}

/*
 * Print every entry of the global code table as "<symbol>:<bits>", one entry
 * per line, most significant code bit first.
 *
 * Returns 0.
 */
int showCode() {
	int i;
	for(i=0; i<n; i++) {
		int j;
		printf("%c:",code[i].ch);
		for(j=code[i].length-1; j>=0; j--) {
			/* Unsigned mask: shifting a signed 1 into bit 31 is undefined. */
			if((code[i].code)&(1u<<j))
				printf("1");
			else
				printf("0");
		}
		printf("\n");
	}
	return 0;
}

int readcode(const char* filename){
	FILE* fin=fopen(filename,"rb");
	if(fin==NULL) {
		printf("file open error!\n");
		return -1;
	}
	int offset;
	fread(&n,sizeof(int),1,fin);
	fread(&offset,sizeof(int),1,fin);
	code=(struct Code*)malloc(sizeof(struct Code)*n);
	fseek(fin,offset,SEEK_SET);
	fread(code,sizeof(struct Code)*n,1,fin);
	fclose(fin);
	return 0;
}

以下是Decod.c的內容,用於譯碼。將上一步生成的文件B.dat譯碼成C.txt,通過比較C.txt和A.txt的內容,前面基本一致,但在結尾處沒能處理好,出現了差異。
#include "hfmTree.h"

/* State shared by the functions of Decod.c. */
int n=0;             /* first int of the tree file, stored by readtree() */
struct Node* root;   /* root of the Huffman tree loaded by readtree() */

/* Decoding. */
int Dcoding(int temp,struct Node* root,int dep);
int Dcodingfromfile(const char* fnin,const char* fnout);
int Dcodingfromfile2(const char* fnin,const char* fnout);

/* Loading and freeing the tree. */
int readtree(const char* filename);
int readnode(FILE* fin,struct Node* root,int seek);
/* main() calls deltree() before its definition, so it needs a prototype. */
int deltree(struct Node* tree);

/*
 * Decod.c entry point: load the tree from "hfmTree" and decode "B.dat"
 * into "C.txt".
 *
 * Returns 0 on success and 1 when the tree cannot be loaded or decoding fails.
 */
int main() {
	int rc=0;

	if(readtree("hfmTree")!=0) {
		/* Only the root node itself can exist here; free(NULL) is harmless. */
		free(root);
		root=NULL;
		return 1;
	}
	if(Dcodingfromfile2("B.dat","C.txt")!=0)
		rc=1;
	deltree(root);
	return rc;
}
/*
 * Decode the bit stream in `fnin` with the global tree `root` and write the
 * decoded characters to `fnout`.
 *
 * The first int of the input is skipped at the start and read again at the
 * end to size the tail; the data words follow from offset 4. Bits are
 * consumed from the most significant end of `buf`, which is refilled from
 * `buf2`, which in turn is refilled from the file through `buf3`.
 *
 * Returns 0 when finished, -1 when a file cannot be opened.
 *
 * NOTE(review): no fread result is checked, so with fewer than two data
 * words buf2 (and later buf3) may be used without having been read.
 * NOTE(review): the accompanying article states that the end of the decoded
 * output differs from the original text; the tail arithmetic below is
 * documented as written, not verified.
 */
int Dcodingfromfile2(const char* fnin,const char* fnout){
	FILE* fin,*fout;
	fin=fopen(fnin,"rb");
	fout=fopen(fnout,"wb");
	if(fin==NULL||fout==NULL) {
		printf("file open error!\n");
		return -1;
	}
	/* Skip the header int; data words start at offset 4. */
	fseek(fin,4,SEEK_SET);
	unsigned int buf,buf2,buf3;
	int ret;
	fread(&buf,sizeof(int),1,fin);
	fread(&buf2,sizeof(int),1,fin);
	int pos2=32;//number of valid bits left in the second buffer (buf2)
	while(1){
		/* Dcoding packs the character in the low bits and the number of
		 * consumed bits (tree depth) in the upper 16 bits. */
		ret=Dcoding(buf,root,0);
		char ch=(char)(ret&0xff);
		fwrite(&ch,sizeof(char),1,fout);
		ret=(ret&0xffff0000)>>16;
		/* Drop the consumed bits; the low `ret` bits of buf are now free. */
		buf=buf<<ret;
		if(pos2>ret){
			/* buf2 holds enough bits: move its top `ret` bits into buf. */
			buf=buf|(buf2>>(32-ret));
			buf2=buf2<<ret;
			pos2=pos2-ret;
		}else{
			/* buf2 runs short: read the next word and top buf2 up with the
			 * high bits of buf3 before refilling buf. */
			fread(&buf3,sizeof(int),1,fin);
			buf2=buf2|((buf3&(0xffffffff<<pos2))>>pos2);
			buf=buf|(buf2>>(32-ret));
			buf2=buf2<<ret;
			//
			
			/* Append the remaining low `pos2` bits of buf3 behind the bits
			 * still held in buf2. */
			buf2=buf2|((buf3&(0xffffffff>>(32-pos2)))<<(ret-pos2));
			//buf2=buf2|(buf3&(0xffffffff>>(32-pos2)));
			//pos2=32-pos2;
			pos2=32-ret+pos2;
			/* Leave the main loop once the read above hit end of file. */
			if(feof(fin)){
				break;
			}
		}
	}
	/* Tail: derive how many bits remain to decode from the header value and
	 * the state left by the main loop. */
	int pos3,oldpos2=pos2-32+ret;
	fseek(fin,0,SEEK_SET);
	fread(&pos3,sizeof(int),1,fin);
	if(ret-oldpos2-pos3>0){
		pos3=ret-oldpos2-pos3;
	}else{
		pos3=32+pos3-ret+oldpos2;
	}
	/* Decode the remaining bits; buf is refilled from buf2 only while more
	 * than 32 bits remain. */
	while(pos3>0){
		ret=Dcoding(buf,root,0);
		char ch=(char)(ret&0xff);
		fwrite(&ch,sizeof(char),1,fout);
		ret=(ret&0xffff0000)>>16;
		buf=buf<<ret; 
		if(pos3>32){
			buf=buf|(buf2>>(32-ret));
			buf2=buf2<<ret;
		}
		pos3=pos3-ret;
	}
	fclose(fin);
	fclose(fout); 
	return 0;
}
/*
 * Earlier variant of the decoder: decode `fnin` with the global tree `root`
 * and write the characters to `fnout`, using two 32-bit buffers.
 *
 * Returns 0 when finished, -1 when a file cannot be opened.
 *
 * The main() shown with this file calls Dcodingfromfile2 instead. As the
 * comment near the end says, the trailing bytes are not decoded here: the
 * loop stops as soon as a read hits end of file.
 */
int Dcodingfromfile(const char* fnin,const char* fnout){
	FILE* fin,*fout;
	fin=fopen(fnin,"rb");
	fout=fopen(fnout,"wb");
	if(fin==NULL||fout==NULL) {
		printf("file open error!\n");
		return -1;
	}
	int pos1=32,pos2=32;//valid bits
	unsigned int buf=0;//two-level buffering
	unsigned int buf2=0;
	
	//int pos=0;
	/* Skip the header int; data words start at offset 4. */
	fseek(fin,4,SEEK_SET);
	fread(&buf,sizeof(int),1,fin);
	fread(&buf2,sizeof(int),1,fin);
	while(1){
		int ret=Dcoding(buf,root,0);
		/* Writes the first byte of `ret` as stored in memory. */
		fwrite(&ret,sizeof(char),1,fout);
		ret=(ret&0xffff0000)>>16;//number of bits to shift in
		buf=buf<<ret;
		if(pos2>=ret){
			/* buf2 holds enough bits: move its top `ret` bits into buf. */
			buf=buf&(0xffffffff<<ret);
			buf=buf|((buf2&((0xffffffff)<<(32-ret)))>>(32-ret));
			buf2=buf2<<ret;
			buf2=buf2&(0xffffffff<<ret);
			pos2-=ret;
		}else{
			/* buf2 runs short: use its remaining pos2 bits, then take the
			 * missing pos1 bits from the next word of the file. */
			pos1=ret-pos2;
			buf=buf&(0xffffffff<<ret);
			buf=buf+((buf2&((0xffffffff)<<(32-pos2)))>>(32-ret));
			
			//pos1=pos1-ret+pos2;
			//pos2=0;
			fread(&buf2,sizeof(int),1,fin);
			if(feof(fin)){
				/* End of data: load the header int into pos2 and stop. */
				fseek(fin,0,SEEK_SET);
				fread(&pos2,sizeof(int),1,fin);
				break;
			}
			buf=buf+((buf2&((0xffffffff)<<(32-pos1)))>>(32-ret));
			buf2=buf2<<pos1;
			buf2=buf2&(0xffffffff<<pos1);
			pos2=32-pos1;
			pos1=32;
		}
	}
	//still need to handle the trailing bytes (no more than 8)
	fclose(fin);
	fclose(fout);
	return 0;
}

/*
 * Decode one symbol from the bit pattern `temp`, starting at `root`.
 *
 * temp - 32 code bits, consumed from the most significant bit downwards
 *        (0 selects the left child, 1 the right child)
 * root - node to start from
 * dep  - number of bits already consumed
 *
 * Returns the decoded character in the low 8 bits and the total number of
 * consumed bits in the upper 16 bits.
 */
int Dcoding(int temp,struct Node* root,int dep){
	/* Work on an unsigned copy: left-shifting a negative int is undefined. */
	unsigned int bits=(unsigned int)temp;

	while(root->lchild!=NULL||root->rchild!=NULL){
		root=(bits&0x80000000u)?root->rchild:root->lchild;
		bits<<=1;
		dep++;
	}
	/* Convert through unsigned char: adding a negative char would borrow
	 * from the depth stored in the upper half. */
	return (int)(unsigned char)root->ch+(dep<<16);
}

/* Free a (possibly partially loaded) subtree, children first. */
static void drop_subtree(struct Node* node){
	if(node==NULL)
		return;
	drop_subtree(node->lchild);
	drop_subtree(node->rchild);
	free(node);
}

/*
 * Load the node stored at byte offset `seek` of `fin` into `root`, then load
 * its children recursively from the offsets in lf/rf (0 means no child).
 *
 * The child pointers stored in the file are meaningless in this process and
 * are always replaced. A child offset must lie behind its parent's offset;
 * makefilepoint() assigns offsets that way, and the check keeps a damaged
 * file from causing endless recursion.
 *
 * Returns 0 on success, -1 on read, allocation or format errors. Even on
 * failure every loaded node has valid (possibly NULL) child pointers, so the
 * partial tree can be freed safely.
 */
int readnode(FILE* fin,struct Node* root,int seek){
	if(fseek(fin,seek,SEEK_SET)!=0
	   ||fread(root,sizeof(struct Node),1,fin)!=1){
		root->lf=0;
		root->rf=0;
		root->lchild=NULL;
		root->rchild=NULL;
		return -1;
	}
	root->lchild=NULL;
	root->rchild=NULL;

	if(root->lf!=0){
		if(root->lf<=seek)
			return -1;
		root->lchild=malloc(sizeof(struct Node));
		if(root->lchild==NULL)
			return -1;
		if(readnode(fin,root->lchild,root->lf)!=0)
			return -1;
	}
	if(root->rf!=0){
		if(root->rf<=seek)
			return -1;
		root->rchild=malloc(sizeof(struct Node));
		if(root->rchild==NULL)
			return -1;
		if(readnode(fin,root->rchild,root->rf)!=0)
			return -1;
	}
	return 0;
}

/*
 * Load the Huffman tree from `filename` into the global `root` and store the
 * first int of the file in the global `n`. The root node is read from
 * offset 8.
 *
 * Returns 0 on success. On failure it returns -1, frees whatever was loaded
 * and leaves root == NULL.
 */
int readtree(const char* filename){
	int rc=-1;
	FILE* fin=fopen(filename,"rb");
	root=NULL;
	if(fin==NULL) {
		printf("file open error!\n");
		return -1;
	}
	if(fread(&n,sizeof(int),1,fin)==1){
		root=malloc(sizeof(struct Node));
		if(root!=NULL)
			rc=readnode(fin,root,8);
	}
	fclose(fin);
	if(rc!=0){
		printf("file read error!\n");
		drop_subtree(root);
		root=NULL;
	}
	return rc;
}

/*
 * Free `tree` and every node below it (children before the parent).
 * A NULL tree is accepted. Always returns 0.
 */
int deltree(struct Node* tree) {
	if(tree!=NULL) {
		deltree(tree->lchild);
		deltree(tree->rchild);
		free(tree);
	}
	return 0;
}

以上就是所有程序,執行的先後順序是Init.c->Encod.c->Decod.c,其中要求預先寫好A.txt,文件出現的字符應該在Init.c時輸入。對於未知的編碼,程序直接忽略,由於有一個文件結束符,文件末尾總會有一個未知的編碼,好在這並不影響測試結果。
B.dat是二進制文件,以四個字節爲一個單位,由於Intel是小端模式,用二進制查看器查看時需要注意字節順序。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章