python哈夫曼壓縮與解壓算法
壓縮
#encoding: utf-8
from bitarray import bitarray
import random
import json
class Node(object):
    """A node of the Huffman tree.

    Leaves carry one symbol in ``char``; the internal nodes built by
    ``getTree`` carry the concatenation of their children's ``char`` values.

    Args:
        char: Symbol (or concatenated symbols) covered by this node.
        times: Occurrence count used as the node's weight.
        left: Left child, or None.
        right: Right child, or None.
        parent: Parent node, or None.

    All arguments are optional, so ``Node()`` followed by attribute
    assignment keeps working.
    """

    def __init__(self, char='', times=0, left=None, right=None, parent=None):
        super().__init__()
        # Stored per instance so that no state is shared through the class.
        self.left = left
        self.right = right
        self.times = times
        self.char = char
        self.parent = parent

    def __repr__(self):
        return "Node(char=%r, times=%r)" % (self.char, self.times)
def countTimes(str):
    """Count how often each symbol occurs.

    Args:
        str: The text to analyse; any iterable of hashable symbols works.
            (The name shadows the builtin but is kept for existing callers.)

    Returns:
        A list of ``(symbol, count)`` tuples sorted by ascending count.
        The sort is stable, so symbols with equal counts stay in the order
        the dict yields them (first appearance on Python 3.7+).
    """
    times = {}
    for symbol in str:
        times[symbol] = times.get(symbol, 0) + 1
    return sorted(times.items(), key=lambda item: item[1])
def getTree(data):
    """Build a Huffman tree from weighted symbols and return its root.

    Args:
        data: Iterable of ``(symbol, count)`` items, e.g. the output of
            ``countTimes``.

    Returns:
        The root ``Node`` (holding the root is holding the whole tree), or
        None when ``data`` is empty.  Input with a single symbol yields a
        root whose only (left) child is that symbol's leaf, so the symbol
        still receives a one-bit code instead of the tree being None.
    """
    import heapq  # local import: only this function needs it

    # Heap entries are (weight, serial, node).  The serial number reproduces
    # the tie-breaking of a repeated stable sort: among equal weights, older
    # nodes come first and freshly merged nodes come last.
    heap = []
    for item in data:
        leaf = Node()
        leaf.char = item[0]
        leaf.times = item[1]
        heap.append((leaf.times, len(heap), leaf))

    if not heap:
        return None

    if len(heap) == 1:
        only = heap[0][2]
        root = Node()
        root.times = only.times
        root.char = only.char
        root.left = only
        only.parent = root
        return root

    heapq.heapify(heap)
    serial = len(heap)
    while len(heap) >= 2:
        # The two lightest nodes are merged; the lighter becomes the left child.
        first = heapq.heappop(heap)[2]
        second = heapq.heappop(heap)[2]
        merged = Node()
        merged.times = first.times + second.times
        merged.char = first.char + second.char
        merged.left = first
        merged.right = second
        first.parent = merged
        second.parent = merged
        # The merged node competes again in the following rounds.
        heapq.heappush(heap, (merged.times, serial, merged))
        serial += 1
    return heap[0][2]
def encode(head, code="", table=None):
    """Assign a Huffman code to every leaf below ``head``.

    Going to a left child appends "0" to the code, going to a right child
    appends "1".  Only leaves (nodes without children) receive an entry.

    Args:
        head: Root of the (sub)tree; None is accepted and adds nothing.
        code: Code prefix already accumulated for ``head``.
        table: Dict to fill with ``{char: code}``.  When omitted, the
            module-level ``result`` dict is filled, as callers of the
            two-argument form expect.

    Returns:
        The dict that was filled.
    """
    if table is None:
        table = result  # module-level dict prepared by the caller
    if head is None:
        return table

    # Explicit stack instead of recursion; the right child is pushed first so
    # that leaves are still recorded left-to-right.
    pending = [(head, code)]
    while pending:
        node, prefix = pending.pop()
        if node.left is None and node.right is None:
            # A leaf reached with an empty prefix is a lone root; give it a
            # real one-bit code so its occurrences are not encoded as nothing.
            table[node.char] = prefix or "0"
            continue
        if node.right is not None:
            pending.append((node.right, prefix + "1"))
        if node.left is not None:
            pending.append((node.left, prefix + "0"))
    return table
def printTree(head):
    """Print every node of the tree in post-order (children before parent).

    Args:
        head: Root node of the (sub)tree to print.
    """
    for child in (head.left, head.right):
        if child is not None:
            printTree(child)
    print("char:%s\ttimes:%d" % (head.char, head.times))
def dict2bits(dictObject, endian="little"):
    """Serialise a dict as JSON and return its UTF-8 bytes as a bitarray.

    Used to write the code table in front of the compressed payload.

    Args:
        dictObject: JSON-serialisable dict.
        endian: Bit-endianness passed to ``bitarray``.

    Returns:
        A bitarray holding the bytes of ``json.dumps(dictObject)``.

    NOTE: the reader in the decompression script treats the first "}" byte
    as the end of the table, so a table containing "}" as a symbol cannot be
    read back correctly.
    """
    payload = json.dumps(dictObject).encode("utf-8")  # encode once, not per char
    bits = bitarray(endian=endian)
    bits.frombytes(payload)
    return bits
def zipTobits(s, encodeList, endian="little"):
    """Compress ``s`` with the given Huffman code table.

    Args:
        s: Text to compress; every symbol must be a key of ``encodeList``.
        encodeList: Dict mapping each symbol to its code, a string in which
            "0" is a zero bit and any other character is a one bit.
        endian: Bit-endianness passed to ``bitarray``.

    Returns:
        A bitarray whose length is a multiple of 8.

    Raises:
        KeyError: If ``s`` contains a symbol missing from ``encodeList``.

    Padding: the last byte has to be filled up explicitly.  Plain zero bits
    can themselves spell a code word (e.g. "00"), which the decoder would
    turn into phantom symbols.  So whenever some code word is longer than
    the padding, the padding is taken from the start of the longest code
    word: in a prefix code a proper prefix of a code word never completes a
    symbol, and the decoder drops it as an unfinished fragment.  If no code
    word is long enough, zero bits are used as before and the ambiguity
    remains; removing it completely requires storing the payload length in
    the file.
    """
    bits = bitarray(endian=endian)
    for symbol in s:
        for ch in encodeList[symbol]:
            bits.append(ch != "0")

    pad = -len(bits) % 8  # bits missing to the next byte boundary (0..7)
    if pad:
        longest = max(encodeList.values(), key=len, default="")
        filler = longest[:pad] if len(longest) > pad else "0" * pad
        for ch in filler:
            bits.append(ch != "0")
    return bits
def saveBits(bits, encodeList, filepath="ziped.hfm"):
    """Write the compressed file: JSON code table first, then the payload.

    Args:
        bits: bitarray with the compressed payload.
        encodeList: Symbol-to-code dict, stored as the file header.
        filepath: Destination file; defaults to "ziped.hfm".
    """
    with open(filepath, "wb") as out:
        # tobytes() yields the exact byte string, with any unused bits of the
        # last byte set to zero.
        out.write(dict2bits(encodeList).tobytes())
        out.write(bits.tobytes())
def saveStr(str, filepath="unziped.hfm"):
    """Save the original (uncompressed) text for later comparison.

    Args:
        str: Text to write.  (The name shadows the builtin but is kept for
            existing callers.)
        filepath: Destination file; defaults to "unziped.hfm".
    """
    # An explicit encoding keeps the output independent of the platform default.
    with open(filepath, "w", encoding="utf-8") as out:
        out.write(str)
def getSeedStr(times):
    """Return a random string of ``times`` lowercase letters (a-z).

    Args:
        times: Number of characters to generate; 0 or less gives "".
    """
    low, high = ord('a'), ord('z')
    # join() avoids the quadratic cost of growing a string with += in a loop.
    return "".join(chr(random.randint(low, high)) for _ in range(times))
# Compress a random 100-letter sample and keep the plain text for comparison.
theStr = getSeedStr(100)
# The frequency table must be built from the text that is actually compressed.
sortedTimes = countTimes(theStr)
head = getTree(sortedTimes)
result = {}  # module-level code table that encode() fills in
encode(head)
bits = zipTobits(theStr, result)
saveBits(bits, result)
saveStr(theStr)
解壓
#encoding: utf-8
from bitarray import bitarray
import json
def readFile(filepath):
    """Read a compressed file and split it into code table and payload.

    The file starts with the JSON code table, which runs from the leading
    "{" up to and including the first "}"; everything after it is the
    compressed payload.

    Args:
        filepath: Path of the compressed file.

    Returns:
        ``(encodeListStr, zipedBits)``: the JSON text of the code table and
        a little-endian bitarray with the payload.  When the file does not
        start with "{" or contains no "}", "Data Error" is printed and None
        is returned.
    """
    with open(filepath, "rb") as f:
        raw = f.read()

    # Work on bytes throughout: a missing "}" is detected here instead of
    # looping forever at end of file.
    end = raw.find(b"}")
    if raw[:1] != b"{" or end == -1:
        print("Data Error")
        return None

    encodeListStr = raw[:end + 1].decode()
    zipedBits = bitarray(endian="little")
    zipedBits.frombytes(raw[end + 1:])
    return encodeListStr, zipedBits
def changeToBitArray(encodeList):
    """Convert a ``{'a': '00'}`` code table into ``{'a': bitarray('00')}``.

    The bitarray form lets code words be compared directly with bits read
    from the compressed payload.

    Args:
        encodeList: Dict mapping each symbol to its code string; "0" is a
            zero bit, any other character is a one bit.

    Returns:
        A new dict with the same keys and little-endian bitarray values.
    """
    converted = {}
    for symbol, code in encodeList.items():
        pattern = bitarray(endian="little")
        for ch in code:
            pattern.append(ch != "0")
        converted[symbol] = pattern
    return converted
def contain(bits, encodeBitArray):
    """Look up which symbol, if any, has exactly the code ``bits``.

    Args:
        bits: The bit pattern collected so far.
        encodeBitArray: Dict mapping each symbol to its code pattern.

    Returns:
        ``(True, symbol)`` for the first symbol whose pattern equals
        ``bits``, otherwise ``(False, '')``.
    """
    for symbol, pattern in encodeBitArray.items():
        if pattern == bits:
            return True, symbol
    return False, ''
def saveStr(str, filepath="decode.hfm"):
    """Save the decompressed text.

    Args:
        str: Text to write.  (The name shadows the builtin but is kept for
            existing callers.)
        filepath: Destination file; defaults to "decode.hfm".
    """
    # An explicit encoding keeps the output independent of the platform default.
    with open(filepath, "w", encoding="utf-8") as out:
        out.write(str)
def decode(encodeList, bits):
    """Decompress a bit sequence with the given Huffman code table.

    Bits are collected one at a time; as soon as the collected pattern equals
    a code word, its symbol is emitted and collection starts over.  Bits left
    over at the end that do not form a complete code word are dropped.

    Args:
        encodeList: Dict mapping each symbol to its code string; "0" is a
            zero bit, any other character is a one bit.
        bits: The compressed payload: a bitarray or any iterable of
            truthy/falsy bits.

    Returns:
        The decoded string.
    """
    # Reverse table (code -> symbol) for constant-time lookups.  setdefault
    # keeps the first symbol if two symbols share a code.
    lookup = {}
    longest = 0
    for symbol, code in encodeList.items():
        key = "".join("0" if ch == "0" else "1" for ch in code)
        lookup.setdefault(key, symbol)
        longest = max(longest, len(key))

    decoded = []
    pending = ""
    for bit in bits:
        pending += "1" if bit else "0"
        if pending in lookup:
            decoded.append(lookup[pending])
            pending = ""
        elif len(pending) > longest:
            # Longer than every code word: nothing can match any more.
            break
    return "".join(decoded)
# Decompress "ziped.hfm" and write the recovered text to "decode.hfm".
loaded = readFile("ziped.hfm")
if loaded is None:
    # readFile has already reported the problem; stop here instead of failing
    # with an unrelated TypeError while unpacking None.
    raise SystemExit(1)
encodeListStr, zipedBits = loaded
encodeList = json.loads(encodeListStr)
decodeStr = decode(encodeList, zipedBits)
saveStr(decodeStr)
主要問題
- 不滿一個字節的填充問題。有一種場景是:當不夠一個字節時填充了 `00`,但恰好 `00` 是某個字符的哈夫曼編碼,此時不知如何進行處理。
- 哈夫曼壓縮算法好像只能對字符進行壓縮,這裏限制在 `[a-z]`。其實也可以不限制,只不過要多做一些處理,比如編碼字符範圍包括 `{`、`}` 時,要進行轉義操作之類的。
- 本人不善於算法,所以解壓時進行匹配時只能每次都去匹配一下,不知道有沒有其他算法可以改善這個情況?希望有大佬能指點指點。

至於第一個問題,如果有人能告訴我如何解決那就更好了。