#!F://python
# page coding=utf-8
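# HMM parameters: A (state-transition matrix) holds the probability of moving from one state to another; B (confusion/emission matrix) holds the probability of emitting character x given that the state is y; Pi holds the probability that the initial state is s.
# This script trains an HMM for Chinese word segmentation, producing the A matrix (state transitions), the B matrix (confusion matrix) and the Pi vector (initial probabilities), and writes them to files with pickle, storing characters as UTF-8 encoded strings.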
import pickle
import codecs
# Hidden states of the segmentation HMM: B = first character of a
# multi-character word, M = inner character, E = last character,
# S = single-character word.
state_list = ['B', 'M', 'E', 'S']
state_M = len(state_list)  # number of hidden states (4)
word_N = 0  # not referenced by the functions visible in this file

# Count tables filled by main() and normalised by probs().
A_dic = {}   # state-transition matrix: A_dic[from_state][to_state]
B_dic = {}   # confusion (emission) matrix: B_dic[state][character]
Pi_dic = {}  # initial-state vector: Pi_dic[state]

# Normalisation helpers.
Pi_dic_size = 0.0        # denominator used for every entry of Pi_dic
A_row_count_dic = {}     # per-state row total of A_dic, the denominator for that row
B_dic_element_size = {}  # per-character counter accumulated by main()

# File names.
INPUT_DATA = "RenMinData.txt"  # training corpus
PROB_SATRT = "prob_start.py"   # output: initial probability vector
PROB_EMIT = "prob_emit.py"     # output: confusion matrix
PROB_TRANS = "prob_trans.py"   # output: state-transition matrix
def init():
    """Zero the module-level count tables for every state in ``state_list``.

    After the call:
      * ``A_dic[s][t]`` is ``0.0`` for every pair of states,
      * ``Pi_dic[s]`` is ``0.0``,
      * ``B_dic[s]`` is a fresh empty dict,
      * ``A_row_count_dic[s]`` is ``0``.

    The dictionaries are updated in place, so no ``global`` declaration is
    needed and existing references to them stay valid.
    """
    for current in state_list:
        # One transition row per state, with a zeroed column for each state.
        A_dic[current] = dict.fromkeys(state_list, 0.0)
        A_row_count_dic[current] = 0
        B_dic[current] = {}
        Pi_dic[current] = 0.0
def getList(input_str):
    """Return the list of B/M/E/S tags for one segmented word.

    A word of length 1 is tagged ``['S']``.  Any other length produces
    ``'B'``, then ``len(input_str) - 2`` copies of ``'M'``, then ``'E'``;
    e.g. a two-character word gives ``['B', 'E']`` and a five-character
    word gives ``['B', 'M', 'M', 'M', 'E']``.

    Note that an empty string also yields ``['B', 'E']``, because
    multiplying a list by a negative number gives an empty list.
    """
    if len(input_str) == 1:
        return ['S']
    inner_tags = ['M'] * (len(input_str) - 2)
    return ['B'] + inner_tags + ['E']
def main(train_file_path):
    """Accumulate HMM counts from a pre-segmented corpus, then call probs().

    The corpus is read as UTF-8 text, one sentence per line, with words
    separated by spaces.  For every word the B/M/E/S tags are obtained from
    ``getList`` and the module-level tables are updated:

      * ``Pi_dic[state]`` is incremented for the first tag of every word and
        ``Pi_dic_size`` is incremented once per word, so the two stay
        consistent and Pi normalises to 1.
      * ``B_dic[state][char]`` and ``B_dic_element_size[char]`` are
        incremented for every character; ``char`` is the UTF-8 encoded
        character, and the same encoded key is used for lookup and update.
      * ``A_dic[prev][cur]`` and ``A_row_count_dic[prev]`` are incremented
        for every pair of consecutive tags in the line, including pairs that
        cross a word boundary (E->B, E->S, S->B, S->S).  Transitions are not
        carried over from one line to the next.

    The tag sequence of every line and the final per-character counter are
    printed, and ``probs()`` is called at the end.

    Args:
        train_file_path: path of the training corpus to read.
    """
    global Pi_dic_size  # rebound below; the dict tables are only mutated
    init()
    # "utf-8-sig" removes a leading BOM when the file has one and changes
    # nothing otherwise, so the first real character is never discarded.
    with codecs.open(train_file_path, "rb", "utf-8-sig") as train_file:
        for line in train_file:  # e.g. two three-character words separated by a space
            # Strip the line terminator and drop the empty strings produced
            # by repeated spaces, so neither is counted as a word/character.
            word_list = [w for w in line.rstrip("\r\n").split(" ") if w]
            if not word_list:
                continue
            line_state = [getList(word) for word in word_list]
            print(line_state)
            prev_state = None  # no incoming transition for the first tag of a line
            for word, word_state in zip(word_list, line_state):
                Pi_dic[word_state[0]] += 1
                Pi_dic_size += 1
                for char, state in zip(word, word_state):
                    # Encode once and use the encoded form as the key for
                    # both the membership test and the update.
                    utf_char = char.encode('utf-8')
                    B_dic[state][utf_char] = B_dic[state].get(utf_char, 0.0) + 1
                    B_dic_element_size[utf_char] = (
                        B_dic_element_size.get(utf_char, 0) + 1)
                    if prev_state is not None:
                        A_dic[prev_state][state] += 1
                        A_row_count_dic[prev_state] += 1
                    prev_state = state
    print(B_dic_element_size)
    probs()
def probs():
    """Normalise the accumulated counts in place and pickle the three tables.

    * ``Pi_dic[s]`` is divided by ``Pi_dic_size``.
    * Each row ``A_dic[s]`` is divided by ``A_row_count_dic[s]``.
    * Each row ``B_dic[s]`` is divided by the total count of that state, so
      ``B_dic[s][x]`` is the probability of emitting ``x`` given state ``s``
      and every non-empty row sums to 1.

    Rows (or the Pi vector) whose denominator is zero are left untouched
    instead of raising ``ZeroDivisionError``.

    The results are printed and then written with ``pickle``: Pi to
    ``PROB_SATRT``, B to ``PROB_EMIT`` and A to ``PROB_TRANS``.  Files are
    opened in binary mode, as pickle expects, and only after all
    computations have succeeded.
    """
    print("-------------------以下Pi向量------------------------")
    if Pi_dic_size:
        pi_total = float(Pi_dic_size)
        for key in Pi_dic:
            Pi_dic[key] = Pi_dic[key] / pi_total
    print(Pi_dic)

    print("-------------------以下是狀態轉移矩陣------------------------")
    for key in A_dic:
        row_total = A_row_count_dic[key]
        if row_total != 0:
            for key2 in A_dic[key]:
                A_dic[key][key2] = A_dic[key][key2] / float(row_total)
    print(A_dic)

    print("------------------以下是混淆矩陣-----------------------")
    entries = []
    for key in B_dic:
        # Denominator is the number of characters observed in this state.
        state_total = float(sum(B_dic[key].values()))
        for key1 in B_dic[key]:
            if state_total:
                B_dic[key][key1] = B_dic[key][key1] / state_total
            entries.append("%s --> %s %s" % (key, key1, B_dic[key][key1]))
    if entries:
        # All entries on one line, separated by spaces.
        print('   '.join(entries))

    # Each table goes to the file named after it.
    with open(PROB_SATRT, 'wb') as start_fp:
        pickle.dump(Pi_dic, start_fp)
    with open(PROB_EMIT, 'wb') as emit_fp:
        pickle.dump(B_dic, emit_fp)
    with open(PROB_TRANS, 'wb') as trans_fp:
        pickle.dump(A_dic, trans_fp)
# Script entry: train on the corpus named by the INPUT_DATA constant.
main(INPUT_DATA)
# 訓練中文分詞HMM模型,得到A(狀態轉移矩陣)、B(混淆矩陣)、Pi(初始狀態概率)
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.