訓練中文分詞HMM模型,得到A(狀態轉移矩陣)、B(混淆矩陣)、Pi(初始狀態概率)

#!F://python
# page coding=utf-8
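# A (state transition matrix): probability of moving from one hidden state to another.
# B (confusion / emission matrix): probability of emitting character x given hidden state y.
# Pi: probability that the initial state is s.
# This script trains an HMM for Chinese word segmentation. It builds the A matrix (state transitions),
# the B matrix (emissions) and the Pi vector (initial probabilities), then uses pickle to write them
# to files, with characters stored as their UTF-8 encoding.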
import pickle
import codecs
A_dic = {}    # State transition matrix: A_dic[from_state][to_state]
B_dic = {}    # Confusion (emission) matrix: B_dic[state][utf-8 encoded character]
Pi_dic = {}   # Initial state probability vector: Pi_dic[state]
state_list = ['B', 'M', 'E', 'S']  # Tags assigned by getList(): B/M/E for multi-character words, S for single characters
state_M = 4  # NOTE(review): presumably the number of hidden states; not read or updated in the visible code
word_N = 0  # NOTE(review): not read or updated in the visible code
Pi_dic_size = 0.0  # Denominator used to normalise Pi_dic
A_row_count_dic = {}  # Sum of all columns in row A_dic[key]; the denominator when computing each A_dic[key1][key2]
B_dic_element_size = {}  # Tally kept per output character, keyed by its UTF-8 encoding (filled in main())
PROB_SATRT = "prob_start.py"   # Output file intended for the initial probability vector
INPUT_DATA = "RenMinData.txt"  # Corpus used for training
PROB_EMIT = "prob_emit.py"     # Output file intended for the confusion (emission) matrix
PROB_TRANS = "prob_trans.py"   # Output file intended for the state transition matrix


def init():
    """Reset the module-level count tables to their empty starting values.

    For every tag in ``state_list`` this gives ``A_dic`` a row of 0.0
    entries (one per tag), sets ``Pi_dic`` to 0.0, gives ``B_dic`` an empty
    dict and sets ``A_row_count_dic`` to 0. The dictionaries are updated in
    place; nothing is returned.
    """
    # Transition table: one row per source tag, one 0.0 cell per target tag.
    for src in state_list:
        A_dic[src] = dict.fromkeys(state_list, 0.0)
    # Per-tag tables for the initial vector, emissions and row totals.
    for tag in state_list:
        Pi_dic[tag] = 0.0
        B_dic[tag] = {}
        A_row_count_dic[tag] = 0


def getList(input_str):
    """Return the list of position tags for the characters of one word.

    A one-character word maps to ``['S']``. Any other length maps to
    ``'B'``, then ``len(input_str) - 2`` copies of ``'M'``, then ``'E'``
    (for example a five-character word gives ``B M M M E``).

    Because a negative repeat count produces an empty list, an empty input
    yields ``['B', 'E']``, exactly like a two-character word.
    """
    char_count = len(input_str)
    if char_count == 1:
        return ['S']
    return ['B'] + ['M'] * (char_count - 2) + ['E']


def main(train_file_path):
    """Count HMM statistics from a pre-segmented corpus, then call probs().

    Args:
        train_file_path: Path of a UTF-8 text file (with or without a BOM)
            in which every line is a sentence whose words are separated by
            whitespace.

    Side effects:
        Resets the count tables through init(), then fills the module-level
        ``Pi_dic``, ``Pi_dic_size``, ``A_dic``, ``A_row_count_dic``,
        ``B_dic`` and ``B_dic_element_size``. Emission keys are the UTF-8
        encoding of each character. Prints the tag sequence of every line
        and the final character tally, and finally calls probs().
    """
    init()
    global Pi_dic_size
    # "utf-8-sig" drops a leading BOM when there is one and leaves BOM-less
    # files intact, so no real first character is ever discarded.
    with codecs.open(train_file_path, "rb", "utf-8-sig") as train_file:
        for line in train_file:
            # split() without an argument discards the line terminator and
            # never yields empty words, whatever whitespace separates them.
            word_list = line.split()
            if not word_list:
                continue
            line_state = [getList(word) for word in word_list]
            print(line_state)
            # Transitions are counted over the whole line so that word
            # boundaries (E->B, E->S, S->B, S->S) are represented too.
            prev_state = None
            for word, word_state in zip(word_list, line_state):
                # NOTE(review): the first tag of every word is counted, not
                # only the first tag of the line; confirm this is intended.
                Pi_dic[word_state[0]] += 1
                Pi_dic_size += 1  # one observation per counted tag
                for char, state in zip(word, word_state):
                    # Encode once and use the same key for lookup and
                    # update, so repeated characters accumulate.
                    key = char.encode('utf-8')
                    B_dic[state][key] = B_dic[state].get(key, 0.0) + 1
                    B_dic_element_size[key] = B_dic_element_size.get(key, 0) + 1
                    if prev_state is not None:
                        A_dic[prev_state][state] += 1
                        A_row_count_dic[prev_state] += 1
                    prev_state = state
    print(B_dic_element_size)
    probs()


def probs():
    """Turn the accumulated counts into probabilities and pickle them.

    Normalises the module-level tables in place:

    * ``Pi_dic[state]`` is divided by ``Pi_dic_size``.
    * ``A_dic[state][next_state]`` is divided by ``A_row_count_dic[state]``.
    * ``B_dic[state][char]`` is divided by the total count of that state's
      row, giving the probability of the character given the state.

    Rows or vectors whose total is zero are left untouched instead of
    raising ZeroDivisionError. The three tables are printed and then
    pickled: Pi to "prob_start.py", B to "prob_emit.py" and A to
    "prob_trans.py".
    """
    # Kept local so the function is self-contained; these mirror the
    # module-level constants of the same names.
    PROB_SATRT = "prob_start.py"  # initial probability vector
    PROB_EMIT = "prob_emit.py"    # confusion (emission) matrix
    PROB_TRANS = "prob_trans.py"  # state transition matrix

    print("-------------------以下Pi向量------------------------")
    if Pi_dic_size:
        for key in Pi_dic:
            Pi_dic[key] = Pi_dic[key] / Pi_dic_size
    print(Pi_dic)

    print("-------------------以下是狀態轉移矩陣------------------------")
    for key in A_dic:
        row_total = A_row_count_dic[key]
        if row_total:
            row = A_dic[key]
            for key2 in row:
                row[key2] = row[key2] / row_total
    print(A_dic)

    print("------------------以下是混淆矩陣-----------------------")
    for key in B_dic:
        row = B_dic[key]
        # The denominator is the number of characters seen in this state.
        state_total = float(sum(row.values()))
        if state_total:
            for key1 in row:
                row[key1] = row[key1] / state_total
    entries = []
    for item in B_dic:
        for key in B_dic[item]:
            entries.append("%s --> %s %s    " % (item, key, B_dic[item][key]))
    # One print call keeps every entry on a single line.
    print(" ".join(entries))

    # Pickle streams are bytes, so the files are opened in binary mode; each
    # table goes to the file named after it.
    with open(PROB_SATRT, 'wb') as start_fp:
        pickle.dump(Pi_dic, start_fp)
    with open(PROB_EMIT, 'wb') as emit_fp:
        pickle.dump(B_dic, emit_fp)
    with open(PROB_TRANS, 'wb') as trans_fp:
        pickle.dump(A_dic, trans_fp)
# Script entry point: there is no __main__ guard, so this call runs whenever
# the module is executed or imported, passing the corpus file name to main().
main("RenMinData.txt")
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章