LTP分詞與詞性標註(使用用戶詞典)

#coding:utf-8

from pyltp import Segmentor
from pyltp import Postagger

def read_and_seg_pos(file_dir):
    """Segment and POS-tag every line of a corpus file using LTP.

    Two files are written next to the input:

    * ``file_dir + "_seg"`` -- one line per input line, words separated by
      a single space.
    * ``file_dir + "_pos"`` -- one line per input line, as a sequence of
      ``word tag `` pairs (each pair followed by a space).

    Args:
        file_dir: Path of the input corpus file (one text per line).

    Raises:
        IOError/OSError: If the input cannot be read or an output file
            cannot be created.
    """
    segmentor = Segmentor()
    postagger = Postagger()
    try:
        # The user lexicon is a plain-text file: first column is the word,
        # second column is its part-of-speech tag.
        segmentor.load_with_lexicon("模型地址/cws.model","用戶詞典/fulluserdict")
        postagger.load_with_lexicon("模型地址/pos.model","用戶詞典/fulluserdict")

        # NOTE(review): files are opened with the platform default encoding,
        # exactly as before; pyltp is documented to expect UTF-8 text, so
        # confirm the corpus/locale match before running on a new machine.
        with open(file_dir, "r") as file_read, \
                open(file_dir + "_seg", "w") as file_write_seg, \
                open(file_dir + "_pos", "w") as file_write_pos:
            # Stream the corpus line by line instead of readlines(), so a
            # large corpus is never held in memory all at once.
            for text in file_read:
                words = segmentor.segment(text)
                file_write_seg.write(" ".join(words) + "\n")

                # POS tagging operates on the segmented words, so it must
                # come after segmentation.
                postags = postagger.postag(words)

                # Emit the whole line in one write. No in-band '$' sentinel
                # is used: a real '$' token in the text would otherwise be
                # dropped and split the output line. Iterating zip() directly
                # also works on Python 3, where zip() has no append().
                pairs = "".join(
                    word + " " + pos + " " for word, pos in zip(words, postags)
                )
                file_write_pos.write(pairs + "\n")
    finally:
        # Always free the native LTP models, even if loading or I/O failed.
        segmentor.release()
        postagger.release()

# Script entry: process the corpus file in the current working directory.
read_and_seg_pos(file_dir="./corpus")
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章