#coding:utf-8
from pyltp import Segmentor
from pyltp import Postagger
def read_and_seg_pos(file_dir,
                     cws_model="模型地址/cws.model",
                     pos_model="模型地址/pos.model",
                     lexicon="用戶詞典/fulluserdict"):
    """Segment and POS-tag every line of a text file with LTP.

    Two files are written next to the input:

    * ``file_dir + "_seg"``: the words of each input line joined by single
      spaces, one input line per output line.
    * ``file_dir + "_pos"``: ``word tag `` pairs (each pair followed by a
      space), one input line per output line.

    Args:
        file_dir: Path of the text file to process.
        cws_model: Path of the word-segmentation model.
        pos_model: Path of the POS-tagging model.
        lexicon: Path of the user lexicon passed to both models. It is a
            plain-text file whose first column is the word and whose second
            column is the POS tag.
    """
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cws_model, lexicon)
    try:
        postagger = Postagger()
        postagger.load_with_lexicon(pos_model, lexicon)
        try:
            _segment_and_tag_file(segmentor, postagger, file_dir)
        finally:
            # Release the model even when processing fails half-way.
            postagger.release()
    finally:
        segmentor.release()


def _segment_and_tag_file(segmentor, postagger, file_dir):
    """Stream file_dir line by line and write its _seg and _pos companions."""
    # NOTE(review): no explicit encoding is given, so on Python 3 the
    # platform default is used -- confirm it matches the corpus encoding.
    with open(file_dir, "r") as file_read, \
            open(file_dir + "_seg", "w") as file_write_seg, \
            open(file_dir + "_pos", "w") as file_write_pos:
        # Iterating the file object reads one line at a time, so a large
        # corpus is never held in memory as a whole.
        for text in file_read:
            words = segmentor.segment(text)
            file_write_seg.write(" ".join(words) + "\n")
            # Tagging works on the segmented words, so it must come second.
            postags = postagger.postag(words)
            # zip() is consumed directly: on Python 3 it is an iterator and
            # cannot be appended to, so no end-of-sentence sentinel is used.
            pos_line = "".join(
                word + " " + pos + " " for word, pos in zip(words, postags)
            )
            file_write_pos.write(pos_line + "\n")
# Script entry: process the corpus file in the current directory.
read_and_seg_pos(file_dir="./corpus")
# LTP word segmentation and POS tagging (using a user lexicon)
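# (Web-page residue, not part of the script:)
# Post a comment
# All comments
# No comments yet. Want to be the first? Enter your comment in the box above and click "Publish".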