# 顯示處理流程
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# 停用詞文檔
stopwords_path = "G:/1研究生/news_stopwords.txt"
"""創建停用詞列表"""
def stopwordslist():
stopwords = [line.strip() for line in open(stopwords_path,encoding='UTF-8').readlines()]
return stopwords
# Chinese word segmentation with stopword filtering.
def seg_depart(sentence):
    """Segment *sentence* with jieba and drop stopwords.

    Args:
        sentence (str): raw text; surrounding whitespace is ignored.

    Returns:
        str: surviving tokens joined by single spaces, with one trailing
        space when any token survives (matches the original output format;
        the caller strips it anyway).
    """
    # Set membership is O(1) vs. the original O(n) scan of a list
    # for every token.
    stopwords = set(stopwordslist())
    kept = [
        word
        for word in jieba.cut(sentence.strip())
        # Single-character tokens are discarded along with stopwords.
        if word not in stopwords and len(word) > 1
    ]
    # join() replaces the original quadratic `outstr += word + " "` loop.
    return (' '.join(kept) + ' ') if kept else ''
"""如果文檔還沒分詞,就進行分詞"""
count=0
if not os.path.exists(outfilename):
inputs = open(filename, 'r', encoding='UTF-8')
outputs = open(outfilename, 'w', encoding='UTF-8')
# 把非漢字的字符全部去掉
# 將輸出結果寫入ouputs.txt中
for line in inputs:
line = line.split('\t')[1]
line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
line_seg = seg_depart(line.strip())
outputs.write(line_seg.strip() + '\n')
count+=1
if(count%200==0):
print(count)
outputs.close()
inputs.close()
print("刪除停用詞和分詞成功!!!")
# Source article: "代碼!以備不時之需!中文文本預處理(停用詞、空格分隔、按行分類)"
# (Scraped blog comment-section boilerplate removed — it was page chrome,
# not part of the program, and made the file unparseable.)