中文文本預處理:停用詞過濾、空格分隔、按行輸出

# Configure timestamped INFO-level logging so library progress messages
# (e.g. jieba's dictionary loading) are visible during preprocessing.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 停用詞文檔
stopwords_path = "G:/1研究生/news_stopwords.txt"
"""創建停用詞列表"""
def stopwordslist():
    stopwords = [line.strip() for line in open(stopwords_path,encoding='UTF-8').readlines()]
    return stopwords

# Segment a Chinese sentence with jieba and drop stop words.
def seg_depart(sentence):
    """Segment *sentence* and filter out stop words and 1-char tokens.

    Args:
        sentence: raw Chinese text (leading/trailing whitespace ignored).

    Returns:
        str: the surviving tokens joined by single spaces. Matches the
        original contract exactly: a trailing space is kept when at
        least one token survives, and "" is returned otherwise.
    """
    # Convert to a set once: membership tests become O(1) instead of
    # scanning the whole stop-word list for every token.
    # NOTE(review): still re-reads the stop-word file on every call,
    # as the original did — consider caching at module level.
    stopwords = set(stopwordslist())
    kept = [word for word in jieba.cut(sentence.strip())
            if word not in stopwords and len(word) > 1]
    # str.join replaces the original quadratic `outstr += word + " "`.
    return " ".join(kept) + " " if kept else ""


"""如果文檔還沒分詞,就進行分詞"""
count=0
if not os.path.exists(outfilename):
    inputs = open(filename, 'r', encoding='UTF-8')
    outputs = open(outfilename, 'w', encoding='UTF-8')

    # 把非漢字的字符全部去掉
    # 將輸出結果寫入ouputs.txt中
    for line in inputs:
        line = line.split('\t')[1]
        line = re.sub(r'[^\u4e00-\u9fa5]+','',line)
        line_seg = seg_depart(line.strip())
        outputs.write(line_seg.strip() + '\n')
        count+=1
        if(count%200==0):
            print(count)
    
    outputs.close()
    inputs.close()
    print("刪除停用詞和分詞成功!!!")


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章