Python_文本分析_分詞

文本分詞的介紹網上已經很全面了,這裏主要介紹一種文本分詞、去停用詞的具體實現,停用詞表是對百度、哈工大等常見停用詞表融合後去重

# Standard library
import csv
import datetime
import re

# Third-party
import jieba
import jieba.posseg as psg
import numpy as np
import pandas as pd

# 停用詞路徑
# Stop-word file loader
def stopwordslist(filepath):
    """Load a stop-word list from a text file (one word per line).

    Args:
        filepath: path to a UTF-8 encoded stop-word file.

    Returns:
        A de-duplicated list of stop words (order not guaranteed,
        matching the original ``list(set(...))`` behavior).
    """
    # Context manager guarantees the handle is closed — the original
    # opened the file and never closed it.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Set comprehension de-duplicates while streaming the file
        # line by line instead of materializing readlines() first.
        return list({line.strip() for line in f})


# 分詞去停用詞
# Segment a sentence and remove stop words
def seg_sentence(sentence,
                 stopwords_path=r"C:\Users\lenovo\Desktop\fin_data\NLP_code\wordtoremove.txt"):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: raw text to segment.
        stopwords_path: stop-word file location. Defaults to the path
            the original hard-coded, so existing callers are unaffected.

    Returns:
        The surviving tokens, each followed by a single space (so the
        result ends with a trailing space when any token survives —
        kept for backward compatibility). Empty string otherwise.
    """
    # Cache one stop-word set per path: the original re-read and
    # re-parsed the file on EVERY call, and this function runs once
    # per CSV row in the main loop.
    cache = seg_sentence.__dict__.setdefault('_stopword_cache', {})
    if stopwords_path not in cache:
        # A set gives O(1) membership tests vs O(n) on a list.
        cache[stopwords_path] = set(stopwordslist(stopwords_path))
    stopwords = cache[stopwords_path]

    tokens = jieba.cut(sentence.strip())
    # join() is linear; the original quadratic `outstr += word` loop
    # degrades badly on long answers.  Tab tokens are filtered as before.
    return ''.join(w + ' ' for w in tokens
                   if w not in stopwords and w != '\t')


# 分詞、去停用詞、只保留動 名詞
# Segment a sentence, keeping only nouns and verbs
def seg_sentence_return_vn(sentence):
    """POS-tag *sentence* with jieba.posseg and keep only tokens whose
    flag is exactly ``'n'`` (noun) or ``'v'`` (verb).

    NOTE(review): the original referenced ``psg`` without importing it
    anywhere in the file, so this function raised NameError at call
    time; ``import jieba.posseg as psg`` was added to the imports.
    NOTE(review): the exact-match filter drops sub-tags such as 'ns',
    'nr' or 'vn' — presumably intended, but confirm with the author.

    Args:
        sentence: raw text to segment.

    Returns:
        Kept tokens, each followed by a single space (trailing space
        preserved for consistency with ``seg_sentence``).
    """
    kept = [pair.word for pair in psg.cut(sentence.strip())
            if pair.flag in ('n', 'v')]
    # Linear join instead of repeated string concatenation.
    return ''.join(w + ' ' for w in kept)


if __name__ == '__main__':

    # Input produced by the sentiment step; utf-8-sig strips a possible
    # BOM left by Excel.
    filename = 'zhihu_data_setiment.csv'
    csv_data = pd.read_csv(filename, header=0, index_col=False,
                           engine='python', encoding='utf-8-sig')

    # Teach jieba the domain-specific vocabulary before segmenting.
    file_userdict = 'personal_dic.txt'
    jieba.load_userdict(file_userdict)

    # NOTE(review): column positions are hard-coded — 14 is assumed to
    # hold the answer text, 20/21 receive the segmented outputs.
    # Positional assignment requires columns 20/21 to already exist in
    # the CSV; confirm against the file schema.
    for i in range(csv_data.shape[0]):
        # .iat is the fast scalar accessor; .iloc re-validates its
        # indexers on every call and is much slower in a tight loop.
        answer = str(csv_data.iat[i, 14])
        csv_data.iat[i, 20] = seg_sentence(answer)
        csv_data.iat[i, 21] = seg_sentence_return_vn(answer)

        # Progress indicator: print the row index every 500 rows.
        if i % 500 == 0:
            print(i, end='  ')

    # Write back with utf-8-sig so Excel opens the result correctly.
    csv_data.to_csv("zhihu_data_jieba_seg.csv", header=True, index=False,
                    encoding='utf-8-sig')
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章