# 文本分詞的介紹網上已經很全面了,這裏主要介紹一種文本分詞、去停用詞的具體實現,停用詞表是對百度、哈工大等常見停用詞表融合後去重
import csv
import datetime
import re

import jieba
import jieba.posseg as psg
import numpy as np
import pandas as pd
# Stop-word file loader.
def stopwordslist(filepath):
    """Load a stop-word list from *filepath* (one word per line, UTF-8).

    Parameters
    ----------
    filepath : str
        Path to the stop-word file.

    Returns
    -------
    list of str
        De-duplicated stop words (order not guaranteed, as in the original).
    """
    # Context manager closes the handle even on error; the original
    # opened the file and never closed it.
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return list(set(stopwords))
# Cache of loaded stop-word sets, keyed by file path, so the file is read
# once per process instead of once per sentence.
_STOPWORDS_CACHE = {}

# Segment a sentence and drop stop words.
def seg_sentence(sentence,
                 stopwords_path=r"C:\Users\lenovo\Desktop\fin_data\NLP_code\wordtoremove.txt"):
    """Segment *sentence* with jieba and remove stop words.

    Parameters
    ----------
    sentence : str
        Raw text to segment.
    stopwords_path : str, optional
        Stop-word file path; defaults to the path the original hard-coded,
        so existing callers are unaffected.

    Returns
    -------
    str
        Space-separated tokens, with a trailing space when any token is
        kept (matches the original output format exactly).
    """
    # Load and cache the stop words; the original re-read the file on
    # every call, which dominated runtime over a large DataFrame.
    if stopwords_path not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[stopwords_path] = set(stopwordslist(stopwords_path))
    stopwords = _STOPWORDS_CACHE[stopwords_path]
    # Keep tokens that are neither stop words nor a literal tab
    # (same two filters as the original).
    tokens = [word for word in jieba.cut(sentence.strip())
              if word not in stopwords and word != '\t']
    # str.join avoids quadratic += concatenation; the conditional suffix
    # reproduces the original trailing space.
    return " ".join(tokens) + (" " if tokens else "")
# Segment a sentence, keeping only nouns and verbs.
def seg_sentence_return_vn(sentence):
    """POS-tag *sentence* with jieba.posseg and keep only tokens whose
    flag is exactly 'n' (noun) or 'v' (verb).

    Parameters
    ----------
    sentence : str
        Raw text to segment.

    Returns
    -------
    str
        Space-separated kept tokens, with a trailing space when any token
        is kept (matches seg_sentence's output format).
    """
    # BUGFIX: `psg` was referenced here but never imported anywhere in the
    # original file (NameError at first call); `import jieba.posseg as psg`
    # is now in the import block.
    # NOTE(review): exact-match on 'n'/'v' excludes subtypes such as 'ns',
    # 'nr', 'vn' — preserved from the original; confirm this is intended.
    kept = [pair.word for pair in psg.cut(sentence.strip())
            if pair.flag in ('n', 'v')]
    # str.join avoids quadratic += concatenation.
    return " ".join(kept) + (" " if kept else "")
if __name__ == '__main__':
    filename = 'zhihu_data_setiment.csv'
    # utf-8-sig transparently strips a BOM if the CSV came from Excel.
    csv_data = pd.read_csv(filename, header=0, index_col=False,
                           engine='python', encoding='utf-8-sig')
    # Load the custom dictionary before segmenting so user-defined words
    # are honored by jieba.
    file_userdict = 'personal_dic.txt'
    jieba.load_userdict(file_userdict)
    for i in range(csv_data.shape[0]):
        # Column 14 holds the raw answer text; columns 20 and 21 receive
        # the segmented output (all words / nouns-and-verbs only).
        # NOTE(review): positional column indices are assumed to match the
        # input file's schema — verify against the CSV header.
        answer = str(csv_data.iat[i, 14])
        # .iat is the idiomatic (and faster) accessor for scalar
        # get/set by position; behavior is identical to .iloc[i, j].
        csv_data.iat[i, 20] = seg_sentence(answer)
        csv_data.iat[i, 21] = seg_sentence_return_vn(answer)
        # Progress indicator: print the row index every 500 rows.
        if i % 500 == 0:
            print(i, end=' ')
    # Write the enriched table back out.
    csv_data.to_csv('zhihu_data_jieba_seg.csv', header=True, index=False,
                    encoding='utf-8-sig')