基於情感詞典的文本情感分析

原創

StringKai

2019-01-22 00:36

原代碼來源：https://blog.csdn.net/lom9357bye/article/details/79058946

本文修復了原代碼中的幾個 bug，用到的詞典可由以上鏈接下載。

import codecs
from collections import defaultdict
import jieba
import xlrd

# Tokenize a sentence and remove stop words.
def seg_word(sentence):
    """Segment ``sentence`` with jieba and drop stop words.

    Stop words are read from ``stopwords.txt`` (decoded as UTF-8, one entry
    per line, surrounding whitespace stripped) relative to the current
    working directory. The file is re-read on every call.

    Args:
        sentence: Text to segment; passed straight to ``jieba.cut``.

    Returns:
        A list of the tokens produced by ``jieba.cut``, in order, excluding
        every token that appears in the stop-word file.
    """
    # The context manager closes the file even if reading it raises.
    with codecs.open('stopwords.txt', 'r', 'utf-8') as fr:
        stopwords = {line.strip() for line in fr}
    # Set membership keeps the filter O(1) per token.
    return [w for w in jieba.cut(sentence) if w not in stopwords]

# Classify segmented words into sentiment words, negation words and degree adverbs.
# In every returned dict the key is the word itself and the value is its weight.
def classify_words(word_list):
    """Split ``word_list`` into sentiment words, negation words and degree adverbs.

    Three lexicon files are read (UTF-8) from the current working directory:

    * ``BosonNLP_sentiment_score.txt`` -- lines of ``<word> <score>`` separated
      by a single space; lines that do not split into exactly two fields are
      ignored.
    * ``notDic.txt`` -- one negation word per line.
    * ``degree.txt`` -- lines of ``<word>,<weight>``; lines without a comma
      (for example blank lines) are ignored.

    A word found in several lexicons is classified with this priority:
    degree adverb, then negation word, then sentiment word.

    Args:
        word_list: Iterable of tokens to classify.

    Returns:
        A tuple ``(sen_word, not_word, degree_word)`` of dicts keyed by word.
        ``sen_word`` and ``degree_word`` values are the weight strings exactly
        as they appear in the lexicon files; ``not_word`` values are ``-1``.
    """
    # Files are opened read-only ('r' rather than 'r+') so that write
    # permission is not required, and inside ``with`` so they are closed even
    # when parsing fails.
    sen_dict = {}
    with open('BosonNLP_sentiment_score.txt', 'r', encoding='utf-8') as sen_file:
        for line in sen_file.read().splitlines():
            parts = line.split(' ')
            # Index 0 is the sentiment word, index 1 its score.
            if len(parts) == 2:
                sen_dict[parts[0]] = parts[1]

    # Negation words carry no score; a set gives O(1) membership tests.
    with open('notDic.txt', 'r', encoding='utf-8') as not_word_file:
        not_words = set(not_word_file.read().splitlines())

    degree_dic = {}
    with open('degree.txt', 'r', encoding='utf-8') as degree_file:
        for line in degree_file.read().splitlines():
            parts = line.split(',')
            # Skip blank/malformed lines instead of raising IndexError.
            if len(parts) >= 2:
                degree_dic[parts[0]] = parts[1]

    sen_word = {}
    not_word = {}
    degree_word = {}
    for word in word_list:
        if word in degree_dic:
            degree_word[word] = degree_dic[word]
        elif word in not_words:
            not_word[word] = -1
        elif word in sen_dict:
            sen_word[word] = sen_dict[word]
    return sen_word, not_word, degree_word

# Score each sentiment word and sum the results.
def score_sentiment(sen_word, not_word, degreen_word, seg_result):
    """Compute the total sentiment score of a segmented sentence.

    The tokens are scanned left to right while a running weight (initially 1)
    is maintained:

    * a degree adverb multiplies the weight by its value,
    * a negation word flips the sign of the weight,
    * a sentiment word adds ``weight * score`` to the total and resets the
      weight to 1.

    Tokens found in none of the three dicts are skipped. When a token is in
    more than one dict, the degree adverb takes precedence, then negation.

    Args:
        sen_word: Dict mapping sentiment words to scores (``float``-convertible).
        not_word: Dict whose keys are negation words.
        degreen_word: Dict mapping degree adverbs to weights
            (``float``-convertible).
        seg_result: Sequence of tokens in sentence order.

    Returns:
        The summed score; ``0`` when no sentiment word is present.
    """
    weight = 1
    score = 0
    for word in seg_result:
        if word in degreen_word:
            weight *= float(degreen_word[word])
        elif word in not_word:
            weight *= -1
        elif word in sen_word:
            score += float(weight) * float(sen_word[word])
            # Modifiers apply only to the next sentiment word.
            weight = 1
    return score

# Orchestrate the pipeline: segment, classify, score.
def sentiment_score(sentence):
    """Return the sentiment score of ``sentence``.

    The sentence is segmented with ``seg_word``, the resulting tokens are
    classified with ``classify_words``, and the classification together with
    the token list is passed to ``score_sentiment``.

    Args:
        sentence: Text to analyse.

    Returns:
        The value returned by ``score_sentiment``.
    """
    tokens = seg_word(sentence)
    # classify_words yields (sen_word, not_word, degree_word), which are the
    # first three positional arguments of score_sentiment.
    return score_sentiment(*classify_words(tokens), tokens)

if __name__ == '__main__':
    # Demo run on a sample sentence; print the result so the script shows
    # something instead of silently discarding the score.
    score = sentiment_score('我很開心。')
    print(score)

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

基於情感詞典的文本情感分析

一個Java線程間內存可見性實例的解讀

hello world彙編程序的簡化段定義方式

在Ubuntu服務器上配置一個Django網站的簡單方法

Java 8 Stream的一些用法,持續更新中...

基於Nginx-rtmp+Java的直播拉流鑑權方案

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結