七類情感分析
情感字典標註數據集
通過之前word2vec查找七類情感的相近詞,得到一個情感詞典,由於我們需要對其進行詩句評分,來判斷整體的情感表現。
故而,我們重新收集了相關的情感詞典,並收集了對應的權重(相似程度值),以此來判斷整首詩的情感。
如下表所示:
根據我們所得到的情感字典,來對唐詩48330首詩詞進行情感標註
import pandas as pd
def emotion():
data=pd.read_excel('new_emotion.xlsx')
similar=data.get('similar')
value = data.get('val')
sad_list=str(similar[0]).split(',')
fear_list=str(similar[1]).split(',')
happy_list=str(similar[2]).split(',')
anger_list=str(similar[3]).split(',')
think_list=str(similar[4]).split(',')
like_list=str(similar[5]).split(',')
worry_list=str(similar[6]).split(',')
sad_val_list = str(value[0]).split(',')
fear_val_list = str(value[1]).split(',')
happy_val_list = str(value[2]).split(',')
anger_val_list = str(value[3]).split(',')
think_val_list = str(value[4]).split(',')
like_val_list = str(value[5]).split(',')
worry_val_list = str(value[6]).split(',')
return sad_list,fear_list,happy_list,anger_list,think_list,like_list,worry_list,sad_val_list,fear_val_list,happy_val_list,anger_val_list,think_val_list,like_val_list,worry_val_list
def test_sentence(sentence):
sad_list, fear_list, happy_list, anger_list, think_list, like_list, worry_list,sad_val_list,fear_val_list,happy_val_list,anger_val_list,think_val_list,like_val_list,worry_val_list=emotion()
sad=fear=happy=anger=think=like=worry=0
for k in sentence:
if k in sad_list:
sad+=float(sad_val_list[sad_list.index(k)])
elif k in fear_list:
fear+=float(fear_val_list[fear_list.index(k)])
elif k in happy_list:
happy+=float(happy_val_list[happy_list.index(k)])
elif k in anger_list:
anger+=float(anger_val_list[anger_list.index(k)])
elif k in think_list:
think+=float(think_val_list[think_list.index(k)])
elif k in like_list:
like+=float(like_val_list[like_list.index(k)])
elif k in worry_list:
worry+=float(worry_val_list[worry_list.index(k)])
ans=max(sad,fear,happy,anger,think,like,worry)
scord_list=[]
scord_list.append(sad)
scord_list.append(fear)
scord_list.append(happy)
scord_list.append(anger)
scord_list.append(think)
scord_list.append(like)
scord_list.append(worry)
emotion_list=['悲','懼','樂','怒','思','喜','憂']
i=0
for i in range(len(scord_list)):
if scord_list[i]==ans:
#print(emotion_list[i])
break
if ans!=0:
return emotion_list[i],ans
else:
return '無',0
def read():
data=pd.read_excel('tang.xlsx')
content_list=data.get('content')
ans_emotion_content=[]
print(len(content_list))
for i in range(len(content_list)):
print('第'+str(i)+'個')
content=content_list[i].replace('\n','')
#print(content)
ans_content=[]
content_l=str(content).split('。')
for k in content_l:
kk=str(k).split(',')
for it in kk:
if it!='':
ans_content.append(it)
ans_emotion = {}
for sentence in ans_content:
#print(sentence)
emot,score=test_sentence(sentence)
if emot!='無':
#print(emot+str(score))
if emot not in ans_emotion.keys():
ans_emotion[emot] = score
else:
ans_emotion[emot]+=score
#print(sorted(ans_emotion.items(), key=lambda item: item[1], reverse=True))
if len(ans_emotion.items())==0:
#print('整篇文章情感:無')
ans_emotion_content.append('無')
else:
ans_emotion=dict(sorted(ans_emotion.items(), key=lambda item: item[1], reverse=True))
for key,value in ans_emotion.items():
#print('整篇文章情感:'+key)
ans_emotion_content.append(key)
break
import xlwt
xl = xlwt.Workbook()
# 調用對象的add_sheet方法
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, "sentence")
sheet1.write(0, 1, 'label')
print(len(content_list))
for i in range(0, len(content_list)):
sheet1.write(i + 1, 0, content_list[i].replace('\n',''))
sheet1.write(i + 1, 1, ans_emotion_content[i])
xl.save("train.xlsx")
if __name__ == '__main__':
read()
劃分訓練集與測試集
將源數據進行劃分,劃分出對應的訓練集和測試集
首先統計下該源數據中各類情感詩詞各有多少個:
test爲對應的測試集個數
# 樂 9440 test:1000
# 悲 13784 test:1000
# 憂 3977 test:300
# 思 10550 test:1000
# 喜 6578 test:500
# 怒 2158 test:200
# 懼 493 test:100
劃分訓練集與測試集coding:
import pandas as pd
# 樂 9440 test:1000
# 悲 13784 test:1000
# 憂 3977 test:300
# 思 10550 test:1000
# 喜 6578 test:500
# 怒 2158 test:200
# 懼 493 test:100
def split_data():
data=pd.read_excel('train.xlsx')
sentence_list=data.get('sentence')
label_list=data.get('label')
emotion = ['悲', '懼', '樂', '怒', '思', '喜', '憂']
ans={}
train_senten=[]
train_label=[]
test_senten=[]
test_label=[]
k1=k2=k3=k4=k5=k6=k7=0
for i in range(len(sentence_list)):
lab=label_list[i]
sente=sentence_list[i]
if lab in emotion:
if lab == '悲':
if k1<=1000:
k1+=1
test_senten.append(sente)
test_label.append(0)
else:
train_label.append(0)
train_senten.append(sente)
elif lab == '懼':
if k2<=100:
k2+=1
test_senten.append(sente)
test_label.append(1)
else:
train_label.append(1)
train_senten.append(sente)
elif lab == '樂':
if k3<=1000:
k3+=1
test_senten.append(sente)
test_label.append(2)
else:
train_label.append(2)
train_senten.append(sente)
elif lab == '怒':
if k4<=200:
k4+=1
test_senten.append(sente)
test_label.append(3)
else:
train_label.append(3)
train_senten.append(sente)
elif lab == '思':
if k5<=1000:
k5+=1
test_senten.append(sente)
test_label.append(4)
else:
train_label.append(4)
train_senten.append(sente)
elif lab == '喜':
if k6<=500:
k6+=1
test_senten.append(sente)
test_label.append(5)
else:
train_label.append(5)
train_senten.append(sente)
elif lab == '憂':
if k7<=300:
k7+=1
test_senten.append(sente)
test_label.append(6)
else:
train_label.append(6)
train_senten.append(sente)
import xlwt
xl = xlwt.Workbook()
# 調用對象的add_sheet方法
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, "sentence")
sheet1.write(0, 1, 'label')
for i in range(0, len(test_senten)):
sheet1.write(i + 1, 0, test_senten[i])
sheet1.write(i + 1, 1, test_label[i])
xl.save("data/test.xlsx")
if __name__ == '__main__':
split_data()
結果爲:
構建詞典
主要是詩詞的分割:我們採用逐個詞語分割,去除中文標點符號
#詩詞分割
def tokenlize(sentence):
"""
進行文本分詞
:param sentence: str
:return: [str,str,str]
"""
fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
'\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”',
'“', ]
sentence = re.sub("|".join(fileters), "", sentence)
punctuation_str = punctuation
for i in punctuation_str:
sentence = sentence.replace(i, '')
sentence=' '.join(sentence)
result = [i for i in sentence.split(" ") if len(i) > 0]
return result
建立詞典
# -*-coding:utf-8-*-
import pickle
from tqdm import tqdm
from 情感分析.詩詞情感分析 import dataset
# from 情感分析.imdb_sentiment.vocab import Vocab
from torch.utils.data import DataLoader
class Vocab:
UNK_TAG = "<UNK>" # 表示未知字符
PAD_TAG = "<PAD>" # 填充符
PAD = 0
UNK = 1
def __init__(self):
self.dict = { # 保存詞語和對應的數字
self.UNK_TAG: self.UNK,
self.PAD_TAG: self.PAD
}
self.count = {} # 統計詞頻的
def fit(self, sentence):
"""
接受句子,統計詞頻
:param sentence:[str,str,str]
:return:None
"""
for word in sentence:
self.count[word] = self.count.get(word, 0) + 1 # 所有的句子fit之後,self.count就有了所有詞語的詞頻
def build_vocab(self, min_count=1, max_count=None, max_features=None):
"""
根據條件構造 詞典
:param min_count:最小詞頻
:param max_count: 最大詞頻
:param max_features: 最大詞語數
:return:
"""
if min_count is not None:
self.count = {word: count for word, count in self.count.items() if count >= min_count}
if max_count is not None:
self.count = {word: count for word, count in self.count.items() if count <= max_count}
if max_features is not None:
# [(k,v),(k,v)....] --->{k:v,k:v}
self.count = dict(sorted(self.count.items(), lambda x: x[-1], reverse=True)[:max_features])
for word in self.count:
self.dict[word] = len(self.dict) # 每次word對應一個數字
# 把dict進行翻轉
self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))
def transform(self, sentence, max_len=None):
"""
把句子轉化爲數字序列
:param sentence:[str,str,str]
:return: [int,int,int]
"""
if len(sentence) > max_len:
sentence = sentence[:max_len]
else:
sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence)) # 填充PAD
return [self.dict.get(i, 1) for i in sentence]
def inverse_transform(self, incides):
"""
把數字序列轉化爲字符
:param incides: [int,int,int]
:return: [str,str,str]
"""
return [self.inverse_dict.get(i, "<UNK>") for i in incides]
def __len__(self):
return len(self.dict)
def collate_fn(batch):
"""
對batch數據進行處理
:param batch: [一個getitem的結果,getitem的結果,getitem的結果]
:return: 元組
"""
reviews, labels = zip(*batch)
return reviews, labels
def get_dataloader(train=True):
imdb_dataset = dataset.ImdbDataset(train)
my_dataloader = DataLoader(imdb_dataset, batch_size=200, shuffle=True, collate_fn=collate_fn)
return my_dataloader
if __name__ == '__main__':
ws = Vocab()
dl_train = get_dataloader(True)
dl_test = get_dataloader(False)
for reviews, label in tqdm(dl_train, total=len(dl_train)):
for sentence in reviews:
ws.fit(sentence)
for reviews, label in tqdm(dl_test, total=len(dl_test)):
for sentence in reviews:
ws.fit(sentence)
ws.build_vocab()
print(len(ws))
pickle.dump(ws, open("./models/vocab.pkl", "wb"))
BiLSTM模型訓練
模型修改了對應的train_batchsize=128,num_layer=6,epoch=30,acc達到了:77.19%
# -*-coding:utf-8-*-
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
from 情感分析.詩詞情感分析 import dataset
from 情感分析.詩詞情感分析.vocab import Vocab
train_batch_size = 128
test_batch_size = 128
voc_model = pickle.load(open("./models/vocab.pkl", "rb"))
sequence_max_len = 100
def collate_fn(batch):
"""
對batch數據進行處理
:param batch: [一個getitem的結果,getitem的結果,getitem的結果]
:return: 元組
"""
reviews, labels = zip(*batch)
reviews = torch.LongTensor([voc_model.transform(i, max_len=sequence_max_len) for i in reviews])
labels = torch.LongTensor(labels)
return reviews, labels
def get_dataloader(train=True):
imdb_dataset = dataset.ImdbDataset(train)
batch_size = train_batch_size if train else test_batch_size
return DataLoader(imdb_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
class ImdbModel(nn.Module):
def __init__(self):
super(ImdbModel, self).__init__()
self.embedding = nn.Embedding(num_embeddings=len(voc_model), embedding_dim=200, padding_idx=voc_model.PAD).to()
self.lstm = nn.LSTM(input_size=200, hidden_size=64, num_layers=6, batch_first=True, bidirectional=True,
dropout=0.1)
self.fc1 = nn.Linear(64 * 2, 64)
self.fc2 = nn.Linear(64, 7)
def forward(self, input):
"""
:param input:[batch_size,max_len]
:return:
"""
input_embeded = self.embedding(input) # input embeded :[batch_size,max_len,200]
output, (h_n, c_n) = self.lstm(input_embeded) # h_n :[4,batch_size,hidden_size]
# out :[batch_size,hidden_size*2]
out = torch.cat([h_n[-1, :, :], h_n[-2, :, :]], dim=-1) # 拼接正向最後一個輸出和反向最後一個輸出
# 進行全連接
out_fc1 = self.fc1(out)
# 進行relu
out_fc1_relu = F.relu(out_fc1)
# 全連接
out_fc2 = self.fc2(out_fc1_relu) # out :[batch_size,2]
return F.log_softmax(out_fc2, dim=-1)
def device():
if torch.cuda.is_available():
return torch.device('cuda')
else:
return torch.device('cpu')
def train(imdb_model, epoch):
"""
:param imdb_model:
:param epoch:
:return:
"""
train_dataloader = get_dataloader(train=True)
optimizer = Adam(imdb_model.parameters())
for i in range(epoch):
bar = tqdm(train_dataloader, total=len(train_dataloader))
for idx, (data, target) in enumerate(bar):
optimizer.zero_grad()
data = data.to(device())
target = target.to(device())
output = imdb_model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
bar.set_description("epcoh:{} idx:{} loss:{:.6f}".format(i, idx, loss.item()))
torch.save(imdb_model, 'lstm_model.pkl')
def test(imdb_model):
"""
驗證模型
:param imdb_model:
:return:
"""
test_loss = 0
correct = 0
imdb_model.eval()
test_dataloader = get_dataloader(train=False)
with torch.no_grad():
for data, target in tqdm(test_dataloader):
data = data.to(device())
target = target.to(device())
output = imdb_model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item()
pred = output.data.max(1, keepdim=True)[1] # 獲取最大值的位置,[batch_size,1]
correct += pred.eq(target.data.view_as(pred)).sum()
test_loss /= len(test_dataloader.dataset)
print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
test_loss, correct, len(test_dataloader.dataset),
100. * correct / len(test_dataloader.dataset)))
def xlftest():
import numpy as np
model = torch.load('lstm_model.pkl')
model.to(device())
from 情感分析.詩詞情感分析.dataset import tokenlize
#樂,悲,憂,思,喜,怒,懼
lines=['獨在異鄉爲異客,每逢佳節倍思親。遙知兄弟登高處,遍插茱萸少一人。 ',
'昔日齷齪不足誇,今朝放蕩思無涯。春風得意馬蹄疾,一日看盡長安花。',
'鋤禾日當午,汗滴禾下土。誰知盤中餐,粒粒皆辛苦?',
'少小離家老大回,鄉音無改鬢毛衰。兒童相見不相識,笑問客從何處來。',
'故人具雞黍,邀我至田家。綠樹村邊合,青山郭外斜。開軒面場圃,把酒話桑麻。待到重陽日,還來就菊花。',
'怒髮衝冠,憑欄處'
]
for line in lines:
print(line)
review = tokenlize(line)
# review=tokenlize(line)
vocab_model = pickle.load(open("./models/vocab.pkl", "rb"))
result = vocab_model.transform(review,sequence_max_len)
# print(result)
data = torch.LongTensor(result).to(device())
data=torch.reshape(data,(1,sequence_max_len)).to(device())
# print(data.shape)
output = model(data)
#print(output.data)
pred = output.data.max(1, keepdim=True)[1] # 獲取最大值的位置,[batch_size,1]
#print(pred.item())
#['悲', '懼', '樂', '怒', '思', '喜', '憂']
if pred.item() == 0:
print("悲")
elif pred.item() == 1:
print("懼")
elif pred.item() == 2:
print("樂")
elif pred.item() == 3:
print("怒")
elif pred.item() == 4:
print("思")
elif pred.item() == 5:
print("喜")
elif pred.item() == 6:
print("憂")
if __name__ == '__main__':
# imdb_model = ImdbModel().to(device())
# train(imdb_model,30)
# test(imdb_model)
xlftest()
測試效果如下:
都是正確的。
不足之處在於:
由於標註源數據是採用情緒字典,會有一部分數據標註不正確,因而訓練的模型也會出現問題。但相對於人工標註已經好太多了