一.Python代碼
#!/usr/bin/env python3
# encoding: utf-8
'''
@file: Keras_predict_sentiment.py
@time: 2020/7/5 0005 11:58
@author: Jack
@contact: [email protected]
'''
import string
import re
from os import listdir
from numpy import array
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()
def clean_doc(doc):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace, every character listed in
    ``string.punctuation`` is removed from each piece, and a piece is kept
    only if it is purely alphabetic, is not in the NLTK English stop-word
    list, and is longer than one character. No case folding is applied.

    Args:
        doc: The document text.

    Returns:
        The surviving tokens, in their original order.
    """
    pieces = doc.split()
    # Deleting via a translation table removes the same characters the
    # punctuation character class would match.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for piece in pieces:
        word = piece.translate(drop_punctuation)
        if not word.isalpha():
            continue
        if word in english_stops:
            continue
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single line of in-vocabulary words.

    Args:
        filename: Path of the document to load.
        vocab: Collection of allowed tokens; membership is tested with ``in``.

    Returns:
        The cleaned tokens that appear in ``vocab``, joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    return ' '.join(word for word in cleaned if word in vocab)
def precess_docs(directory, vocab):
    """Convert every file in a directory into a vocabulary-filtered line.

    Args:
        directory: Directory whose entries are each treated as a document.
        vocab: Collection of allowed tokens, passed on to ``doc_to_line``.

    Returns:
        A list with one line per directory entry, in the order
        ``listdir`` yields them.
    """
    return [
        doc_to_line(directory + '/' + entry, vocab)
        for entry in listdir(directory)
    ]
def load_clean_dataset(vocab):
    """Load the negative and positive review folders with their labels.

    Documents are read from ``txt_sentoken/neg`` and ``txt_sentoken/pos``
    (relative to the current working directory).

    Args:
        vocab: Collection of allowed tokens used to filter each document.

    Returns:
        A ``(docs, labels)`` pair: ``docs`` lists the negative lines followed
        by the positive lines, and ``labels`` is an array holding 0 for each
        negative document and 1 for each positive one, in the same order.
    """
    negative_lines = precess_docs('txt_sentoken/neg', vocab)
    positive_lines = precess_docs('txt_sentoken/pos', vocab)
    labels = array([0] * len(negative_lines) + [1] * len(positive_lines))
    return negative_lines + positive_lines, labels
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
def define_model(n_words):
    """Build, compile and summarise the sentiment classifier network.

    The network is a 50-unit ReLU dense layer fed by ``n_words`` inputs,
    followed by a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the Adam optimizer and the accuracy metric, and its
    summary is printed.

    Args:
        n_words: Length of each input vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    # Optional architecture diagram, left disabled as in the original:
    # plot_model(network, to_file='model.png', show_shapes=True)
    return network
def predict_sentiment(review, vocab, tokenizer, model):
    """Classify a review as positive or negative.

    The review is cleaned with ``clean_doc``, restricted to words in
    ``vocab``, encoded as one binary row with the tokenizer, and passed to
    ``model.predict``.

    Args:
        review: Raw review text.
        vocab: Collection of allowed tokens.
        tokenizer: Object providing ``texts_to_matrix(texts, mode=...)``.
        model: Object providing ``predict(x, verbose=...)`` whose result is
            indexed at ``[0, 0]`` to get the positive-class score.

    Returns:
        A ``(score, label)`` pair. When the positive score rounds to 0 the
        label is ``'NEGATIVE'`` and the score is one minus the positive
        score; otherwise the label is ``'POSITIVE'`` with the positive score.
    """
    known_words = [word for word in clean_doc(review) if word in vocab]
    features = tokenizer.texts_to_matrix([' '.join(known_words)], mode='binary')
    prob_positive = model.predict(features, verbose=0)[0, 0]
    if round(prob_positive) != 0:
        return prob_positive, 'POSITIVE'
    return (1 - prob_positive), 'NEGATIVE'
if __name__ == "__main__":
    # Train the final model on every available review, then classify two
    # ad-hoc example reviews.
    vocab_filename = 'vocab.txt'
    vocab = set(load_doc(vocab_filename).split())

    # The original loaded and encoded the same corpus a second time into
    # test_docs / ytest / Xtest, none of which were ever used. That duplicate
    # work is dropped; the training data and printed results are unchanged.
    train_docs, ytrain = load_clean_dataset(vocab)
    tokenizer = create_tokenizer(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode='binary')

    # The width of the encoded matrix is the model's input size.
    n_words = Xtrain.shape[1]
    model = define_model(n_words)
    model.fit(Xtrain, ytrain, epochs=10, verbose=2)

    for text in ('Best movie ever! It was great!.', 'This is a bad movie.'):
        percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
        print("Review:{}\tSentiment:{}({})".format(text, sentiment, percent * 100))
二.代碼說明
最後使用所有可用數據(訓練集+測試集)訓練最終模型,並用它來預測新電影評論的類別。預測新評論的情感分類同樣需要遵循與測試數據相同的準備步驟,即加載文檔、清理文檔、編碼文檔,然後進行預測。代碼中體現爲函數predict_sentiment(),函數的參數爲評論文本、詞彙表、分詞器tokenizer和模型,返回值依次爲該分類的概率(乘以100即爲百分比)和預測的情緒標籤(NEGATIVE或POSITIVE)。函數通過調用predict()直接使用擬合好的模型得到評論爲正面的概率(介於0和1之間):該概率四捨五入後爲0時判爲負面評論,返回1減去該概率;否則判爲正面評論,返回該概率本身。
三.結果輸出
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 50) 1288950
_________________________________________________________________
dense_1 (Dense) (None, 1) 51
=================================================================
Total params: 1,289,001
Trainable params: 1,289,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
63/63 - 0s - loss: 0.4540 - accuracy: 0.7920
Epoch 2/10
63/63 - 0s - loss: 0.0531 - accuracy: 0.9965
Epoch 3/10
63/63 - 0s - loss: 0.0151 - accuracy: 1.0000
Epoch 4/10
63/63 - 0s - loss: 0.0069 - accuracy: 1.0000
Epoch 5/10
63/63 - 0s - loss: 0.0037 - accuracy: 1.0000
Epoch 6/10
63/63 - 1s - loss: 0.0020 - accuracy: 1.0000
Epoch 7/10
63/63 - 1s - loss: 0.0012 - accuracy: 1.0000
Epoch 8/10
63/63 - 0s - loss: 7.2241e-04 - accuracy: 1.0000
Epoch 9/10
63/63 - 0s - loss: 4.8940e-04 - accuracy: 1.0000
Epoch 10/10
63/63 - 0s - loss: 3.5379e-04 - accuracy: 1.0000
Review:Best movie ever! It was great!. Sentiment:POSITIVE(51.90609693527222)
Review:This is a bad movie. Sentiment:NEGATIVE(67.59185492992401)