一.Python代碼
#!/usr/bin/env python3
# encoding: utf-8
'''
@file: keras_mode_comparison.py
@time: 2020/7/4 0004 20:10
@author: Jack
@contact: [email protected]
'''
import string
import re
from os import listdir
from numpy import array
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from pandas import DataFrame
from matplotlib import pyplot
def load_doc(filename, encoding=None):
    """Read a whole text file into memory and return it as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding, which is what the
            function used before this parameter existed.

    Returns:
        The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/read/close sequence did not guarantee.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
def clean_doc(doc):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace, every character listed in
    ``string.punctuation`` is removed from each piece, and a piece is kept
    only if it is purely alphabetic, is not in the NLTK English stop-word
    list, and is longer than one character.

    Args:
        doc: The document text.

    Returns:
        The surviving tokens, in their original order.
    """
    words = doc.split()
    # Translation table that deletes every punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in words:
        word = raw.translate(strip_punctuation)
        if not word.isalpha():
            continue
        if word in english_stops:
            continue
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single line of in-vocabulary tokens.

    Args:
        filename: Path of the document to load.
        vocab: Container of allowed tokens; anything not in it is dropped.

    Returns:
        The cleaned, vocabulary-filtered tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    return ' '.join(token for token in cleaned if token in vocab)
def process_docs(directory, vocab, is_train):
    """Convert the documents of one directory into vocabulary-filtered lines.

    Files whose name starts with ``'cv9'`` form the test split; every other
    file belongs to the training split.

    Args:
        directory: Directory whose files are processed.
        vocab: Container of allowed tokens, forwarded to ``doc_to_line``.
        is_train: Truthy to process the training files, falsy to process
            the ``'cv9'`` test files.

    Returns:
        A list with one line of text per selected file, in the order
        ``listdir`` yields the file names.
    """
    def in_requested_split(name):
        # 'cv9*' files are the held-out ones
        held_out = name.startswith('cv9')
        return not held_out if is_train else held_out

    return [
        doc_to_line(directory + '/' + name, vocab)
        for name in listdir(directory)
        if in_requested_split(name)
    ]
def load_clean_dataset(vocab, is_train):
    """Load the negative and positive reviews of one split together with labels.

    Documents are read from ``txt_sentoken/neg`` and ``txt_sentoken/pos``,
    relative to the current working directory.

    Args:
        vocab: Container of allowed tokens.
        is_train: Truthy for the training split, falsy for the test split.

    Returns:
        A ``(docs, labels)`` pair: ``docs`` lists the negative documents
        followed by the positive ones, and ``labels`` is a NumPy array
        holding 0 for each negative and 1 for each positive document.
    """
    negative = process_docs('txt_sentoken/neg', vocab, is_train)
    positive = process_docs('txt_sentoken/pos', vocab, is_train)
    labels = array([0] * len(negative) + [1] * len(positive))
    return negative + positive, labels
def define_model(n_words):
    """Build and compile the feed-forward classifier.

    The network has one hidden ``Dense`` layer of 50 ReLU units and a single
    sigmoid output unit. It is compiled with binary cross-entropy loss, the
    Adam optimizer, and accuracy as the reported metric.

    Args:
        n_words: Length of each input vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    hidden = Dense(50, input_shape=(n_words,), activation='relu')
    output = Dense(1, activation='sigmoid')
    for layer in (hidden, output):
        network.add(layer)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    """Train and score a fresh model ten times and collect the accuracies.

    Each repetition builds a new model, fits it on the training data for
    10 epochs, evaluates it on the test data, and prints the run number
    with its accuracy.

    Args:
        Xtrain: Encoded training documents.
        ytrain: Training labels.
        Xtest: Encoded test documents; its second dimension sets the model
            input size.
        ytest: Test labels.

    Returns:
        A list with the test accuracy of every repetition.
    """
    n_repeats = 10
    n_words = Xtest.shape[1]
    scores = []
    for run in range(1, n_repeats + 1):
        # A new model per run, so no weights carry over between repetitions
        model = define_model(n_words)
        model.fit(Xtrain, ytrain, epochs=10, verbose=0)
        _loss, accuracy = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(accuracy)
        print('{} accuracy: {}'.format(run, accuracy))
    return scores
def prepare_data(train_docs, test_docs, mode):
    """Encode training and test documents as bag-of-words matrices.

    The tokenizer is fitted on the training documents only and then used
    to encode both sets.

    Args:
        train_docs: Training documents, one string per document.
        test_docs: Test documents, one string per document.
        mode: Scoring mode forwarded to ``Tokenizer.texts_to_matrix``
            (the script uses 'binary', 'count', 'tfidf' and 'freq').

    Returns:
        A ``(Xtrain, Xtest)`` pair of encoded matrices.
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(train_docs)
    encoded_train, encoded_test = (
        encoder.texts_to_matrix(docs, mode=mode)
        for docs in (train_docs, test_docs)
    )
    return encoded_train, encoded_test
if __name__ == "__main__":
    # The vocabulary is the set of whitespace-separated tokens in vocab.txt
    vocab_filename = 'vocab.txt'
    vocab = set(load_doc(vocab_filename).split())

    # Both splits are filtered with the same vocabulary
    train_docs, ytrain = load_clean_dataset(vocab, True)
    test_docs, ytest = load_clean_dataset(vocab, False)

    # One column of accuracy scores per word-scoring mode
    modes = ('binary', 'count', 'tfidf', 'freq')
    results = DataFrame()
    for mode in modes:
        Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
        results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)

    # Summary statistics first, then a box plot of the score distributions
    print(results.describe())
    results.boxplot()
    pyplot.show()
二.代碼說明
Keras API中Tokenizer的texts_to_matrix()函數提供了4種不同的評分方法:
- binary 單詞被標記爲存在(1)或不存在(0)
- count 每個單詞的出現次數
- tfidf 每個單詞根據其詞頻-逆文檔頻率(TF-IDF)進行評分,其中所有文檔中常見的單詞都會受到懲罰
- freq 根據文檔中出現的頻率對單詞進行評分
代碼中函數prepare_data()在給定訓練和測試文檔列表上實現用特定的編碼mode進行文檔編碼,最後輸出編碼後的特徵向量矩陣。
因爲神經網絡是隨機的,即便相同的模型擬合(fit)相同的數據時,它們也有可能產生不同的結果,這主要是兩個原因引起的:
- 模型的權重參數的數值初始化是隨機的
- 訓練過程中數據是隨機打亂的
所以模型的任何一個評分都是不可靠的,我們應該根據多次運行的平均值來評估模型好壞。函數evaluate_mode()運行模型n_repeats次,每次都在訓練集上訓練10個epoch,然後使用測試集評估模型性能,並把每次測試集得到的準確率添加到分數列表scores中,最後返回所有這些運行中的準確率分數列表。
最後在運行結束時,提供每種單詞評分方法的統計摘要,總結每個模式的10次運行中的每一次的模型性能分數的分佈。從結果數據可以看到count和binary方法的平均得分似乎比freq和tfidf更好。此外從最後顯示的箱線圖也可以看到binary模式在分數離散程度適中的情況下獲得了最佳結果,並且是這個數據集的首選方法。
三.結果輸出
1 accuracy: 0.9200000166893005
2 accuracy: 0.925000011920929
3 accuracy: 0.925000011920929
4 accuracy: 0.9150000214576721
5 accuracy: 0.9150000214576721
6 accuracy: 0.9350000023841858
7 accuracy: 0.9300000071525574
8 accuracy: 0.9300000071525574
9 accuracy: 0.925000011920929
10 accuracy: 0.9350000023841858
1 accuracy: 0.8949999809265137
2 accuracy: 0.8949999809265137
3 accuracy: 0.9100000262260437
4 accuracy: 0.8849999904632568
5 accuracy: 0.8999999761581421
6 accuracy: 0.9100000262260437
7 accuracy: 0.8949999809265137
8 accuracy: 0.9049999713897705
9 accuracy: 0.8999999761581421
10 accuracy: 0.8949999809265137
1 accuracy: 0.9150000214576721
2 accuracy: 0.8849999904632568
3 accuracy: 0.8949999809265137
4 accuracy: 0.8799999952316284
5 accuracy: 0.8700000047683716
6 accuracy: 0.875
7 accuracy: 0.875
8 accuracy: 0.875
9 accuracy: 0.9049999713897705
10 accuracy: 0.8899999856948853
1 accuracy: 0.8650000095367432
2 accuracy: 0.8700000047683716
3 accuracy: 0.8600000143051147
4 accuracy: 0.8700000047683716
5 accuracy: 0.8700000047683716
6 accuracy: 0.8600000143051147
7 accuracy: 0.8700000047683716
8 accuracy: 0.8700000047683716
9 accuracy: 0.875
10 accuracy: 0.8650000095367432
binary count tfidf freq
count 10.000000 10.000000 10.000000 10.000000
mean 0.925500 0.899000 0.886500 0.867500
std 0.007246 0.007746 0.014729 0.004859
min 0.915000 0.885000 0.870000 0.860000
25% 0.921250 0.895000 0.875000 0.865000
50% 0.925000 0.897500 0.882500 0.870000
75% 0.930000 0.903750 0.893750 0.870000
max 0.935000 0.910000 0.915000 0.875000