Keras NLP: Comparing Four Word-Scoring Methods

1. Python Code

#!/usr/bin/env python3
# encoding: utf-8
'''
@file: keras_mode_comparison.py
@time: 2020/7/4 0004 20:10
@author: Jack
@contact: [email protected]
'''

import string
import re
from os import listdir
from numpy import array
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from pandas import DataFrame
from matplotlib import pyplot


# load a document into memory as a single string
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text


# split a doc into tokens: strip punctuation, keep alphabetic words,
# remove stopwords and single-character tokens
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [w for w in tokens if len(w) > 1]
    return tokens


# load a doc, clean it, and keep only tokens that appear in the vocab
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)


# load all docs in a directory; files named 'cv9*' are held out as the test split
def process_docs(directory, vocab, is_train):
    lines = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines


# load the dataset and build labels: 0 = negative, 1 = positive
def load_clean_dataset(vocab, is_train):
    neg = process_docs('txt_sentoken/neg', vocab, is_train)
    pos = process_docs('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels


# define a simple bag-of-words MLP classifier
def define_model(n_words):
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# evaluate one scoring mode: fit and test the model n_repeats times
# to average out training randomness
def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    scores = list()
    n_repeats = 10
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        model = define_model(n_words)
        model.fit(Xtrain, ytrain, epochs=10, verbose=0)
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('{} accuracy: {}'.format((i + 1), acc))
    return scores


# fit a Tokenizer on the training docs and encode both splits
# with the given scoring mode
def prepare_data(train_docs, test_docs, mode):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest


if __name__ == "__main__":
    vocab_filename = 'vocab.txt'
    vocab = load_doc(vocab_filename)
    vocab = set(vocab.split())
    train_docs, ytrain = load_clean_dataset(vocab, True)
    test_docs, ytest = load_clean_dataset(vocab, False)
    modes = ['binary', 'count', 'tfidf', 'freq']
    results = DataFrame()
    for mode in modes:
        Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
        results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)
    print(results.describe())
    results.boxplot()
    pyplot.show()

2. Code Explanation

The texts_to_matrix() function of the Keras Tokenizer API provides four different word-scoring modes (a standalone demo follows the list below):

  • binary: each word is marked as present (1) or absent (0)
  • count: the occurrence count of each word in the document
  • tfidf: each word is scored by its term frequency-inverse document frequency, so words that are common across all documents are penalized
  • freq: each word is scored by its frequency of occurrence within the document
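
A quick way to see what each mode produces is to fit a Tokenizer on a toy corpus and print all four matrices. Below is a minimal, self-contained sketch using the same keras.preprocessing.text.Tokenizer imported in the script above; the toy documents are made up for illustration:

from keras.preprocessing.text import Tokenizer

docs = ['the cat sat', 'the cat sat on the mat']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
for mode in ['binary', 'count', 'tfidf', 'freq']:
    # each row is one document; column 0 is reserved and always zero
    print(mode)
    print(tokenizer.texts_to_matrix(docs, mode=mode))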

The prepare_data() function encodes the given lists of training and test documents with the specified encoding mode and returns the encoded feature-vector matrices; note that the Tokenizer is fit on the training documents only.
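
For illustration, prepare_data() could be called on its own as below; the row counts assume the 1,800/200 train/test split produced by holding out the 'cv9*' files, and the column count depends on the vocabulary learned from the training documents:

Xtrain, Xtest = prepare_data(train_docs, test_docs, 'binary')
# rows = documents, columns = vocabulary size + 1 (index 0 is unused)
print(Xtrain.shape, Xtest.shape)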
Because neural networks are stochastic, the same model fit on the same data can still produce different results, mainly for two reasons (a seed-fixing sketch follows the list below):

  • the model's weight parameters are randomly initialized
  • the training data is randomly shuffled during training
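
The script handles this by averaging over repeated runs. An alternative, not used above, is to pin the random seeds, sketched below; note that full determinism also depends on the backend version and hardware, so averaging over repeats remains the more robust approach:

import random
from numpy.random import seed
import tensorflow as tf

random.seed(1)         # Python's built-in RNG
seed(1)                # NumPy RNG, used for weight initialization
tf.random.set_seed(1)  # TensorFlow RNG (use tf.set_random_seed(1) on TF 1.x)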

So any single score from the model is unreliable; we should judge the model by the average over multiple runs. The evaluate_mode() function runs the model n_repeats times: each run trains on the training set for 10 epochs, evaluates on the test set, and appends the resulting accuracy to the scores list, which is returned after all runs complete.
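
The main block collects each mode's scores into one DataFrame column so that describe() summarizes all modes at once; the same statistics can also be computed directly for a single mode, for example:

from numpy import mean, std

scores = evaluate_mode(Xtrain, ytrain, Xtest, ytest)
print('mean=%.3f std=%.3f' % (mean(scores), std(scores)))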
Finally, at the end of the run, a statistical summary is printed for each word-scoring method, describing the distribution of accuracy scores over the 10 runs for each mode. From these results, the count and binary methods appear to score better on average than freq and tfidf. The box plot shown at the end also indicates that the binary mode achieves the best results with a modest spread, making it the preferred method for this dataset.

3. Output

1 accuracy: 0.9200000166893005
2 accuracy: 0.925000011920929
3 accuracy: 0.925000011920929
4 accuracy: 0.9150000214576721
5 accuracy: 0.9150000214576721
6 accuracy: 0.9350000023841858
7 accuracy: 0.9300000071525574
8 accuracy: 0.9300000071525574
9 accuracy: 0.925000011920929
10 accuracy: 0.9350000023841858
1 accuracy: 0.8949999809265137
2 accuracy: 0.8949999809265137
3 accuracy: 0.9100000262260437
4 accuracy: 0.8849999904632568
5 accuracy: 0.8999999761581421
6 accuracy: 0.9100000262260437
7 accuracy: 0.8949999809265137
8 accuracy: 0.9049999713897705
9 accuracy: 0.8999999761581421
10 accuracy: 0.8949999809265137
1 accuracy: 0.9150000214576721
2 accuracy: 0.8849999904632568
3 accuracy: 0.8949999809265137
4 accuracy: 0.8799999952316284
5 accuracy: 0.8700000047683716
6 accuracy: 0.875
7 accuracy: 0.875
8 accuracy: 0.875
9 accuracy: 0.9049999713897705
10 accuracy: 0.8899999856948853
1 accuracy: 0.8650000095367432
2 accuracy: 0.8700000047683716
3 accuracy: 0.8600000143051147
4 accuracy: 0.8700000047683716
5 accuracy: 0.8700000047683716
6 accuracy: 0.8600000143051147
7 accuracy: 0.8700000047683716
8 accuracy: 0.8700000047683716
9 accuracy: 0.875
10 accuracy: 0.8650000095367432
          binary      count      tfidf       freq
count  10.000000  10.000000  10.000000  10.000000
mean    0.925500   0.899000   0.886500   0.867500
std     0.007246   0.007746   0.014729   0.004859
min     0.915000   0.885000   0.870000   0.860000
25%     0.921250   0.895000   0.875000   0.865000
50%     0.925000   0.897500   0.882500   0.870000
75%     0.930000   0.903750   0.893750   0.870000
max     0.935000   0.910000   0.915000   0.875000

[Figure: box plot of the accuracy distributions for the four word-scoring modes]
