Bert text classification (training, saving, loading, and single-text prediction with keras-bert)

Preparing the Bert pre-trained model

References: Chinese pre-trained model downloads; the article 《當Bert遇上Keras:這可能是Bert最簡單的打開姿勢》 ("When Bert Meets Keras: possibly the simplest way to use Bert"); the keras-bert library.

Result of downloading and extracting:
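For the Google-released Chinese Bert base checkpoint (chinese_L-12_H-768_A-12), the extracted folder typically contains the following files; point config_path, checkpoint_path, and dict_path in the code below at them:

bert_config.json
bert_model.ckpt.data-00000-of-00001
bert_model.ckpt.index
bert_model.ckpt.meta
vocab.txt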

Text classification with the Bert model

1. Data preparation

Hotel review data is available on GitHub: https://github.com/Hejp5665/bert_keras_nlp

Training set: 5,888 samples (2,940 positive, 2,948 negative)

Test set: 101 samples (50 positive, 51 negative)
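The loading code below reads the Excel files with pandas and assumes two columns: contents (the review text) and labels (the 0/1 class). A minimal sketch of that layout with made-up rows, in case you want to build your own files:

import pandas as pd

# Hypothetical two-row sample in the column layout get_train_test_data() expects;
# which label value means "positive" depends on how the dataset was built.
df = pd.DataFrame({'contents': ['房間乾淨,服務很好', '隔音太差,不會再來'],
                   'labels': [1, 0]})
df.to_excel(r'data\data_train.xlsx', index=False)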

2. Code implementation


'''
Note: due to my GPU's limits I could only run the base Bert pre-trained model;
if you hit an OOM error, reduce batch_size and/or maxlen accordingly.
I used numpy==1.16.4; other versions may produce warnings.
'''

import pandas as pd
import codecs, gc
import numpy as np
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.metrics import top_k_categorical_accuracy
from keras.layers import *
from keras.callbacks import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn import metrics
from sklearn.model_selection import train_test_split  # for splitting off a validation set

# Configuration
maxlen      = 100   # sequence length; must stay under Bert's 512-token limit
Batch_size  = 16    # number of samples per batch
Epoch       = 1     # number of training epochs

def get_train_test_data():
    train_df = pd.read_excel(r'data\data_train.xlsx').astype(str)
    test_df = pd.read_excel(r'data\data_test.xlsx').astype(str)

    # Convert the training/test texts and labels into the model's input format:
    # each entry is a (text, one-hot label) pair.
    DATA_LIST = []
    for data_row in train_df.iloc[:].itertuples():
        DATA_LIST.append((data_row.contents, to_categorical(data_row.labels, 2)))
    DATA_LIST = np.array(DATA_LIST, dtype=object)  # dtype=object keeps the (str, array) pairs intact on newer numpy

    DATA_LIST_TEST = []
    for data_row in test_df.iloc[:].itertuples():
        DATA_LIST_TEST.append((data_row.contents, to_categorical(data_row.labels, 2)))
    DATA_LIST_TEST = np.array(DATA_LIST_TEST, dtype=object)

    # Hold out 20% of the training data for validation
    X_train, X_valid = train_test_split(DATA_LIST, test_size=0.2, random_state=0)
    return X_train, X_valid, DATA_LIST_TEST

# Pre-trained model: roberta_wwm_ext_large
# config_path     = r'roberta_wwm_ext_large\bert_config.json' # config file
# checkpoint_path = r'roberta_wwm_ext_large\bert_model.ckpt'
# dict_path       = r'roberta_wwm_ext_large\vocab.txt'

# Pre-trained model: bert base
config_path     = r'bert\bert_config.json' # config file
checkpoint_path = r'bert\bert_model.ckpt'
dict_path       = r'bert\vocab.txt'


def get_token_dict():
    """
    Map each token in the vocab file to an integer id.
    :return: token-to-id dict
    """
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    return token_dict

# Subclass the keras_bert Tokenizer so every character maps to some token
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace characters with [unused1]
            else:
                R.append('[UNK]')  # characters outside the vocab become [UNK] (unknown)
        return R

# Instantiate the custom tokenizer
tokenizer = OurTokenizer(get_token_dict())
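# For reference (standard keras_bert Tokenizer behaviour): encode() wraps the text
# with [CLS]/[SEP] and returns (token_ids, segment_ids); for a single sentence the
# segment ids are all zeros, e.g. x1, x2 = tokenizer.encode(first='味道不錯')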


def seq_padding(X, padding=0):
    """
    :param X: list of id sequences
    :param padding: value used for padding (default 0)
    :return: every sequence right-padded with `padding` to the batch max length
    """
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
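# Example: seq_padding([[1, 2, 3], [4, 5]]) ->
#   array([[1, 2, 3],
#          [4, 5, 0]])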


class data_generator:
    """
    A simple batch generator, used instead of loading every encoded batch into memory at once.
    """
    def __init__(self, data, batch_size=Batch_size, shuffle=True):
        """
        :param data: array of (text, one-hot label) pairs
        :param batch_size: number of samples per batch
        :param shuffle: whether to shuffle the data each epoch
        """
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))

            if self.shuffle:
                np.random.shuffle(idxs)

            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]  # truncate to maxlen characters
                x1, x2 = tokenizer.encode(first=text)  # token ids and segment ids
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y[:, 0, :]  # drop the extra axis introduced by Y.append([y])
                    X1, X2, Y = [], [], []



def acc_top2(y_true, y_pred):
    """
    :param y_true: ground-truth labels
    :param y_pred: predicted probabilities
    :return: top-k accuracy; a prediction counts as correct when the target class
             is among the k highest-scoring classes. (With only two classes,
             top-2 accuracy is trivially 1.0; it is kept here as a template
             for multi-class tasks.)
    """
    return top_k_categorical_accuracy(y_true, y_pred, k=2)


# Bert model definition
def build_bert(nclass):
    """
    :param nclass: number of target classes
    :return: compiled Bert classification model
    """
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)  # load the pre-trained checkpoint

    for l in bert_model.layers:
        l.trainable = True  # fine-tune all Bert layers
    # Build the classification head
    x1_in = Input(shape=(None,))  # token ids
    x2_in = Input(shape=(None,))  # segment ids

    x = bert_model([x1_in, x2_in])
    x = Lambda(lambda x: x[:, 0])(x)  # take the vector at the [CLS] position for classification
    p = Dense(nclass, activation='softmax')(x)

    model = Model([x1_in, x2_in], p)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(1e-5),  # a sufficiently small learning rate for fine-tuning
                  metrics=['accuracy', acc_top2])
    print(model.summary())
    return model


def run_kb():
    """
    Train the model.
    :return: validation predictions, test predictions, validation labels, trained model, test data
    """
    # Build the model
    print('Loading the model, please wait....')
    model = build_bert(2)  # binary classifier
    print('Model loaded, starting training....')
    early_stopping = EarlyStopping(monitor='val_acc', patience=3)  # early stopping to curb overfitting
    plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.5, patience=2)  # halve the learning rate when val_acc stops improving
    checkpoint = ModelCheckpoint(r'C:\Users\ChuangLan\PycharmProjects\bert_porject\use_bert\bert_dump1.hdf5', monitor='val_acc', verbose=2,
                                 save_best_only=True, mode='max', save_weights_only=True)  # keep the best weights
    # Load the data and build the batch generators
    X_train, X_valid, data_test = get_train_test_data()
    train_D = data_generator(X_train, shuffle=True)
    valid_D = data_generator(X_valid, shuffle=True)
    test_D = data_generator(data_test, shuffle=False)

    # Train
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=Epoch,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=[early_stopping, plateau, checkpoint],
        )
    # Predict on the validation and test sets; the validation generator is rebuilt
    # without shuffling so predictions line up with X_valid
    valid_D = data_generator(X_valid, shuffle=False)
    valid_model_pred = model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=1)
    test_model_pred  = model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=1)
    # Convert predicted probabilities into class labels
    valid_pred = [np.argmax(x) for x in valid_model_pred]
    test_pred = [np.argmax(x) for x in test_model_pred]
    y_true = [np.argmax(x) for x in X_valid[:, 1]]

    return valid_pred, test_pred, y_true, model, data_test


def bk_metrics(y_true, y_pred, tag='metrics'):
    """
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param tag: label printed alongside the metrics
    :return: prints the evaluation metrics
    """
    print(tag, '...')
    print(metrics.confusion_matrix(y_true, y_pred))
    print('Accuracy:', metrics.accuracy_score(y_true, y_pred))
    print('Per-class precision:', metrics.precision_score(y_true, y_pred, average=None))  # no averaging
    print('Macro-averaged precision:', metrics.precision_score(y_true, y_pred, average='macro'))
    print('Micro-averaged recall:', metrics.recall_score(y_true, y_pred, average='micro'))
    print('Weighted F1 score:', metrics.f1_score(y_true, y_pred, average='weighted'))

if __name__ == '__main__':

    # Train and predict
    valid_pred, test_pred, y_true, model, data_test = run_kb()

    # Evaluate on the validation set (bk_metrics expects y_true first)
    bk_metrics(y_true, valid_pred, tag='valid metrics')
    # Evaluate on the test set
    bk_metrics([np.argmax(x) for x in data_test[:, 1]], test_pred, tag='test metrics')
    # Save the trained model
    model_path = r'use_bert\bertkeras_model.h5'
    model.save(model_path)


    # Load the model back: keras_bert's custom layers and our custom metric
    # must both be passed via custom_objects
    from keras_bert import get_custom_objects
    from keras.models import load_model
    custom_objects = get_custom_objects()  # keras_bert's custom layers
    custom_objects.update({'acc_top2': acc_top2})  # plus our custom metric
    model = load_model(model_path, custom_objects=custom_objects)


    # Predict a single text
    text = '這家餐廳的菜味道可以'
    DATA_text = []
    DATA_text.append((text, to_categorical(0, 2)))  # the label is a dummy; only the text is used
    DATA_text = np.array(DATA_text, dtype=object)
    text_D = data_generator(DATA_text, shuffle=False)
    test_model_pred = model.predict_generator(text_D.__iter__(), steps=len(text_D), verbose=1)
    print('Prediction:', test_model_pred)
    print(np.argmax(test_model_pred))


    del model  # free the model
    gc.collect()  # force garbage collection
    K.clear_session()  # clear the current Keras/TensorFlow session
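
One detail worth noting: the ModelCheckpoint callback above was created with save_weights_only=True, so bert_dump1.hdf5 holds only the weights, not a full model. To restore that best checkpoint you first rebuild the architecture and then load the weights; a minimal sketch, reusing build_bert from the script above:

# Sketch: restore the best weights saved by the ModelCheckpoint callback
# (the model.save()/load_model() pair above handles the full .h5 model instead).
best_model = build_bert(2)  # rebuild the identical architecture
best_model.load_weights(r'C:\Users\ChuangLan\PycharmProjects\bert_porject\use_bert\bert_dump1.hdf5')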

Results after training for 10 epochs:

[[48  3]
 [ 6 44]]
Accuracy: 0.9108910891089109
Per-class precision: [0.88888889 0.93617021]
Macro-averaged precision: 0.9125295508274232
Micro-averaged recall: 0.9108910891089109
Weighted F1 score: 0.9107861007013809

Results after training for 20 epochs:

[[48  3]
 [ 5 45]]
Accuracy: 0.9207920792079208
Per-class precision: [0.90566038 0.9375    ]
Macro-averaged precision: 0.9215801886792453
Micro-averaged recall: 0.9207920792079208
Weighted F1 score: 0.9207454497412065

 