BERT Pre-trained Model Preparation
Download a Chinese pre-trained model; see "When BERT Meets Keras: This May Be the Simplest Way to Use BERT" and the keras-bert project.
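keras-bert can be installed from PyPI; a minimal environment sketch (the numpy pin comes from the note in the code below, everything else about the environment is your own setup):

    pip install keras-bert
    pip install numpy==1.16.4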
Contents after downloading and extracting:
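For reference, the extracted BERT-Base Chinese checkpoint (Google's chinese_L-12_H-768_A-12 release) typically contains the files below; the code in the next section assumes the extracted folder has been renamed to bert:

    bert/
        bert_config.json
        bert_model.ckpt.data-00000-of-00001
        bert_model.ckpt.index
        bert_model.ckpt.meta
        vocab.txt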
BERT Text Classification
1. Data Preparation
Hotel review data is available on GitHub: https://github.com/Hejp5665/bert_keras_nlp
Training set: 5,888 samples (2,940 positive, 2,948 negative)
Test set: 101 samples (50 positive, 51 negative)
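The loading code in the next section expects each Excel file to contain two columns, contents (the review text) and labels (a 0/1 sentiment label); those column names are taken from get_train_test_data() below. A quick sanity check, assuming the files from the repository have been placed under a local data\ folder (the path matches the code below):

    import pandas as pd

    # Verify the expected layout: columns 'contents' and 'labels',
    # matching the attribute accesses in get_train_test_data().
    train_df = pd.read_excel(r'data\data_train.xlsx').astype(str)
    print(train_df.columns.tolist())          # expected: ['contents', 'labels']
    print(train_df['labels'].value_counts())  # expected counts: 2940 and 2948, per the split above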
2. Code Implementation
'''
Note: limited GPU memory means only the base version of the BERT pre-trained model can be run here.
If an OOM error occurs, reduce batch_size and/or maxlen accordingly.
Tested with numpy==1.16.4; other versions may produce warnings.
'''
import pandas as pd
import codecs, gc
import numpy as np
from sklearn.model_selection import KFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.metrics import top_k_categorical_accuracy
from keras.layers import *
from keras.callbacks import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn import metrics
# For splitting the training data into train/validation sets
from sklearn.model_selection import train_test_split

# Hyperparameters
maxlen = 100      # sequence length; must not exceed BERT's limit of 512
Batch_size = 16   # number of samples per batch
Epoch = 1         # number of training epochs
def get_train_test_data():
    # Read the training and test sets
    train_df = pd.read_excel(r'data\data_train.xlsx').astype(str)
    test_df = pd.read_excel(r'data\data_test.xlsx').astype(str)
    # Convert the training data, test data and labels into the model input format
    DATA_LIST = []
    for data_row in train_df.iloc[:].itertuples():
        DATA_LIST.append((data_row.contents, to_categorical(data_row.labels, 2)))
    DATA_LIST = np.array(DATA_LIST)
    DATA_LIST_TEST = []
    for data_row in test_df.iloc[:].itertuples():
        DATA_LIST_TEST.append((data_row.contents, to_categorical(data_row.labels, 2)))
    DATA_LIST_TEST = np.array(DATA_LIST_TEST)
    data = DATA_LIST
    data_test = DATA_LIST_TEST
    # Hold out 20% of the training data as a validation set
    X_train, X_valid = train_test_split(data, test_size=0.2, random_state=0)
    return X_train, X_valid, data_test
# Pre-trained model: roberta_wwm_ext_large
# config_path = r'roberta_wwm_ext_large\bert_config.json'
# checkpoint_path = r'roberta_wwm_ext_large\bert_model.ckpt'
# dict_path = r'roberta_wwm_ext_large\vocab.txt'

# Pre-trained model: BERT base
config_path = r'bert\bert_config.json'     # configuration file
checkpoint_path = r'bert\bert_model.ckpt'  # checkpoint weights
dict_path = r'bert\vocab.txt'              # vocabulary file
def get_token_dict():
    """
    Map each token in the vocabulary file to an integer id.
    :return: the token-to-id dictionary
    """
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    return token_dict
# Override the default tokenizer
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # use [unused1] for whitespace-like characters
            else:
                R.append('[UNK]')      # characters not in the vocabulary map to [UNK] (unknown)
        return R

# Instantiate the custom tokenizer
tokenizer = OurTokenizer(get_token_dict())
def seq_padding(X, padding=0):
    """
    :param X: list of token-id sequences
    :param padding: value used for padding (0)
    :return: sequences padded with 0 so that every sequence has the same length
    """
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
class data_generator:
    """
    data_generator yields batches lazily to save memory.
    """
    def __init__(self, data, batch_size=Batch_size, shuffle=True):
        """
        :param data: list of (text, label) samples
        :param batch_size: number of samples per batch
        :param shuffle: whether to shuffle the samples
        """
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            if self.shuffle:
                np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y[:, 0, :]
                    X1, X2, Y = [], [], []
def acc_top2(y_true, y_pred):
    """
    :param y_true: ground-truth labels
    :param y_pred: predicted probabilities
    :return: top-k accuracy; a prediction counts as correct if the target class
             is among the k highest-probability classes (here k=2)
    """
    return top_k_categorical_accuracy(y_true, y_pred, k=2)
# Build the BERT model
def build_bert(nclass):
    """
    :param nclass: number of target classes
    :return: the compiled BERT classification model
    """
    # Load the pre-trained model
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
    for l in bert_model.layers:
        l.trainable = True
    # Build the classification head
    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))
    x = bert_model([x1_in, x2_in])
    x = Lambda(lambda x: x[:, 0])(x)  # take the vector of the [CLS] token for classification
    p = Dense(nclass, activation='softmax')(x)
    model = Model([x1_in, x2_in], p)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(1e-5),  # use a sufficiently small learning rate
                  metrics=['accuracy', acc_top2])
    model.summary()
    return model
def run_kb():
    """
    Train the model.
    :return: validation-set predictions, test-set predictions, validation labels,
             the trained model, and the test data
    """
    # Build the model
    print('Loading the model, please wait....')
    model = build_bert(2)  # binary classification model
    print('Model loaded, starting training....')
    early_stopping = EarlyStopping(monitor='val_acc', patience=3)  # early stopping to prevent overfitting
    plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.5, patience=2)  # reduce the learning rate when the metric stops improving
    checkpoint = ModelCheckpoint(r'C:\Users\ChuangLan\PycharmProjects\bert_porject\use_bert\bert_dump1.hdf5', monitor='val_acc', verbose=2,
                                 save_best_only=True, mode='max', save_weights_only=True)  # keep only the best weights
    # Load the data and wrap it in generators
    X_train, X_valid, data_test = get_train_test_data()
    train_D = data_generator(X_train, shuffle=True)
    valid_D = data_generator(X_valid, shuffle=True)
    test_D = data_generator(data_test, shuffle=False)
    # Train the model
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=Epoch,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=[early_stopping, plateau, checkpoint],
    )
    # Predict on the validation and test sets (no shuffling, so predictions stay aligned with labels)
    valid_D = data_generator(X_valid, shuffle=False)
    train_model_pred = model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=1)  # validation-set probabilities
    test_model_pred = model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=1)     # test-set probabilities
    # Convert predicted probabilities into class labels
    train_pred = [np.argmax(x) for x in train_model_pred]
    test_pred = [np.argmax(x) for x in test_model_pred]
    y_true = [np.argmax(x) for x in X_valid[:, 1]]
    return train_pred, test_pred, y_true, model, data_test
def bk_metrics(y_true, y_pred, type='metrics'):
    """
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param type: name of the evaluation run
    :return: prints the evaluation metrics
    """
    print(type, '...')
    print(metrics.confusion_matrix(y_true, y_pred))
    print('Accuracy:', metrics.accuracy_score(y_true, y_pred))
    print('Per-class precision:', metrics.precision_score(y_true, y_pred, average=None))  # no averaging
    print('Macro-averaged precision:', metrics.precision_score(y_true, y_pred, average='macro'))
    print('Micro-averaged recall:', metrics.recall_score(y_true, y_pred, average='micro'))
    print('Weighted-average F1 score:', metrics.f1_score(y_true, y_pred, average='weighted'))
if __name__ == '__main__':
    # Train and predict
    train_pred, test_pred, y_true, model, data_test = run_kb()
    # Evaluate on the validation set (ground truth first, then predictions)
    bk_metrics(y_true, train_pred, type='train metrics')
    # Evaluate on the test set
    bk_metrics([np.argmax(x) for x in data_test[:, 1]], test_pred, type='test metrics')
    # Save the model
    model_path = r'use_bert\bertkeras_model.h5'
    model.save(model_path)
    # Reload the model
    from keras_bert import get_custom_objects
    from keras.models import load_model
    custom_objects = get_custom_objects()
    my_objects = {'acc_top2': acc_top2}
    custom_objects.update(my_objects)
    model = load_model(model_path, custom_objects=custom_objects)
    # Classify a single piece of text
    text = '這家餐廳的菜味道可以'  # "The food at this restaurant tastes fine"
    DATA_text = []
    DATA_text.append((text, to_categorical(0, 2)))  # dummy label; only the text matters for prediction
    DATA_text = np.array(DATA_text)
    text_D = data_generator(DATA_text, shuffle=False)
    test_model_pred = model.predict_generator(text_D.__iter__(), steps=len(text_D), verbose=1)
    print('Prediction:', test_model_pred)
    print(np.argmax(test_model_pred))
    del model          # delete the model to free memory
    gc.collect()       # force garbage collection
    K.clear_session()  # clear the Keras/TensorFlow session
Results after training for 10 epochs:
[[48  3]
 [ 6 44]]
Accuracy: 0.9108910891089109
Per-class precision: [0.88888889 0.93617021]
Macro-averaged precision: 0.9125295508274232
Micro-averaged recall: 0.9108910891089109
Weighted-average F1 score: 0.9107861007013809

Results after training for 20 epochs:
[[48  3]
 [ 5 45]]
Accuracy: 0.9207920792079208
Per-class precision: [0.90566038 0.9375    ]
Macro-averaged precision: 0.9215801886792453
Micro-averaged recall: 0.9207920792079208
Weighted-average F1 score: 0.9207454497412065