1.下载bert源代码和中文预训练模型
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
2.准备样本
依旧采用上一节中使用的ai挑战赛用户评论信息。对于自己使用的场景按照对应的格式处理好即可。例如这边样本格式如下(正文+标签):
(说明 此处用的标签含义是从-2~1 共4种代表不同的情感标签,是个4分类。为了便于处理,会将标签投影到1~4 data.others_overall_experience = data.others_overall_experience + 3)
将样本分成三个文件,且放置于同一个文件夹下(注意:下文预处理代码实际保存的文件名是 .csv 扩展名,但内容仍按 \t 分隔,自定义 Processor 中读取的文件名与之保持一致即可):
- train.csv:训练集
- dev.csv:验证集
- test.csv:测试集
样本打乱之后按照比例划分。新建一个preprocess.py
的文件用于数据预处理。
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
def train_valid_test_split(x_data, y_data, validation_size=0.1, test_size=0.1):
    """Split features and labels into train / validation / test subsets.

    The test fraction is carved off first; the validation fraction is then
    rescaled so that both fractions are shares of the ORIGINAL data set,
    not of the post-test remainder.
    """
    x_rest, x_test, y_rest, y_test = train_test_split(
        x_data, y_data, test_size=test_size)
    # validation_size is expressed relative to the full data set, so scale
    # it up to the remaining (1 - test_size) portion for the second split.
    rescaled_valid = validation_size / (1.0 - test_size)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_rest, y_rest, test_size=rescaled_valid)
    return x_train, x_valid, x_test, y_train, y_valid, y_test
# Load the raw samples, shuffle, split 8:1:1, and dump each split as a
# tab-separated file (BERT's DataProcessor._read_tsv splits on \t, so the
# separator matters more than the .csv extension).
pd_all = pd.read_csv("./sample.csv")  # fixed: removed stray closing parenthesis
pd_all = shuffle(pd_all)
# content = review text, others_overall_experience = sentiment label (1..4).
x_data, y_data = pd_all.content, pd_all.others_overall_experience
x_train, x_valid, x_test, y_train, y_valid, y_test = train_valid_test_split(
    x_data, y_data, 0.1, 0.1)
# Label column first, text column second: _create_examples reads
# line[0] as the label and line[1] as the text.
train = pd.DataFrame({'label': y_train, 'x_train': x_train})
train.to_csv("./train.csv", index=False, encoding='utf-8', sep='\t')
valid = pd.DataFrame({'label': y_valid, 'x_valid': x_valid})
valid.to_csv("./dev.csv", index=False, encoding='utf-8', sep='\t')
test = pd.DataFrame({'label': y_test, 'x_test': x_test})
test.to_csv("./test.csv", index=False, encoding='utf-8', sep='\t')
3.修改bert代码
run_classifier.py
添加自定义的数据处理模块,默认内部已经存在了几个。
class CommentProcessor(DataProcessor):
    """Processor for the user-comment sentiment data set (4 labels, 1..4)."""

    def _load_split(self, data_dir, filename, set_type):
        # Shared helper: read one tab-separated split file and turn its
        # rows into InputExamples.  quotechar='"' lets csv.reader handle
        # quoted fields containing tabs.
        path = os.path.join(data_dir, filename)
        return self._create_examples(self._read_tsv(path, quotechar='"'), set_type)

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "train.csv", "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "dev.csv", "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "test.csv", "test")

    def get_labels(self):
        """Return the label vocabulary (sentiment classes mapped to 1..4)."""
        return ["1", "2", "3", "4"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for idx, row in enumerate(lines):
            if idx == 0:
                # All sets have a header row; skip it.
                continue
            examples.append(InputExample(
                guid="%s-%s" % (set_type, idx),
                text_a=tokenization.convert_to_unicode(row[1]),  # column 2: text
                text_b=None,
                label=tokenization.convert_to_unicode(row[0])))  # column 1: label
        return examples
#
# .........
#
def main(_):
    """Entry point of run_classifier.py (excerpt; the rest of the body is elided in this note)."""
    tf.logging.set_verbosity(tf.logging.INFO)
    # Registry mapping the lower-cased --task_name flag value to a
    # DataProcessor implementation; only a registered key can be selected
    # on the command line.
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "comment": CommentProcessor,  # newly added custom data processor
    }
注:
self._read_tsv
是继承于DataProcessor
方法所以数据的处理需要注意下该方法实现的默认参数。
例如此处csv.reader delimiter
是按照\t
分割
self._read_tsv(os.path.join(data_dir, "train.csv"), quotechar='"')
这里的quotechar
引用符,如果正文中存在嵌套的双引号就会出错了。
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
    """Reads a tab separated value file."""
    with tf.gfile.Open(input_file, "r") as f:
        # Materialize the whole file; quotechar (if given) lets quoted
        # fields contain embedded tab characters.
        rows = list(csv.reader(f, delimiter="\t", quotechar=quotechar))
        print(len(rows))  # debug: number of rows read
        return rows
4.run
python run_classifier.py --task_name=comment --do_train=true --do_eval=true --data_dir=/bert/bert-demo/bert/data/ --vocab_file=/bert/model/chinese_L-12_H-768_A-12/vocab.txt --bert_config_file=/bert/model/chinese_L-12_H-768_A-12/bert_config.json --init_checkpoint=/bert/model/chinese_L-12_H-768_A-12/bert_model.ckpt --max_seq_length=128 --train_batch_size=32 --learning_rate=2e-5 --num_train_epochs=3.0 --output_dir=/output
(注:task_name 必须与上文 processors 字典中注册的键 "comment" 一致,否则会报 Task not found;原命令中重复传入的 data_dir 已去除。)
涉及到路径/bert/xxx
加载的是数据和模型 按照实际的路径修改即可
do_train
是否训练
do_eval
是否验证
结果:
/output/model.ckpt-7875
INFO:tensorflow:evaluation_loop marked as finished
INFO:tensorflow:***** Eval results *****
INFO:tensorflow: eval_accuracy = 0.73209524
INFO:tensorflow: eval_loss = 0.7203514
INFO:tensorflow: global_step = 7875
INFO:tensorflow: loss = 0.72009945
5.总结
与上小节lstm 和cnn 相比,bert的精度和损失更具有优势(训练集和验证集均为随机,没有使用完全相同的分组可能存在一点差异)
在 自然语言几个重要的模型 这一节中最后提到 ERNIE更适合中文场景进行的词MASK,这也是待优化的点。