Baseline for the Adversarial Attack Based Question Equivalence Discrimination Competition

Competition page: https://biendata.com/competition/2019diac/

See GitHub for details.

Data processing (parse train_set.xml into labeled question pairs, word-segment them, and build a character vocabulary):

# -*- coding: utf-8 -*-
"""
# @Time    : 2019/11/28 17:46
# @Author  : xiaoxiong
# @Email   : [email protected]
# @File    : data_process.py
# @Software: PyCharm
# DESC :
"""
import re
import jieba
import codecs
import json

from xml.dom.minidom import parse
def generate_train_data_pair(equ_questions, not_equ_questions):
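    # cross pairs (equivalent x not-equivalent) are labelled 0; pairs of two
    # distinct equivalent questions are labelled 1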
    a = [x+"\t"+y+"\t"+"0" for x in equ_questions for y in not_equ_questions]
    b = [x+"\t"+y+"\t"+"1" for x in equ_questions for y in equ_questions if x!=y]
    return a+b
def parse_train_data(xml_data):
    pair_list = []
    doc = parse(xml_data)
    collection = doc.documentElement
    for i in collection.getElementsByTagName("Questions"):
        # if i.hasAttribute("number"):
        #     print ("Questions number=", i.getAttribute("number"))
        EquivalenceQuestions = i.getElementsByTagName("EquivalenceQuestions")
        NotEquivalenceQuestions = i.getElementsByTagName("NotEquivalenceQuestions")
        equ_questions = EquivalenceQuestions[0].getElementsByTagName("question")
        not_equ_questions = NotEquivalenceQuestions[0].getElementsByTagName("question")
        equ_questions_list, not_equ_questions_list = [], []
        for q in equ_questions:
            try:
                equ_questions_list.append(q.childNodes[0].data.strip())
            except Exception:  # skip empty <question> nodes
                continue
        for q in not_equ_questions:
            try:
                not_equ_questions_list.append(q.childNodes[0].data.strip())
            except Exception:  # skip empty <question> nodes
                continue
        pair = generate_train_data_pair(equ_questions_list, not_equ_questions_list)
        pair_list.extend(pair)
    print("All pair count=", len(pair_list))
    return pair_list
def write_train_data(file, pairs):
    with open(file, "w", encoding='utf-8') as f:
        for pair in pairs:
            f.write(pair+"\n")

def cut_word(file, cut_file):
    tmp_list = []
    punctuation_remove = '[:;……()『』《》【】★~!"#$%&\'()*+,-./:;<=>?!“”:、,。?@[\\]^_`{|}~]+'
    with open(file, "r", encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            tmp_line = line.split('\t')
            x1 = ' '.join(list(jieba.cut(re.sub(punctuation_remove, '', tmp_line[0].strip()))))
            x2 = ' '.join(list(jieba.cut(re.sub(punctuation_remove, '', tmp_line[1].strip()))))
            x3 = tmp_line[2].strip()
            x = x1.strip()+","+x2.strip()+","+x3
            tmp_list.append(x)
    write_train_data(cut_file, tmp_list)
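# Note: the word-segmented file (train_data_cut.txt) is not consumed by the
# discrimination model below, which reads ./data/train_data.txt at the
# character level.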

def all_chars():
    chars = {}
    with open("./data/train_data.txt", "r", encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            for word in line:
                chars[word] = chars.get(word, 0) + 1
    with open("./data/dev_set.csv", "r", encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            for word in line:
                chars[word] = chars.get(word, 0) + 1
    return chars
if __name__ == "__main__":
    min_count = 2
    pair_list = parse_train_data("./data/train_set.xml")
    write_train_data("./data/train_data.txt", pair_list)
    cut_word("./data/train_data.txt", "./data/train_data_cut.txt")
    chars = all_chars()
    with codecs.open('./data/all_chars.json', 'w', encoding='utf-8') as f:
        chars = {i: j for i, j in chars.items() if j >= min_count}
        id2char = {i + 2: j for i, j in enumerate(chars)}  # padding: 0, unk: 1
        char2id = {j: i for i, j in id2char.items()}
        json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)
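
Each line of the generated train_data.txt is a tab-separated triple question1<TAB>question2<TAB>label, where 1 marks an equivalent pair and 0 a non-equivalent one. A quick way to check the label balance (a minimal sketch, assuming the files above have already been generated):

from collections import Counter

with open("./data/train_data.txt", encoding="utf-8") as f:
    labels = [line.rstrip("\n").split("\t")[2] for line in f if line.strip()]
print(Counter(labels))  # e.g. Counter({'0': ..., '1': ...})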

Discrimination model (a BERT sentence-pair classifier built with keras_bert):


import codecs
import numpy as np
import pandas as pd
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
import keras.backend as K
from sklearn.metrics import f1_score
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
from tqdm import tqdm

pd.set_option('display.max_columns', None)
maxlen=100

def save_data(is_cut=False):
    x_data = []
    y_data = []
    with open('./data/train_data.txt', "r", encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            tmp_line = line.split('\t')
            tmp_x = (tmp_line[0].strip(), tmp_line[1].strip())
            x_data.append(tmp_x)
            y_data.append(int(tmp_line[2].strip()))
    return x_data, y_data


# paths to the pretrained Chinese BERT (wwm ext) model
config_path = '/home/pywork/kf7899/datas/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/pywork/kf7899/datas/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/home/pywork/kf7899/datas/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt'

token_dict = {}

class_numer = 2

with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # map whitespace to the untrained [unused1] token
            else:
                R.append('[UNK]')  # everything else maps to [UNK]
        return R
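# Illustrative usage (the tokenizer instance is created near the bottom of the
# script):
#   token_ids, segment_ids = tokenizer.encode(first="question A", second="question B")
# keras_bert returns the token ids plus segment ids (0 for the first sentence,
# 1 for the second); these feed the two model inputs below.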


def seq_padding(X, padding=0, maxlen=None):
    if maxlen is None:
        L = [len(x) for x in X]
        ML = max(L)
    else:
        ML = maxlen
    return np.array([
        np.concatenate([x[:ML], [padding] * (ML - len(x))]) if len(x) < ML else x[:ML] for x in X
    ])
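# e.g. seq_padding([[1, 2, 3], [4, 5]]) pads the shorter sequence with zeros,
# giving array([[1, 2, 3], [4, 5, 0]])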


class data_generator:
    def __init__(self, data, max_len, batch_size=16):
        self.data = data
        self.batch_size = batch_size
        self.max_len = max_len
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text1 = d[0][0]
                text2 = d[0][1]
                x1, x2 = tokenizer.encode(first=text1, second=text2)
                y = [d[1]]
                X1.append(x1)
                X2.append(x2)
                Y.append(y)
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    # print(X1.shape, X2.shape)  # debug output
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []


# focal_loss: a loss function for class-imbalanced training data
def focal_loss_fixed(y_true, y_pred):
    # y_pred = K.sigmoid(y_pred)
    gamma = 2.0
    alpha = 0.25
    epsilon = 1e-6
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
    return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1 + epsilon))-K.sum((1-alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0  + epsilon))
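# Illustrative only (the models below compile with binary_crossentropy): to
# train with focal loss instead, pass the function to compile, e.g.
#   model.compile(loss=focal_loss_fixed, optimizer=Adam(1e-5), metrics=['accuracy'])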


def trian_model_bert():
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

    for l in bert_model.layers:
        l.trainable = True

    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))

    x = bert_model([x1_in, x2_in])
    print(x.shape)
    x = Lambda(lambda x: x[:, 0])(x)  # take only the [CLS] vector for classification
    p = Dense(1, activation='sigmoid')(x)

    model = Model([x1_in, x2_in], p)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        metrics=['accuracy']
    )
    model.summary()
    return model

# Masked pooling helpers: suppress the padded positions before pooling.
def seq_maxpool(x):
    """seq has shape [None, seq_len, s_size], mask has shape [None, seq_len, 1].
    Push the masked (padded) positions to a large negative value, then take the
    max over the sequence dimension.
    """
    seq, mask = x
    seq -= (1 - mask) * 1e10
    return K.max(seq, 1)

def seq_avgpool(x):
    """seq has shape [None, seq_len, s_size], mask has shape [None, seq_len, 1].
    Zero out the masked (padded) positions, then average over the valid ones.
    """
    seq, mask = x
    return K.sum(seq * mask, 1) / (K.sum(mask, 1) + 1e-10)


def trian_model_bertlstmgru():
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

    for l in bert_model.layers:
        l.trainable = True

    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))

    x1, x2 = x1_in, x2_in
    mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1)
    x = bert_model([x1, x2])
    t = Dropout(0.1)(x)
    t = Bidirectional(LSTM(80, recurrent_dropout=0.1, return_sequences=True))(t)
    t = Bidirectional(GRU(80, recurrent_dropout=0.1, return_sequences=True))(t)
    t = Dropout(0.4)(t)
    t = Dense(160)(t)
    # t_maxpool = Lambda(seq_maxpool)([t, mask])
    # t_maxpool = MaxPool1D()(t)
    # t_avgpool = Lambda(seq_avgpool)([t, mask])
    # t_ = concatenate([t_maxpool, t_avgpool], axis=-1)
    print(x.shape,  t.shape)
    # x = Lambda(lambda x: x[:, 0])(x)  # take only the [CLS] vector for classification
    c = concatenate([x, t], axis=-1)
    c = Lambda(lambda c: c[:, 0])(c)
    p = Dense(1, activation='sigmoid')(c)

    model = Model([x1, x2], p)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(2e-5),  # use a sufficiently small learning rate
        metrics=['accuracy']
    )
    model.summary()
    return model

def get_mode_type(model_type="trian_model_bertlstmgru"):
    trian_model = None
    if model_type == "trian_model_bert":
        trian_model = trian_model_bert()
    elif model_type == "trian_model_bertlstmgru":
        trian_model = trian_model_bertlstmgru()
    return trian_model, model_type


def fit_mode(train_data, valid_data, max_len, i, epochs, batch_size):
    print(train_data[0])
    train_D = data_generator(train_data, max_len, batch_size)
    valid_D = data_generator(valid_data, max_len, batch_size)
    model, model_type = get_mode_type()
    model_weight_filepath = "./model/"+model_type+ str(i) + ".weights"
    earlystopping = EarlyStopping(monitor='val_acc', verbose=1, patience=3)
    reducelronplateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.5, patience=2)
    checkpoint = ModelCheckpoint(filepath=model_weight_filepath, monitor='val_acc',
                                 verbose=1, save_best_only=True, save_weights_only=True, mode='max', period=1)

    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=epochs,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=[earlystopping, reducelronplateau, checkpoint])
    del model
    K.clear_session()


def test(outfile="./data/dev_result.csv"):
    pbar = tqdm()
    dev_set = pd.read_csv("./data/dev_set.csv", encoding='utf-8', delimiter='\t')
    result = []
    result1 = []
    for index, row in dev_set.iterrows():
        content1 = str(row["question1"])
        content2 = str(row["question2"])
        x1, x2 = tokenizer.encode(first=content1, second=content2)
        x1 = x1[:maxlen]
        x2 = x2[:maxlen]
        tmp_result = model.predict([np.array([x1]), np.array([x2])])
        result_label = tmp_result[0][0]
        result1.append(result_label)  # keep the raw probability to inspect the score distribution
        if result_label > 0.5:
            result_label = 1
        else:
            result_label = 0
        # print(result_label)  # per-sample debug output
        result.append(int(result_label))
        pbar.update(1)
    dev_set['label'] = pd.DataFrame(result, columns=['label'])
    dev_set['label'] = dev_set['qid'].astype(str) + "\t" + dev_set['label'].astype(str)
    dev_set = dev_set.drop(columns=['qid', 'question1', 'question2'])
    dev_set.to_csv(outfile, header=False, index=False)
    pbar.close()
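# The resulting dev_result.csv holds one "qid<TAB>predicted label" line per dev
# pair, with no header or index column.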

tokenizer = OurTokenizer(token_dict)
train_x, train_y = save_data()
# train_y = [str(i) for i in train_y]


if __name__ == '__main__':
    # repeat the random shuffle-and-split several times (k-fold style)
    for i in range(0, 5):
        data = []
        for d, y in zip(train_x, train_y):
            data.append((d, y))
        # split into training and validation sets at a 9:1 ratio
        random_order = list(range(len(data)))
        np.random.shuffle(random_order)
        train_data = [data[j] for k, j in enumerate(random_order) if k % 10 != 0]
        valid_data = [data[j] for k, j in enumerate(random_order) if k % 10 == 0]
        fit_mode(train_data, valid_data, 100, i, 5, 16)
else:
    model, model_type = get_mode_type()
    model.load_weights("./model/trian_model_bertlstmgru0.weights")
    test()
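
Running the script directly trains five shuffled 9:1 splits and saves the best weights of each run under ./model/; importing the module instead takes the else branch, which rebuilds the model, loads the saved weights, and writes the dev-set predictions to ./data/dev_result.csv.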

 
