1 賽事背景
問答系統中包括三個主要的部分:問題理解,信息檢索和答案抽取。而問題理解是問答系統的第一部分也是非常關鍵的一部分。問題理解有非常廣泛的應用,如重複評論識別、相似問題識別等。
重複問題檢測是一個常見的文本挖掘任務,在很多實際問答社區都有相應的應用。重複問題檢測可以方便進行問題的答案聚合,以及問題答案推薦,自動QA等。由於中文詞語的多樣性和靈活性,本賽題需要選手構建一個重複問題識別算法。
2 賽事任務
本次賽題希望參賽選手對兩個問題完成相似度打分。
訓練集:約5千條問題對和標籤。若兩個問題是相同的問題,標籤爲1;否則爲0。
測試集:約5千條問題對,需要選手預測標籤。
3 評審規則
1. 數據說明
訓練集給定問題對和標籤,使用\t進行分隔。測試集給定問題對,使用\t進行分隔。
eg:世界上什麼東西最恐怖 世界上最恐怖的東西是什麼? 1
解析:“世界上什麼東西最恐怖”與“世界上最恐怖的東西是什麼”問題相同,故是重複問題,標籤爲1。
2. 評估指標
本次競賽的評價標準採用準確率指標,最高分爲1。計算方法參考https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html,評估代碼參考:
from sklearn.metrics import accuracy_score

# Toy example: 2 of the 4 predictions match the ground truth -> accuracy 0.5.
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
accuracy_score(y_true=y_true, y_pred=y_pred)
4 特徵工程
1 基礎特徵
# --- Text length features ---
data['q1_len'] = data['q1'].astype(str).str.len()
data['q2_len'] = data['q2'].astype(str).str.len()
# --- Length difference features: signed diff, absolute diff, ratios ---
data['q1q2_len_diff'] = data['q1_len'] - data['q2_len']
data['q1q2_len_diff_abs'] = (data['q1_len'] - data['q2_len']).abs()
data['q1q2_rate'] = data['q1_len'] / data['q2_len']
data['q2q1_rate'] = data['q2_len'] / data['q1_len']
# --- Special-symbol features: does the question end with a '?' ---
data['q1_end_special'] = data['q1'].str.endswith('?').astype(int)
data['q2_end_special'] = data['q2'].str.endswith('?').astype(int)
2 共現字特徵
# Number of distinct characters the two questions have in common.
data['comm_q1q2char_nums'] = data.apply(
    lambda row: len(set(row['q1']).intersection(row['q2'])), axis=1)
# Position of each q1 character inside q2.
def char_match_pos(q1, q2, pos_i, max_match_len=25):
    """Locate character ``q1[pos_i]`` inside a prefix of ``q2``.

    Only the first ``max_match_len`` characters of ``q2`` are searched
    (the original hard-coded 25; it is now a backward-compatible
    parameter).

    Returns
    -------
    int
        The 1-based position of the first match in ``q2``;
        0 if the character does not occur in the searched prefix
        (including when ``q2`` is empty — the original code raised
        UnboundLocalError in that case because ``q_pos`` was never set);
        -1 if ``pos_i`` is out of range for ``q1``.
    """
    if pos_i >= len(q1):
        return -1  # q1 has no character at this position
    search_len = min(len(q2), max_match_len)
    for pos_j in range(search_len):
        if q1[pos_i] == q2[pos_j]:
            return pos_j + 1  # matched: record 1-based position
    return 0  # not found within the searched prefix
# Positional-match features for the first 8 characters of q1.
for pos_i in range(8):
    col_name = 'q1_pos_' + str(pos_i + 1)
    data[col_name] = data.apply(
        lambda row, i=pos_i: char_match_pos(row['q1'], row['q2'], i),
        axis=1).astype(np.int8)
這裏也可以用結巴分詞先對問題分詞,將上述特徵從“字”粒度改成“詞”粒度。
3 距離特徵
print("===========距離特徵 =============")
# String-distance functions applied to the question pair.
sim_func_dict = {"jaccard": distance.jaccard,
                 "sorensen": distance.sorensen,
                 "levenshtein": distance.levenshtein,
                 "ratio": Levenshtein.ratio
                 }
for sim_func in tqdm(sim_func_dict, desc="距離特徵"):
    func = sim_func_dict[sim_func]
    # Distance on the full strings.
    data[sim_func] = data.apply(lambda row: func(row["q1"], row["q2"]), axis=1)
    # Distances on truncated prefixes: (q1 prefix length, q2 prefix length).
    qt = [[3, 3], [3, 5], [5, 5], [5, 10], [10, 10], [10, 15], [15, 15], [15, 25]]
    for q_cut, t_cut in qt:
        # NOTE(review): this guard skips BOTH [3, 3] and [3, 5] for
        # levenshtein (any pair whose first element is 3) — confirm that
        # skipping [3, 5] is intended and not just [3, 3].
        if sim_func == "levenshtein" and q_cut == 3:
            continue
        col = sim_func + '_q' + str(q_cut) + '_t' + str(t_cut)
        data[col] = data.apply(
            lambda row: func(row["q1"][:q_cut], row["q2"][:t_cut]), axis=1)
4 文本向量匹配特徵
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, \
minkowski, braycurtis, correlation, chebyshev, jensenshannon, mahalanobis, \
seuclidean, sqeuclidean
from tqdm import tqdm
tqdm.pandas()
# Word-vector similarity between two tokenized questions.
def get_w2v(query, title, num, model=None):
    """Compute a distance between the averaged word2vec vectors of two
    token lists.

    Parameters
    ----------
    query, title : iterable of str
        Tokenized questions; tokens missing from the embedding
        vocabulary are skipped.
    num : int
        Selects the distance: 1=cosine, 2=canberra/dim, 3=cityblock/dim,
        4=euclidean, 5=braycurtis, 6=minkowski, 7=correlation,
        8=chebyshev, 9=jensenshannon, 10=mahalanobis, 11=seuclidean,
        12=sqeuclidean.
    model : optional
        Word2vec model exposing ``.wv``; defaults to the module-level
        ``w2v_model`` for backward compatibility.

    Returns
    -------
    float or None
        The distance; 0 on any computation error (mahalanobis and
        seuclidean always hit this path because they require extra
        arguments not supplied here); None for an unknown ``num``.
    """
    if model is None:
        model = w2v_model  # module-level model, as in the original code

    def _mean_vector(words):
        # Average the embeddings of in-vocabulary words.
        # NOTE: embedding size of 100 is assumed throughout this script.
        acc = np.zeros(100)
        hits = 0
        for w in words:
            if w in model.wv:
                acc += model.wv[w]
                hits += 1
        # Bug fix: the original assigned the zero vector when hits == 0
        # but then unconditionally divided by hits, producing NaNs.
        if hits == 0:
            return acc.tolist()
        return (acc / hits).tolist()

    query_vec = _mean_vector(query)
    title_vec = _mean_vector(title)
    dim = len(query_vec)

    # Dispatch table replacing the original 12-way if-chain.
    metrics = {
        1: lambda: cosine(query_vec, title_vec),
        2: lambda: canberra(query_vec, title_vec) / dim,
        3: lambda: cityblock(query_vec, title_vec) / dim,
        4: lambda: euclidean(query_vec, title_vec),
        5: lambda: braycurtis(query_vec, title_vec),
        6: lambda: minkowski(query_vec, title_vec),
        7: lambda: correlation(query_vec, title_vec),
        8: lambda: chebyshev(query_vec, title_vec),
        9: lambda: jensenshannon(query_vec, title_vec),
        10: lambda: mahalanobis(query_vec, title_vec),  # needs VI -> raises -> 0
        11: lambda: seuclidean(query_vec, title_vec),   # needs V -> raises -> 0
        12: lambda: sqeuclidean(query_vec, title_vec),
    }
    metric = metrics.get(num)
    if metric is None:
        return None  # original behavior for an unsupported num
    try:
        return metric()
    except Exception:
        return 0  # original behavior: swallow errors and emit 0
# Word-vector similarity features: feature column -> metric id for get_w2v.
w2v_feature_ids = {
    'vec_cosine': 1,
    'vec_canberra': 2,
    'vec_cityblock': 3,
    'vec_euclidean': 4,
    'vec_braycurtis': 5,
    'vec_minkowski': 6,
    'vec_correlation': 7,
    'vec_chebyshev': 8,
    'vec_jensenshannon': 9,
    'vec_mahalanobis': 10,
    'vec_seuclidean': 11,
    'vec_sqeuclidean': 12,
}
for feat, metric_id in w2v_feature_ids.items():
    data[feat] = data.progress_apply(
        lambda row, m=metric_id: get_w2v(row['q1_words_list'],
                                         row['q2_words_list'], m),
        axis=1)
# Downcast the same six columns the original code converted to float32.
for feat in ('vec_cosine', 'vec_canberra', 'vec_cityblock',
             'vec_euclidean', 'vec_braycurtis', 'vec_correlation'):
    data[feat] = data[feat].astype('float32')
5 向量特徵
def w2v_sent2vec(words, model=None):
    """Return the L2-normalized average word2vec vector of a sentence.

    Tokens missing from the vocabulary are skipped.  ``model`` defaults
    to the module-level ``w2v_model`` (backward compatible).

    Returns a list of float32 values.  An all-zero vector is returned
    when no token is in the vocabulary or the summed vector has zero
    norm — the original code divided by zero in those cases, producing
    NaNs (and, for an empty token list, a scalar instead of a list).
    """
    if model is None:
        model = w2v_model
    vectors = []
    for word in words:
        try:
            vectors.append(model.wv[word])
        except KeyError:  # out-of-vocabulary token
            continue
    if not vectors:
        # Bug fix: np.array([]).sum(axis=0) is a scalar and normalizing
        # it yields NaN.  100 matches the embedding size assumed by the
        # q1_vec_*/q2_vec_* feature names elsewhere in this script.
        return np.zeros(100, dtype=np.float32).tolist()
    v = np.asarray(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Bug fix: avoid 0/0 -> NaN when the summed vector is zero.
        return np.zeros_like(v, dtype=np.float32).tolist()
    return (v / norm).astype(np.float32).tolist()
# Expand the 100-dim sentence vector of each question into 100 columns.
for prefix in ('q1', 'q2'):
    fea_names = ['{}_vec_{}'.format(prefix, i) for i in range(100)]
    data[fea_names] = data.progress_apply(
        lambda row, col=prefix + '_words_list': w2v_sent2vec(row[col]),
        result_type='expand', axis=1)
5 模型訓練
# LightGBM hyper-parameters for the duplicate-question binary task.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,  # large leaf minimum to limit overfitting on ~5k rows
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # L2 regularization; LARGER values regularize more strongly (the original comment had this backwards)
    'min_gain_to_split': 0.2,
}
# K-fold CV: collect out-of-fold predictions for offline validation and
# average the test-set predictions over folds.
oof = np.zeros(len(X))              # out-of-fold prediction per training row
prediction = np.zeros(len(X_test))  # fold-averaged test predictions
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    # NOTE(review): LGBMRegressor is used with objective='binary' (from
    # `params`), so predict() returns probabilities despite the regressor
    # API; LGBMClassifier + predict_proba would be the conventional choice.
    model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1)
    # NOTE(review): `verbose`/`early_stopping_rounds` as fit() kwargs were
    # removed in lightgbm 4.x (callbacks are used instead) — this code
    # assumes an older lightgbm.
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='binary_logloss',
              verbose=50, early_stopping_rounds=200)
    y_pred_valid = model.predict(X_valid)
    # Predict the test set with the best iteration found by early stopping.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred
# `n_fold` is defined elsewhere; turn the summed predictions into a mean.
prediction /= n_fold
線下分數爲
from sklearn.metrics import accuracy_score

# Binarize the out-of-fold probabilities at 0.5 and compare them with the
# training labels to get the offline accuracy.
y_pred = oof > 0.5
score = accuracy_score(y_pred, train['label'].values)
score
0.839,線上0.8406,線上和線下比較吻合