8.5 Advanced Ranking Models: Wide&Deep
Learning objectives
- Objective
    - None
- Application
    - None
8.5.1 Wide&Deep
- Input features for the Wide part:
    - Discrete (categorical) features
    - Cross combinations of the discrete features
    - Continuous-valued features are not fed into the Wide part; this is how it is used in the W&D paper.
- Input features for the Deep part:
    - Raw input plus embedding processing
    - The non-continuous (categorical) features are embedded, i.e., multiplied by an embedding matrix. The TensorFlow API is tf.feature_column.embedding_column, with trainable=True by default.
    - Continuous-valued features are compressed into [0, 1] according to their cumulative distribution function P(X ≤ x) (a normalization sketch follows this list).
Note on training: the Wide part is trained with FTRL and the Deep part with AdaGrad (a sketch of the corresponding optimizer arguments follows the constructor example below).
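The CDF-based scaling of continuous features described above can be computed offline before the features are written to TFRecords. A minimal sketch (not part of the original pipeline; the array name raw_values is hypothetical), using numpy:

import numpy as np

def cdf_normalize(raw_values):
    # Map each value x to its empirical CDF P(X <= x), so every output lies in [0, 1]
    raw_values = np.asarray(raw_values, dtype=np.float64)
    ranks = np.searchsorted(np.sort(raw_values), raw_values, side='right')
    return ranks / len(raw_values)

# Skewed raw values are spread roughly uniformly over [0, 1]
print(cdf_normalize([1, 2, 2, 10, 1000]))  # -> [0.2 0.6 0.6 0.8 1. ]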
- The TensorFlow API for Wide&Deep is tf.estimator.DNNLinearCombinedClassifier:

estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir="",                          # directory for checkpoints and summaries
    linear_feature_columns=wide_columns,   # feature columns for the wide (linear) part
    dnn_feature_columns=deep_columns,      # feature columns for the deep (DNN) part
    dnn_hidden_units=[])                   # layer sizes of the DNN part, e.g. [1024, 512, 256]
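The FTRL/AdaGrad split noted above maps directly onto the estimator's linear_optimizer and dnn_optimizer arguments (these strings are in fact the defaults). A minimal sketch, assuming wide_columns and deep_columns are already built:

estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir="./ckpt/wide_and_deep",
    linear_feature_columns=wide_columns,   # wide (linear) part
    dnn_feature_columns=deep_columns,      # deep (DNN) part
    dnn_hidden_units=[1024, 512, 256],
    linear_optimizer='Ftrl',               # FTRL trains the wide part
    dnn_optimizer='Adagrad')               # AdaGrad trains the deep part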
Rules for the feature columns passed to tf.estimator models
- LinearClassifier and LinearRegressor: accept all types of feature columns.
- DNNClassifier and DNNRegressor: accept only dense columns; other column types must be wrapped in an indicator_column or embedding_column (a wrapping sketch follows this list).
- DNNLinearCombinedClassifier and DNNLinearCombinedRegressor:
    - The linear_feature_columns argument accepts any type of feature column.
    - The dnn_feature_columns argument accepts only dense columns.
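For example, a raw categorical column can go straight into linear_feature_columns, but it must be wrapped before it can appear in dnn_feature_columns. A minimal sketch of the two wrappers:

channel_id = tf.feature_column.categorical_column_with_identity('channel_id', num_buckets=25)

# Either wrapper turns the categorical column into a dense column the DNN can consume
channel_onehot = tf.feature_column.indicator_column(channel_id)                # one-hot encoding
channel_embed = tf.feature_column.embedding_column(channel_id, dimension=25)   # learned dense vector

wide_columns = [channel_id]      # linear part accepts the raw categorical column
deep_columns = [channel_embed]   # DNN part accepts only dense columns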
Code:
import tensorflow as tf


class WDL(object):
    """Wide&Deep model"""

    def __init__(self):
        pass

    @staticmethod
    def read_ctr_records():
        # Parsing function: each input is a serialized tf.train.Example
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            # Feature layout: 1 channel_id, 100 article_vector, 10 user_weights, 10 article_weights
            # channel_id is categorical; the 100-dim article vector and the two 10-dim
            # weight vectors are each averaged into a single continuous feature
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)
            vector = tf.reduce_mean(tf.slice(feature, [0, 1], [1, 100]), axis=1)
            user_weights = tf.reduce_mean(tf.slice(feature, [0, 101], [1, 10]), axis=1)
            article_weights = tf.reduce_mean(tf.slice(feature, [0, 111], [1, 10]), axis=1)

            label = tf.cast(parsed_features['label'], tf.float32)

            # Build the name -> tensor feature dict
            FEATURE_COLUMNS = ['channel_id', 'vector', 'user_weights', 'article_weights']
            tensor_list = [channel_id, vector, user_weights, article_weights]
            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))
            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./train_ctr_201905.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat()
        return dataset

    def build_estimator(self):
        """Build the feature columns and the Wide&Deep estimator
        :return: tf.estimator.DNNLinearCombinedClassifier
        """
        # Categorical column
        channel_id = tf.feature_column.categorical_column_with_identity('channel_id', num_buckets=25)
        # Continuous (numeric) columns
        vector = tf.feature_column.numeric_column('vector')
        user_weights = tf.feature_column.numeric_column('user_weights')
        article_weights = tf.feature_column.numeric_column('article_weights')

        wide_columns = [channel_id]

        # embedding_column turns the categorical column into a dense column for the deep part
        deep_columns = [tf.feature_column.embedding_column(channel_id, dimension=25),
                        vector, user_weights, article_weights]

        estimator = tf.estimator.DNNLinearCombinedClassifier(model_dir="./ckpt/wide_and_deep",
                                                             linear_feature_columns=wide_columns,
                                                             dnn_feature_columns=deep_columns,
                                                             dnn_hidden_units=[1024, 512, 256])
        return estimator


if __name__ == '__main__':
    wdl = WDL()
    estimator = wdl.build_estimator()
    # The dataset repeats indefinitely, so bound training and evaluation with steps
    estimator.train(input_fn=wdl.read_ctr_records, steps=10000)
    eval_result = estimator.evaluate(input_fn=wdl.read_ctr_records, steps=100)
    print(eval_result)
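Once trained, the estimator can also score samples with estimator.predict. A hedged sketch (reusing the training input_fn purely for illustration; 'logistic' and 'probabilities' are the prediction keys produced by the canned binary-classification head):

import itertools

predictions = estimator.predict(input_fn=wdl.read_ctr_records)
# The dataset repeats forever, so only pull a few predictions for inspection
for pred in itertools.islice(predictions, 5):
    print(pred['logistic'], pred['probabilities'])  # predicted CTR and per-class probabilities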
8.5.2 Comparing the three feature-processing versions
Feature processing | baseline | 1 discrete feature, averaged article vector, averaged user weights, averaged article weights (version 1) | 1 discrete feature, one 111-dim continuous feature (version 2) | 1 discrete feature, 100-dim article vector, 10 article weights, 10 user weights (version 3)
---|---|---|---|---
accuracy | 0.9051438053097345 | 0.9046435 | 0.9046435 | 0.9046435
auc | 0.719274521004087 | 0.57850575 | 0.5896939 | 0.62383443
Summary of the comparison:
- The 黑馬頭條 dataset contains too few discrete features, so the baseline model already handles the task well.
- As the number of discrete or continuous features grows, the WDL model can bring some improvement in accuracy or AUC.
Feature-processing input functions and model construction for the three versions
- Version 1:
    @staticmethod
    def read_ctr_records_v1():
        # Parsing function: each input is a serialized tf.train.Example
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            # Feature layout: 1 channel_id, 100 article_vector, 10 user_weights, 10 article_weights
            # channel_id is categorical; the article vector and the two weight vectors
            # are each averaged into a single continuous feature
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)
            vector = tf.reduce_mean(tf.slice(feature, [0, 1], [1, 100]), axis=1)
            user_weights = tf.reduce_mean(tf.slice(feature, [0, 101], [1, 10]), axis=1)
            article_weights = tf.reduce_mean(tf.slice(feature, [0, 111], [1, 10]), axis=1)

            label = tf.cast(parsed_features['label'], tf.float32)

            # Build the name -> tensor feature dict
            FEATURE_COLUMNS = ['channel_id', 'vector', 'user_weights', 'article_weights']
            tensor_list = [channel_id, vector, user_weights, article_weights]
            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))
            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./ctr_train_20190706.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat(100)
        return dataset

    def build_estimator(self):
        """Build the feature columns and the Wide&Deep estimator
        :return:
        """
        # Feature columns
        channel_id = tf.feature_column.categorical_column_with_identity('channel_id', num_buckets=25)
        vector = tf.feature_column.numeric_column('vector')
        user_weights = tf.feature_column.numeric_column('user_weights')
        article_weights = tf.feature_column.numeric_column('article_weights')

        # Wide side
        wide_columns = [channel_id]

        # Deep side
        deep_columns = [
            tf.feature_column.embedding_column(channel_id, dimension=25),
            vector,
            user_weights,
            article_weights
        ]

        # Build the model
        estimator = tf.estimator.DNNLinearCombinedClassifier(model_dir="./tmp/ckpt/wide_and_deep",
                                                             linear_feature_columns=wide_columns,
                                                             dnn_feature_columns=deep_columns,
                                                             dnn_hidden_units=[256, 128, 64])
        return estimator
- Version 2:
    @staticmethod
    def read_ctr_records_v2():
        # Parsing function: each input is a serialized tf.train.Example
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            # Keep channel_id as a categorical feature and feed the whole raw
            # vector as one continuous feature
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)

            label = tf.cast(parsed_features['label'], tf.float32)

            # Build the name -> tensor feature dict
            FEATURE_COLUMNS = ['channel_id', 'feature']
            tensor_list = [channel_id, feature]
            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))
            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./ctr_train_20190706.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat(100)
        return dataset

    def build_estimator_v2(self):
        """Build the feature columns and the Wide&Deep estimator
        :return:
        """
        # Feature columns
        channel_id = tf.feature_column.categorical_column_with_identity('channel_id', num_buckets=25)
        feature = tf.feature_column.numeric_column('feature', shape=[1, 121])

        # Wide side
        wide_columns = [channel_id]

        # Deep side
        deep_columns = [
            tf.feature_column.embedding_column(channel_id, dimension=25),
            feature
        ]

        # Build the model
        estimator = tf.estimator.DNNLinearCombinedClassifier(model_dir="./tmp/ckpt/wide_and_deep_v2",
                                                             linear_feature_columns=wide_columns,
                                                             dnn_feature_columns=deep_columns,
                                                             dnn_hidden_units=[256, 128, 64])
        return estimator
- Version 3:
    @staticmethod
    def read_ctr_records_v3():
        # Parsing function: each input is a serialized tf.train.Example
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            # Keep channel_id as a categorical feature and keep the article vector
            # and the two weight vectors as full-length continuous features
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)
            vector = tf.slice(feature, [0, 1], [1, 100])
            user_weights = tf.slice(feature, [0, 101], [1, 10])
            article_weights = tf.slice(feature, [0, 111], [1, 10])

            label = tf.cast(parsed_features['label'], tf.float32)

            # Build the name -> tensor feature dict
            FEATURE_COLUMNS = ['channel_id', 'vector', 'user_weights', 'article_weights']
            tensor_list = [channel_id, vector, user_weights, article_weights]
            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))
            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./ctr_train_20190706.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat(100)
        return dataset

    def build_estimator_v3(self):
        """Build the feature columns and the Wide&Deep estimator
        :return:
        """
        # Feature columns
        channel_id = tf.feature_column.categorical_column_with_identity('channel_id', num_buckets=25)
        vector = tf.feature_column.numeric_column('vector', shape=[1, 100])
        user_weights = tf.feature_column.numeric_column('user_weights', shape=[1, 10])
        article_weights = tf.feature_column.numeric_column('article_weights', shape=[1, 10])

        # Wide side
        wide_columns = [channel_id]

        # Deep side
        deep_columns = [
            tf.feature_column.embedding_column(channel_id, dimension=25),
            vector,
            user_weights,
            article_weights
        ]

        # Build the model
        estimator = tf.estimator.DNNLinearCombinedClassifier(model_dir="./tmp/ckpt/wide_and_deep_v3",
                                                             linear_feature_columns=wide_columns,
                                                             dnn_feature_columns=deep_columns,
                                                             dnn_hidden_units=[256, 128, 64])
        return estimator
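To reproduce a comparison like the table in 8.5.2, each version's input function can be paired with its estimator and run through the same train/evaluate loop as the main example. A minimal sketch, assuming the v1/v2/v3 methods above all live on the same WDL class (accuracy and auc are among the default evaluation metrics):

if __name__ == '__main__':
    wdl = WDL()
    versions = [
        (wdl.read_ctr_records_v1, wdl.build_estimator()),     # version 1
        (wdl.read_ctr_records_v2, wdl.build_estimator_v2()),  # version 2
        (wdl.read_ctr_records_v3, wdl.build_estimator_v3()),  # version 3
    ]
    for input_fn, estimator in versions:
        estimator.train(input_fn=input_fn, steps=10000)
        print(estimator.evaluate(input_fn=input_fn, steps=100))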