# Deep&Wide推薦系統

## Wide Component

Wide部分是一個廣義線性模型，如圖一左邊所示，形如$y = w^Tx+b$，其中y是預測值，x是d維的特徵向量，w是模型參數，b是偏置。特徵包括原始輸入特徵和變換特徵，其中最重要的變換特徵之一是叉積變換，其定義如下：
$\phi_k(x)=\prod_{i=1}^{d}x_i^{c_{ki}},\quad c_{ki} \in \left\lbrace 0,1 \right\rbrace$

## Deep Component

Deep部分是一個前饋神經網絡，如圖一右邊所示。對於類別特徵，原始的輸入是特徵字符串，這些稀疏的高維類別特徵首先被轉化爲低維稠密的實值向量，通常被稱爲嵌入向量，維度通常爲O(10)到O(100)。嵌入向量先隨機初始化，再在訓練過程中學習。這些低維嵌入向量然後被送入神經網絡的隱層，每個隱層的計算如下：
$a^{(l+1)}=f(W^{(l)}a^{(l)}+b^{(l)})$

## Deep&Wide模型

$P(Y=1|x)=\sigma(w^T_{wide}[x,\phi(x)]+w^T_{deep}a^{(l_f)}+b)$

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# Unless required by applicable law or agreed to in writing, software
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# ==============================================================================
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tempfile

from six.moves import urllib

import pandas as pd
import tensorflow as tf

# Names given to the columns of the input CSV files, in file order; the list
# is passed as ``names=`` when the data is read.
COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

# Name of the 0/1 target column that is derived from "income_bracket".
LABEL_COLUMN = "label"

# Columns that are fed to the model as sparse (string-valued) tensors.
CATEGORICAL_COLUMNS = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country",
]

# Columns that are fed to the model as dense constant tensors.
CONTINUOUS_COLUMNS = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

def _resolve_data_file(path):
    """Return ``path`` when it is set, else the name of a new temporary file.

    Args:
      path: A file path, or a falsy value (``None`` / ``""``) when the caller
        did not supply one.

    Returns:
      ``path`` unchanged, or the name of a freshly created temporary file.
    """
    if path:
        return path
    # delete=False keeps the file on disk after it is closed, so the returned
    # name stays usable; the ``with`` block guarantees the handle is closed.
    # NOTE(review): the temporary file is left empty -- nothing visible in
    # this file writes to it. Presumably a download step belongs here (the
    # module imports ``urllib``); confirm and restore it.
    with tempfile.NamedTemporaryFile(delete=False) as placeholder:
        return placeholder.name


# NOTE(review): the ``def`` line of this function was missing, which left a
# bare ``return`` at module level (a SyntaxError). The parameters follow from
# the names the body reads; the function name is an assumption -- confirm.
def maybe_download(train_data, test_data):
    """Return the training and test file names to use.

    Args:
      train_data: Path to the training data, or a falsy value.
      test_data: Path to the test data, or a falsy value.

    Returns:
      A ``(train_file_name, test_file_name)`` tuple. Each element is the
      given path when one was supplied, otherwise the name of a newly created
      temporary file.
    """
    train_file_name = _resolve_data_file(train_data)
    test_file_name = _resolve_data_file(test_data)
    return train_file_name, test_file_name

def build_estimator(model_dir, model_type):
    """Create the classifier selected by ``model_type``.

    Args:
      model_dir: Directory passed to the estimator as ``model_dir``.
      model_type: ``"wide"`` builds a ``LinearClassifier`` and ``"deep"`` a
        ``DNNClassifier``; every other value builds a
        ``DNNLinearCombinedClassifier``.

    Returns:
      The constructed ``tf.contrib.learn`` estimator.
    """
    layers = tf.contrib.layers
    learn = tf.contrib.learn

    def hashed(name, bucket_count):
        # Sparse string column whose values are hashed into ``bucket_count``
        # buckets.
        return layers.sparse_column_with_hash_bucket(
            name, hash_bucket_size=bucket_count)

    # Sparse (categorical) base columns.
    gender = layers.sparse_column_with_keys(
        column_name="gender", keys=["female", "male"])
    education = hashed("education", 1000)
    relationship = hashed("relationship", 100)
    workclass = hashed("workclass", 100)
    occupation = hashed("occupation", 1000)
    native_country = hashed("native_country", 1000)

    # Real-valued base columns.
    age = layers.real_valued_column("age")
    education_num = layers.real_valued_column("education_num")
    capital_gain = layers.real_valued_column("capital_gain")
    capital_loss = layers.real_valued_column("capital_loss")
    hours_per_week = layers.real_valued_column("hours_per_week")

    # Age is additionally bucketized so it can take part in the wide model
    # and in a feature cross.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide side: the sparse columns plus three crossed columns.
    crossed = [
        layers.crossed_column(
            [education, occupation], hash_bucket_size=int(1e4)),
        layers.crossed_column(
            [age_buckets, education, occupation], hash_bucket_size=int(1e6)),
        layers.crossed_column(
            [native_country, occupation], hash_bucket_size=int(1e4)),
    ]
    wide_columns = [
        gender, native_country, education, occupation, workclass,
        relationship, age_buckets,
    ] + crossed

    # Deep side: an 8-dimensional embedding per sparse column, followed by
    # the raw real-valued columns.
    embedded = [
        layers.embedding_column(sparse_column, dimension=8)
        for sparse_column in (workclass, education, gender, relationship,
                              native_country, occupation)
    ]
    deep_columns = embedded + [
        age, education_num, capital_gain, capital_loss, hours_per_week,
    ]

    if model_type == "wide":
        return learn.LinearClassifier(
            model_dir=model_dir, feature_columns=wide_columns)
    if model_type == "deep":
        return learn.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50])
    # Any other value selects the combined wide-and-deep model.
    return learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50])

def input_fn(df):
    """Turn a DataFrame into the ``(features, label)`` pair for an estimator.

    Args:
      df: DataFrame holding every column named in ``CONTINUOUS_COLUMNS`` and
        ``CATEGORICAL_COLUMNS`` as well as ``LABEL_COLUMN``.

    Returns:
      A tuple ``(feature_cols, label)``: ``feature_cols`` maps each feature
      column name to a tensor and ``label`` is a constant tensor holding the
      ``LABEL_COLUMN`` values.
    """
    feature_cols = {}

    # Continuous features: the column values as one constant tensor each.
    for name in CONTINUOUS_COLUMNS:
        feature_cols[name] = tf.constant(df[name].values)

    # Categorical features: a SparseTensor of dense shape [rows, 1] with one
    # entry per row, at index [row, 0].
    # (A variant of this code that passed ``shape=`` instead of
    # ``dense_shape=`` used to be kept here inside a string literal.)
    for name in CATEGORICAL_COLUMNS:
        column = df[name]
        row_count = column.size
        feature_cols[name] = tf.SparseTensor(
            indices=[[row, 0] for row in range(row_count)],
            values=column.values,
            dense_shape=[row_count, 1])

    label = tf.constant(df[LABEL_COLUMN].values)
    return feature_cols, label

def _load_census_frame(path, skiprows=None):
    """Read one data file and prepare it for ``input_fn``.

    Args:
      path: Path of the CSV file, opened through ``tf.gfile.Open``.
      skiprows: Forwarded to ``pd.read_csv``; ``None`` skips nothing.

    Returns:
      A DataFrame with ``COLUMNS`` as column names, without rows containing
      NaN, and with an integer ``LABEL_COLUMN`` that is 1 where
      "income_bracket" contains ">50K" and 0 otherwise.
    """
    # The ``with`` block closes the file handle once pandas has consumed it.
    with tf.gfile.Open(path) as data_file:
        frame = pd.read_csv(
            data_file,
            names=COLUMNS,
            skipinitialspace=True,
            skiprows=skiprows,
            engine="python")

    # remove NaN elements
    frame = frame.dropna(how='any', axis=0)

    frame[LABEL_COLUMN] = (
        frame["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    return frame


def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
    """Train and evaluate the model.

    Args:
      model_dir: Output directory for the model; a fresh temporary directory
        is created when this is empty.
      model_type: Forwarded to ``build_estimator``.
      train_steps: Number of steps passed to ``fit``.
      train_data: Path to the training data.
      test_data: Path to the test data.

    Raises:
      ValueError: If ``train_data`` or ``test_data`` is empty.
    """
    # NOTE(review): this function used ``train_file_name`` and
    # ``test_file_name`` without assigning them, and the
    # ``df_train = pd.read_csv(`` / ``df_test = pd.read_csv(`` openers were
    # missing. The paths are taken directly from the arguments; an empty path
    # cannot be read, so fail early with a clear message.
    if not train_data or not test_data:
        raise ValueError(
            "Both train_data and test_data must be paths to data files.")
    train_file_name = train_data
    test_file_name = test_data

    df_train = _load_census_frame(train_file_name)
    # Only the test file is read with skiprows=1.
    # NOTE(review): presumably its first line is not a data row -- confirm.
    df_test = _load_census_frame(test_file_name, skiprows=1)

    model_dir = model_dir or tempfile.mkdtemp()
    print("model directory = %s" % model_dir)

    m = build_estimator(model_dir, model_type)
    m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)
    results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))
    print("Train WDL End")

# Module-level holder for the run configuration that ``main`` reads.
# NOTE(review): presumably replaced with the parsed command-line arguments
# by the entry-point block before ``main`` runs -- confirm.
FLAGS = None


def main(_):
    """Print the active flags, then run training and evaluation.

    The single positional argument is ignored.
    """
    print(FLAGS)
    train_and_eval(
        model_dir=FLAGS.model_dir,
        model_type=FLAGS.model_type,
        train_steps=FLAGS.train_steps,
        train_data=FLAGS.train_data,
        test_data=FLAGS.test_data)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
"--model_dir",
type=str,
default="./wdl_data/model_save",
help="Base directory for output models."
)
"--model_type",
type=str,
default="wide_n_deep",
help="Valid model types: {'wide', 'deep', 'wide_n_deep'}."
)
"--train_steps",
type=int,
default=2000,
help="Number of training steps."
)
"--train_data",
type=str,
help="Path to the training data."
)