Neural Collaborative Filtering (NCF) in Practice

Modifying the official code
The from official.utils.* imports that are awkward to depend on directly were removed, and comments were added at the key processing steps.

A handy little function pulled out of official.utils.flags:

def get_gpus_num():
  """ 獲取GPU個數 """
  from tensorflow.python.client import device_lib
  local_device_protos = device_lib.list_local_devices()
  return sum([1 for d in local_device_protos if d.device_type == "GPU"])
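
A quick way to see what it reports on the current machine (0 means the Estimator setup later falls back to the CPU strategy):

# 0 on a CPU-only machine; convert_keras_to_estimator() below then uses the CPU path.
print("Visible GPUs:", get_gpus_num())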

1. Model implementation

NCF model architecture diagram:

[NCF / NeuMF architecture figure]

Reading the code side by side with the architecture diagram, the implementation turns out to be straightforward. The NeuMF model code:

from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import constants  # pylint: disable=g-bad-import-order


class NeuMF(tf.keras.models.Model):
  """Neural matrix factorization (NeuMF) model for recommendations."""

  def __init__(self, num_users, num_items, mf_dim, model_layers, batch_size,
               mf_regularization, mlp_reg_layers):
    """Initialize NeuMF model.
    Args:
      num_users: An integer, the number of users.
      num_items: An integer, the number of items.
      mf_dim: An integer, the embedding size of Matrix Factorization (MF) model.
      model_layers: A list of integers for Multi-Layer Perceptron (MLP) layers.
        Note that the first layer is the concatenation of user and item
        embeddings. So model_layers[0]//2 is the embedding size for MLP.
      batch_size: An integer for the batch size.
      mf_regularization: A floating number, the regularization factor for MF
        embeddings.
      mlp_reg_layers: A list of floating numbers, the regularization factors for
        each layer in MLP.
    Raises:
      ValueError: if the first model layer is not even.
    """
    # ["32", "16", "8"]
    if model_layers[0] % 2 != 0:
      raise ValueError("The first layer size should be multiple of 2!")

    # Input variables
    user_input = tf.keras.layers.Input(
        shape=(1,), dtype=tf.int32, name=constants.USER)
    item_input = tf.keras.layers.Input(
        shape=(1,), dtype=tf.int32, name=constants.ITEM)

    # Initializer for embedding layer
    embedding_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
    # Embedding layers of GMF and MLP
    # GMF -- user embedding:
    # embed the num_users user indices into vectors of size mf_dim
    mf_embedding_user = tf.keras.layers.Embedding(
        num_users,
        mf_dim,
        embeddings_initializer=embedding_initializer,
        embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
        input_length=1)
    # GMF--item Embedding
    mf_embedding_item = tf.keras.layers.Embedding(
        num_items,
        mf_dim,
        embeddings_initializer=embedding_initializer,
        embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
        input_length=1)
    # MLP--user Embedding
    # model_layers[0]//2 is the embedding size for MLP
    mlp_embedding_user = tf.keras.layers.Embedding(
        num_users,
        model_layers[0]//2,
        embeddings_initializer=embedding_initializer,
        embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
        input_length=1)
    # MLP--item Embedding
    mlp_embedding_item = tf.keras.layers.Embedding(
        num_items,
        model_layers[0]//2,
        embeddings_initializer=embedding_initializer,
        embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
        input_length=1)

    # GMF part 
    # Element-wise product of the user and item latent vectors; it is later
    # concatenated with the last MLP layer
    # Flatten the embedding vector as latent features in GMF
    mf_user_latent = tf.keras.layers.Flatten()(mf_embedding_user(user_input))
    mf_item_latent = tf.keras.layers.Flatten()(mf_embedding_item(item_input))
    # Element-wise multiply
    mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

    # MLP part
    # Step 1: concatenate the user and item MLP embeddings into mlp_vector
    # Flatten the embedding vector as latent features in MLP
    mlp_user_latent = tf.keras.layers.Flatten()(mlp_embedding_user(user_input))
    mlp_item_latent = tf.keras.layers.Flatten()(mlp_embedding_item(item_input))
    # Concatenation of two latent features
    mlp_vector = tf.keras.layers.concatenate([mlp_user_latent, mlp_item_latent])

    # Step 2: pass mlp_vector through a stack of Dense layers, each feeding the next
    num_layer = len(model_layers)  # Number of layers in the MLP
    # model_layers, e.g. [64, 32, 16, 8]; layer 0 was consumed by the embeddings above
    for layer in xrange(1, num_layer):
      model_layer = tf.keras.layers.Dense(
          model_layers[layer],
          kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[layer]),
          activation="relu")
      mlp_vector = model_layer(mlp_vector)

    # Step 3: concatenate the earlier GMF vector with the last MLP layer output
    # Concatenate GMF and MLP parts
    predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])

    # Final prediction layer
    # Finally, feed the concatenated vector into a single-unit Dense layer as the
    # output: sigmoid activation, LeCun uniform initialization.
    prediction = tf.keras.layers.Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform",
        name=constants.RATING)(predict_vector)

    super(NeuMF, self).__init__(
        inputs=[user_input, item_input], outputs=prediction)
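
As a quick sanity check, the model can be instantiated with the ml-1m sizes used later in main() and summarized. This is illustrative only: in the real pipeline num_users and num_items come from the preprocessed data, and the constants module above must be importable.

# Illustrative instantiation with the ml-1m sizes used later in main().
model = NeuMF(num_users=6040, num_items=3706, mf_dim=8,
              model_layers=[32, 16, 8, 4], batch_size=256,
              mf_regularization=0.001,
              mlp_reg_layers=[0.0, 0.01, 0.01, 0.0])
model.summary()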
    

2. Dataset preprocessing

The MovieLens dataset is used. The ml-1m dataset contains 1,000,209 anonymous ratings of approximately 3,706 movies made by 6,040 users who joined MovieLens in 2000. The format is:

UserID::MovieID::Rating::Timestamp

Ratings are made on a 5-star scale (whole-star ratings only).

The ml-20m format:

userId,movieId,rating,timestamp

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
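
For reference, the raw ml-1m ratings file can be read directly with pandas. This snippet is illustrative only; the file path and column names simply mirror the ones used in the preprocessing code below:

import pandas as pd

# The "::" separator of ml-1m requires the python parsing engine.
ratings = pd.read_csv(
    "movielens-data/ml-1m/ratings.dat", sep="::", engine="python",
    names=["user_id", "item_id", "rating", "timestamp"])
print(ratings.head())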

Step 1: Download and convert the data

Download, unzip, parse into csv, and write the files out for later use.

This produces three csv files. train-ratings and test-ratings each have three columns; the test set holds, for every user, the single most recent record (leave-one-out).
Both train and test contain only positive examples; the negatives live in a separate test-negative file,
which holds, for each user, 100 items sampled at random from the set of items that user has never interacted with.
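
As a quick intuition for the leave-one-out split, here is a toy sketch (illustrative only, not the repo code):

import pandas as pd

# Toy interactions; the most recent record per user becomes the test positive,
# everything else stays in the training set (leave-one-out).
toy = pd.DataFrame({
    "user_id":   [0, 0, 0, 1, 1],
    "item_id":   [10, 11, 12, 10, 13],
    "timestamp": [100, 200, 300, 150, 50],
})
latest = toy.sort_values("timestamp").groupby("user_id").tail(1)
train = toy.drop(latest.index)
print(latest[["user_id", "item_id"]].values)  # user 0 -> item 12, user 1 -> item 10
print(train[["user_id", "item_id"]].values)

The download and parsing script: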

import collections
import os
import sys
import time
import zipfile

# pylint: disable=g-bad-import-order
import numpy as np
import pandas as pd
from six.moves import urllib  # pylint: disable=redefined-builtin
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
import constants

# URL to download dataset
_DATA_URL = "http://files.grouplens.org/datasets/movielens/"

_USER_COLUMN = "user_id"
_ITEM_COLUMN = "item_id"
_RATING_COLUMN = "rating"
_TIMESTAMP_COLUMN = "timestamp"
# The number of negative examples paired with each positive test example
# during evaluation. It is set to 100 in the paper.
_NUMBER_NEGATIVES = 100
# In both datasets, each user has at least 20 ratings.
_MIN_NUM_RATINGS = 20

RatingData = collections.namedtuple(
    "RatingData", ["items", "users", "ratings", "min_date", "max_date"])

def main(_):
  """Download and extract the data from GroupLens website."""
  tf.logging.set_verbosity(tf.logging.INFO)

  # Download the zip dataset of ml-1m
  data_dir = "movielens-data/"
  dataset = "ml-1m"
  dataset_zip = dataset + ".zip"
  # Make sure the data directory exists
  # (replaces the original make_dir(FLAGS.data_dir) call from the official code)
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MakeDirs(data_dir)
  file_path = os.path.join(data_dir, dataset_zip)
  if not tf.gfile.Exists(file_path):
    def _progress(count, block_size, total_size):
      sys.stdout.write("\r>> Downloading {} {:.1f}%".format(
          file_path, 100.0 * count * block_size / total_size))
      sys.stdout.flush()

    file_path, _ = urllib.request.urlretrieve(
        _DATA_URL + dataset_zip, file_path, _progress)
    statinfo = os.stat(file_path)
    # A new line to clear the carriage return from download progress
    # tf.logging.info is not applicable here
    print()
    tf.logging.info(
        "Successfully downloaded {} {} bytes".format(
            file_path, statinfo.st_size))

  # Unzip the dataset
  if not tf.gfile.Exists(os.path.join(data_dir, dataset)):
    zipfile.ZipFile(file_path, "r").extractall(data_dir)

  # Preprocess and parse the dataset to csv
  train_ratings = dataset + "-" + constants.TRAIN_RATINGS_FILENAME
  if not tf.gfile.Exists(os.path.join(data_dir, train_ratings)):
    parse_file_to_csv(data_dir, dataset)

The parse_file_to_csv function:
1. Filter to users with at least 20 ratings, and remap user and item ids to 0-based indices (see the sketch below).
2. Generate the train data, the test data, and the test negatives via the generate_train_eval_data function.
3. Serialize everything to csv. Each csv file has three columns: (user_id, item_id, interaction). Since all_ratings and test_ratings are [user_id, item_id] pairs with interactions, a fake_rating column with default value 1 is added as the third column.
Tip: all output values are plain integers.
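
A simplified sketch of step 1 (filtering and re-indexing), using the column-name constants defined above; this is an illustration, not the repo's exact parse_file_to_csv:

def filter_and_reindex(df):
  """Sketch: keep users with at least _MIN_NUM_RATINGS ratings and remap raw
  user/item ids to contiguous 0-based indices (illustrative only)."""
  counts = df.groupby(_USER_COLUMN)[_ITEM_COLUMN].transform("count")
  df = df[counts >= _MIN_NUM_RATINGS].copy()
  original_users = sorted(df[_USER_COLUMN].unique())
  original_items = sorted(df[_ITEM_COLUMN].unique())
  df[_USER_COLUMN] = df[_USER_COLUMN].map(
      {u: i for i, u in enumerate(original_users)})
  df[_ITEM_COLUMN] = df[_ITEM_COLUMN].map(
      {m: i for i, m in enumerate(original_items)})
  return df, original_users, original_items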

The core is the generate_train_eval_data function:

Given all user-item interaction information, for each user the records are first sorted by timestamp;
the most recent interaction then becomes the test rating (leave-one-out), and the rest is used as training data.
Test negatives are sampled at random from the items the user never interacted with, 100 per user by default (set by _NUMBER_NEGATIVES).

def generate_train_eval_data(df, original_users, original_items):
  # ...
  # Need to sort before popping to get last item
  tf.logging.info("Sorting user_item_map by timestamp...")
  df.sort_values(by=_TIMESTAMP_COLUMN, inplace=True)
  all_ratings = set(zip(df[_USER_COLUMN], df[_ITEM_COLUMN]))

  # key: user id; value: the list of item ids that user has rated [item1, item2, ..., itemk]
  user_to_items = collections.defaultdict(list)
  # Generate the user-item rating matrix for training
  t1 = time.time()
  row_count = 0
  for row in df.itertuples():
    user_to_items[getattr(row, _USER_COLUMN)].append(getattr(row, _ITEM_COLUMN))
    row_count += 1
    if row_count % 50000 == 0:
      tf.logging.info("Processing user_to_items row: {}".format(row_count))
  tf.logging.info(
      "Process {} rows in [{:.1f}]s".format(row_count, time.time() - t1))

  # Generate test ratings and test negatives
  t2 = time.time()
  test_ratings = []
  test_negs = []
  # Generate the 0-based index for each item, and put it into a set
  all_items = set(range(len(original_items)))
  for user in range(len(original_users)):
    # Pop each user's last item id (the most recent one after the timestamp sort)
    test_item = user_to_items[user].pop()
    all_ratings.remove((user, test_item))  # Remove the test item
    # Remove the items this user has interacted with to get the candidate negatives
    all_negs = all_items.difference(user_to_items[user])
    all_negs = sorted(list(all_negs))  # determinism
    # The user paired with the held-out latest item forms the test rating
    test_ratings.append((user, test_item))
    # Randomly sample 100 items from this user's negative pool as the test negatives
    test_negs.append(list(np.random.choice(all_negs, _NUMBER_NEGATIVES)))

    if user % 1000 == 0:
      tf.logging.info("Processing user: {}".format(user))

  tf.logging.info("Process {} users in {:.1f}s".format(
      len(original_users), time.time() - t2))

  all_ratings = list(all_ratings)  # convert set to list
  return all_ratings, test_ratings, test_negs

Step 2: Data preprocessing

The data to load consists of the training data, the test data, and the negatives.
After loading via dataset.data_preprocessing(), an NCFDataSet data object is produced (a minimal sketch follows this list), containing:

  • train_data: the set of positive training examples
  • num_users: an integer, the number of users in the training dataset
  • num_items: an integer, the number of items in the training dataset
  • num_negatives: an integer, the number of negative instances per user
  • true_items: the positive item read from the test csv for each user, used as ground truth during evaluation
  • all_items: for each user, the 100 test negatives plus the one ground-truth item
  • all_test_data: the test data for all users, i.e. each user paired with each of his all_items
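
The NCFDataSet container itself is not shown here; a minimal sketch consistent with how it is constructed and read below (field names inferred from data_preprocessing and evaluate_model, so treat them as assumptions):

import collections

# Plain namedtuple with the fields in the order data_preprocessing() passes them;
# the repo's actual class may differ.
NCFDataSet = collections.namedtuple(
    "NCFDataSet",
    ["train_data", "num_users", "num_items", "num_negatives",
     "eval_true_items", "eval_all_items", "all_eval_data"])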

The main code:

def data_preprocessing(train_fname, test_fname, test_neg_fname, num_negatives):
  # Load the positives into memory; load_data() reads each line as a list of ints and returns the whole list
  train_data = load_data(train_fname)
  # Number of unique users
  num_users = len(np.unique(np.array(train_data)[:, 0]))

  test_ratings = load_data(test_fname)
  test_negatives = load_data(test_neg_fname)
  # The union of the item columns of train and test gives the total number of items
  num_items = len(
      set(np.array(train_data)[:, 1]) | set(np.array(test_ratings)[:, 1]))

  # Generate test instances for each user
  true_items, all_items = [], []
  all_test_data = []
  for idx in range(num_users):
    items = test_negatives[idx]
    rating = test_ratings[idx]
    user = rating[0]  # User
    true_item = rating[1]  # Positive item as ground truth

    # All items for this user: the 100 negatives plus the ground-truth positive
    items.append(true_item)
    users = np.full(len(items), user, dtype=np.int32)

    users_items = list(zip(users, items))  # User-item list
    true_items.append(true_item)  # all ground truth items
    all_items.append(items)  # All items (including positive and negative items)
    all_test_data.extend(users_items)  # Generate test dataset

  # Create NCFDataset object
  ncf_dataset = NCFDataSet(
      train_data, num_users, num_items, num_negatives, true_items, all_items,
      np.asarray(all_test_data)
  )

  return ncf_dataset

Step 3: Training and evaluation

Create the NeuMF model, convert it to an Estimator, and alternate between training and evaluating the model;
training stops once the hit-ratio threshold is reached.

Hit Ratio: rank the candidate items by predicted score in descending order and take the top 10; a user counts as a hit if the ground-truth test item appears in that list, and the metric is the fraction of users with a hit.

NDCG on a hit is math.log(2) / math.log(ranklist.index(true_item) + 2), i.e. log 2 / log(rank + 1) with a 1-based rank; it is 0 on a miss.
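
For example, if the true item sits at 0-based position 2 of the top-10 list, HR is 1 and NDCG is log(2)/log(4) = 0.5. A quick check using the same formulas as _get_hr/_get_ndcg below:

import math

ranklist = [42, 7, 99, 3, 15, 28, 61, 5, 77, 88]  # top-10 items by predicted score
true_item = 99                                     # ground-truth test item
hr = 1 if true_item in ranklist else 0
ndcg = math.log(2) / math.log(ranklist.index(true_item) + 2) if hr else 0.0
print(hr, ndcg)  # 1 0.5

The training and evaluation script: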

import heapq
import math
import os

# pylint: disable=g-bad-import-order
import numpy as np
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order

import constants
import dataset
import neumf_model

_TOP_K = 10  # Top-k list for evaluation
# keys for evaluation metrics
_HR_KEY = "HR"
_NDCG_KEY = "NDCG"


def evaluate_model(estimator, batch_size, num_gpus, ncf_dataset):
  # Define the prediction input function
  def pred_input_fn():
    return dataset.input_fn(
        False, per_device_batch_size(batch_size, num_gpus), ncf_dataset)

  # Predict with the estimator
  predictions = estimator.predict(input_fn=pred_input_fn)
  all_predicted_scores = [p[constants.RATING] for p in predictions]

  # Compute the Hit Ratio for one user
  def _get_hr(ranklist, true_item):
    return 1 if true_item in ranklist else 0

  # Compute the NDCG score for one user
  def _get_ndcg(ranklist, true_item):
    if true_item in ranklist:
      return math.log(2) / math.log(ranklist.index(true_item) + 2)
    return 0

  hits, ndcgs = [], []
  num_users = len(ncf_dataset.eval_true_items)
  # Reshape the predicted scores and each user takes one row
  predicted_scores_list = np.asarray(
      all_predicted_scores).reshape(num_users, -1)

  for i in range(num_users):
    items = ncf_dataset.eval_all_items[i]
    predicted_scores = predicted_scores_list[i]
    # Map item and score for each user
    map_item_score = {}
    for j, item in enumerate(items):
      score = predicted_scores[j]
      map_item_score[item] = score

    # Evaluate top rank list with HR and NDCG
    ranklist = heapq.nlargest(_TOP_K, map_item_score, key=map_item_score.get)
    true_item = ncf_dataset.eval_true_items[i]
    hr = _get_hr(ranklist, true_item)
    ndcg = _get_ndcg(ranklist, true_item)
    hits.append(hr)
    ndcgs.append(ndcg)

  # Get average HR and NDCG scores
  hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
  global_step = estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)
  eval_results = {
      _HR_KEY: hr,
      _NDCG_KEY: ndcg,
      tf.GraphKeys.GLOBAL_STEP: global_step
  }
  return eval_results


def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
  """Configure and convert keras model to Estimator.
  Args:
    keras_model: A Keras model object.
    num_gpus: An integer, the number of gpus.
    model_dir: A string, the directory to save and restore checkpoints.
  Returns:
    est_model: The converted Estimator.
  """
  flags_learning_rate = 0.001
  # TODO(b/79866338): update GradientDescentOptimizer with AdamOptimizer
  optimizer = tf.train.GradientDescentOptimizer(
      learning_rate=flags_learning_rate)
  keras_model.compile(optimizer=optimizer, loss="binary_crossentropy")

  if num_gpus == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
  elif num_gpus == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)

  run_config = tf.estimator.RunConfig(train_distribute=distribution)

  estimator = tf.keras.estimator.model_to_estimator(
      keras_model=keras_model, model_dir=model_dir, config=run_config)

  return estimator

def get_gpus_num():
  """ 獲取GPU個數 """
  from tensorflow.python.client import device_lib
  local_device_protos = device_lib.list_local_devices()
  return sum([1 for d in local_device_protos if d.device_type == "GPU"])

def per_device_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.
  Note that this should eventually be handled by DistributionStrategies
  directly. Multi-GPU support is currently experimental, however,
  so doing the work here until that feature is in place.
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ("When running with multiple GPUs, batch size "
           "must be a multiple of the number of available GPUs. Found {} "
           "GPUs with a batch size of {}; try --batch_size={} instead."
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)


def main(_):
  # Set the parameters by hand (in place of the original command-line flags)
  # The file name of training and test dataset
  flags_data_dir="movielens-data/"
  flags_dataset = "ml-1m"
  # The Number of negative instances to pair with a positive instance, default=4.
  flags_num_neg = 4
  # The sizes of hidden layers for MLP, default=["64", "32", "16", "8"]
  flags_layers = ["32", "16", "8", "4"]
  # name="mlp_regularization", default=["0.", "0.", "0.", "0."],
  #   "The regularization factor for each MLP layer. See mf_regularization "
  flags_mlp_regularization = ["0.", "0.01", "0.01", "0."]
  flags_mf_regularization = 0.001
  # Batch size for training and evaluation.
  flags_batch_size = 256
  # The Embedding size of MF model.
  flags_num_factors = 8
  # Directory for saving model checkpoints.
  flags_model_dir = "ncf-model/"
  # Hit-ratio threshold. For the ml-1m dataset the desired hr_threshold is 0.68,
  # which is the result from the paper; for ml-20m the threshold can be set to
  # 0.95, which is achieved by the MLPerf implementation.
  flags_hr_threshold = 0.68
  # Total number of training epochs.
  flags_train_epochs = 2
  # The number of training epochs to run between evaluations, default=1.
  flags_epochs_between_evals = 1

  train_fname = os.path.join(
      flags_data_dir, flags_dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
  test_fname = os.path.join(
      flags_data_dir, flags_dataset + "-" + constants.TEST_RATINGS_FILENAME)
  neg_fname = os.path.join(
      flags_data_dir, flags_dataset + "-" + constants.TEST_NEG_FILENAME)

  assert os.path.exists(train_fname), (
      "Run data_download.py first to download and extract {} dataset".format(
          flags_dataset))

  # ============================
  # Parameters are set; start data preprocessing
  # ============================
  tf.logging.info("Data preprocessing...")
  ncf_dataset = dataset.data_preprocessing(
      train_fname, test_fname, neg_fname, flags_num_neg)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in flags_layers]
  mlp_regularization = [float(reg) for reg in flags_mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, flags_num_factors,
      layers, flags_batch_size, flags_mf_regularization,
      mlp_regularization)
  
  num_gpus = get_gpus_num()
  estimator = convert_keras_to_estimator(keras_model, num_gpus, flags_model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = [tf.train.ProfilerHook(save_steps=1000, output_dir="hook-profile/")]

  # Training and evaluation cycle
  def train_input_fn():
    return dataset.input_fn(
        True, per_device_batch_size(flags_batch_size, num_gpus),
        ncf_dataset, flags_epochs_between_evals)

  total_training_cycle = flags_train_epochs // flags_epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=train_input_fn, hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, flags_batch_size, num_gpus, ncf_dataset)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Stop once the evaluation threshold has been reached
    # (replaces the original model_helpers.past_stop_threshold(flags_hr_threshold, hr) call)
    if hr >= flags_hr_threshold:
      tf.logging.info(
        "Stop threshold of {} was passed with metric value {}.".format(
            flags_hr_threshold, hr))
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  
  absl_app.run(main)

The result that gets printed:

result: HR = 0.1228, NDCG = 0.0575, Loss for final step: 0.57292986.

With only two training epochs, the average hit ratio is still well below the paper's 0.68 threshold; more training epochs would be needed to approach the published result.
