相關理論請參考《推薦系統實踐》中「基於物品的協同過濾算法」一節。根據該算法的思想,以電影推薦爲例,使用 Python 實現如下:
#-*- coding: utf-8 -*-
'''''
Created on 2015-06-22
@author: Lockvictor
'''
import sys
import random
import math
import os
from operator import itemgetter
from collections import defaultdict
# Seed the module-level RNG with a fixed value so that the random.random()
# draws used for the train/test split follow the same sequence on every run.
random.seed(0)
class ItemBasedCF(object):
    """TopN recommendation - Item Based Collaborative Filtering.

    Typical usage is ``generate_dataset()`` -> ``calc_movie_sim()`` ->
    ``recommend()`` / ``evaluate()``.

    Attributes are plain dictionaries keyed by the raw string ids read from
    the rating file:

    * ``trainset`` / ``testset``: ``{user: {movie: rating}}``
    * ``movie_sim_mat``: ``{movie: {other_movie: similarity}}``
    * ``movie_popular``: ``{movie: number of training users who rated it}``
    * ``movie_count``: number of distinct movies in the training set
    """

    def __init__(self, n_sim_movie=20, n_rec_movie=10):
        """Create an empty recommender.

        Args:
            n_sim_movie: how many of the most similar movies are consulted
                for every movie a user has already rated (K).
            n_rec_movie: how many movies are recommended per user (N).
        """
        # Training and test ratings, filled by generate_dataset().
        self.trainset = {}
        self.testset = {}
        # Number of similar movies to look at (K).
        self.n_sim_movie = n_sim_movie
        # Number of movies to recommend (N).
        self.n_rec_movie = n_rec_movie
        # Item-item similarity matrix, filled by calc_movie_sim().
        self.movie_sim_mat = {}
        # Popularity of a movie = number of training users who rated it.
        self.movie_popular = {}
        # Number of distinct movies seen in the training set.
        self.movie_count = 0
        # Progress/diagnostic output goes to stderr throughout this class.
        print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommended movie number = %d' %
              self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        """Lazily yield the lines of *filename* without their line endings.

        A progress message is written to stderr every 100000 lines and a
        final message once the whole file has been consumed.

        Args:
            filename: path of the text file to read.

        Yields:
            Each line with trailing/leading ``\\r`` and ``\\n`` removed.
        """
        # The context manager guarantees the handle is closed even when the
        # consumer stops iterating early or raises while processing a line.
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\r\n')
                if i % 100000 == 0:
                    print('loading %s(%s)' % (filename, i), file=sys.stderr)
        print('load %s succ' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        """Load rating data and split it into a training and a test set.

        Every non-empty line must have the form
        ``user::movie::rating::<fourth field>``; the fourth field is ignored.
        Each rating is sent to the training set with probability *pivot*
        (using the global ``random`` generator) and to the test set
        otherwise.

        Args:
            filename: path of the rating file.
            pivot: fraction of ratings expected to land in the training set.

        Raises:
            ValueError: if a non-empty line does not have exactly four
                ``::``-separated fields or its rating is not an integer.
        """
        trainset_len = 0
        testset_len = 0
        for line in self.loadfile(filename):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue
            user, movie, rating, _ = line.split('::')
            # Split the data by pivot.
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[movie] = int(rating)
                testset_len += 1
        print('split training set and test set succ', file=sys.stderr)
        print('train set = %s' % trainset_len, file=sys.stderr)
        print('test set = %s' % testset_len, file=sys.stderr)

    def calc_movie_sim(self):
        """Build the movie-movie similarity matrix from the training set.

        The similarity of two movies is the number of users who rated both,
        divided by ``sqrt(popularity(m1) * popularity(m2))``. Results are
        stored in ``movie_sim_mat``; ``movie_popular`` and ``movie_count``
        are refreshed as well.
        """
        # Start from a clean slate so that calling this method again does
        # not double-count popularity or add counts to old similarities.
        self.movie_popular.clear()
        self.movie_sim_mat.clear()

        print('counting movies number and popularity...', file=sys.stderr)
        for movies in self.trainset.values():
            for movie in movies:
                # One more user has rated this movie.
                self.movie_popular[movie] = self.movie_popular.get(movie, 0) + 1
        print('count movies number and popularity succ', file=sys.stderr)

        # Save the total number of movies in the training set.
        self.movie_count = len(self.movie_popular)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        # Count co-rated users between items (item-item co-occurrence).
        itemsim_mat = self.movie_sim_mat
        print('building co-rated users matrix...', file=sys.stderr)
        for movies in self.trainset.values():
            for m1 in movies:
                co_rated = itemsim_mat.setdefault(m1, defaultdict(int))
                for m2 in movies:
                    if m1 == m2:
                        continue
                    co_rated[m2] += 1
        print('build co-rated users matrix succ', file=sys.stderr)

        # Turn the co-occurrence counts into similarities.
        print('calculating movie similarity matrix...', file=sys.stderr)
        # Number of similarity factors computed so far.
        simfactor_count = 0
        # Emit a progress message every PRINT_STEP factors.
        PRINT_STEP = 2000000
        for m1, related_movies in itemsim_mat.items():
            for m2, count in related_movies.items():
                # Only existing keys are overwritten, so mutating the inner
                # dict while iterating over it is safe.
                related_movies[m2] = count / math.sqrt(
                    self.movie_popular[m1] * self.movie_popular[m2])
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating movie similarity factor(%d)' %
                          simfactor_count, file=sys.stderr)
        print('calculate movie similarity matrix(similarity factor) succ',
              file=sys.stderr)
        print('Total similarity factor number = %d' %
              simfactor_count, file=sys.stderr)

    def recommend(self, user):
        """Find K similar movies and recommend N movies.

        For each movie the user rated in the training set, the K most
        similar movies are looked up; every candidate the user has not rated
        accumulates ``similarity * rating``.

        Args:
            user: a user id present in ``trainset``.

        Returns:
            Up to N ``(movie, score)`` pairs sorted by descending score.

        Raises:
            KeyError: if *user* is not in the training set, or one of their
                movies is missing from ``movie_sim_mat`` (for example when
                ``calc_movie_sim()`` has not been run).
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        # Candidate movie -> accumulated interest (similarity * rating).
        rank = {}
        watched_movies = self.trainset[user]
        for movie, rating in watched_movies.items():
            nearest = sorted(self.movie_sim_mat[movie].items(),
                             key=itemgetter(1), reverse=True)[:K]
            for related_movie, similarity_factor in nearest:
                if related_movie in watched_movies:
                    continue
                rank[related_movie] = (rank.get(related_movie, 0)
                                       + similarity_factor * rating)
        # Return the N best movies.
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """Print and return precision, recall, coverage and popularity.

        Recommendations are generated for every user of the training set and
        compared with that user's test-set movies. Each user contributes N
        to the recommendation count even when fewer than N movies could be
        recommended. A metric whose denominator is zero (no users, no test
        ratings, no movies) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.

        Returns:
            The tuple ``(precision, recall, coverage, popularity)``.
        """
        print('Evaluation start...', file=sys.stderr)
        N = self.n_rec_movie
        # Variables for precision and recall.
        hit = 0          # recommended movies that appear in the test set
        rec_count = 0    # recommendation slots counted (N per user)
        test_count = 0   # test-set ratings of the evaluated users
        # Variable for coverage: every distinct movie ever recommended.
        all_rec_movies = set()
        # Variable for popularity: sum of log(1 + popularity).
        popular_sum = 0
        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print('recommended for %d users' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)
        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = (len(all_rec_movies) / (1.0 * self.movie_count)
                    if self.movie_count else 0.0)
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0
        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
              (precision, recall, coverage, popularity), file=sys.stderr)
        return precision, recall, coverage, popularity
if __name__ == '__main__':
    # Script entry point: split the ratings file found at ml-1m/ratings.dat,
    # build the movie similarity matrix, then report the evaluation metrics.
    recommender = ItemBasedCF()
    recommender.generate_dataset(os.path.join('ml-1m', 'ratings.dat'))
    recommender.calc_movie_sim()
    recommender.evaluate()