基於相似度的推薦系統
一. 排行榜單推薦
from sklearn.model_selection import train_test_split
# Alias the `triplet_dataset_sub_song_merged` frame (defined outside this
# section) and hold out 30% of its rows as a test split; random_state is fixed
# so the split is reproducible.
triplet_dataset_sub_song_merged_set = triplet_dataset_sub_song_merged
train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_set, test_size = 0.3, random_state = 42)
# Notebook-style preview of the first training rows.
train_data.head()
def create_popularity_recommendation(train_data, user_id, item_id, top_n=20):
    """Return the most popular items, ranked by number of interaction rows.

    Args:
        train_data: DataFrame containing at least the columns named by
            ``user_id`` and ``item_id``.
        user_id: Name of the user column; its non-null entries are counted
            per item to produce the popularity score.
        item_id: Name of the item column to group by.
        top_n: Maximum number of rows to return (defaults to 20, the value
            that used to be hard-coded).

    Returns:
        DataFrame with columns ``item_id``, ``'score'`` and ``'Rank'``, sorted
        by descending score and then ascending item so ties are deterministic.
    """
    train_data_grouped = (
        train_data.groupby([item_id])
        .agg({user_id: 'count'})
        .reset_index()
        .rename(columns={user_id: 'score'})
    )
    train_data_sort = train_data_grouped.sort_values(
        ['score', item_id], ascending=[False, True])
    # method='first' breaks ties by row order, which after the sort above is
    # the ascending item order, so 'Rank' is simply 1, 2, 3, ...
    train_data_sort['Rank'] = train_data_sort['score'].rank(
        ascending=False, method='first')
    return train_data_sort.head(top_n)
# Build the popularity leaderboard from the full merged table. The function
# name was previously misspelled as `creat_popularity_recommendation`, which
# raised NameError.
commendations = create_popularity_recommendation(triplet_dataset_sub_song_merged, 'user', 'title')
commendations
二. 基於歌曲相似度的推薦
- 選擇一小部分歌曲來實驗
# Take the first 1000 rows of song_count_df as the experimental song subset.
song_count_subset = song_count_df.head(1000)
# NOTE(review): play_count_subset is not defined in this section; presumably
# it comes from an earlier cell. user_subset is not used anywhere below.
user_subset = list(play_count_subset.user)
song_subset = list(song_count_subset.song)
# Keep only the rows of the merged table whose song is in the subset.
triplet_dataset_sub_song_merged_sub = triplet_dataset_sub_song_merged[triplet_dataset_sub_song_merged.song.isin(song_subset)]
# Notebook-style preview of the filtered rows.
triplet_dataset_sub_song_merged_sub.head()
# 推薦系統
class item_similarity_recommender_py(object):
    """Item-based recommender that scores items by Jaccard user overlap.

    The similarity between two items is the size of the intersection of the
    user sets that interacted with each item divided by the size of their
    union. Call ``create`` to attach the training data before using any
    other method.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        # Attribute name kept exactly as before (including its spelling) for
        # compatibility; no method of this class reads it.
        self.coccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    def get_user_items(self, user):
        """Return the list of unique items found in the rows of ``user``."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        return list(user_data[self.item_id].unique())

    def get_item_users(self, item):
        """Return the set of unique users found in the rows of ``item``."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        # Must be a set: the co-occurrence computation relies on
        # intersection/union. (This used to call ``self(...)`` by mistake.)
        return set(item_data[self.user_id].unique())

    def get_all_items_train_data(self):
        """Return the list of unique items in the training data."""
        return list(self.train_data[self.item_id].unique())

    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the Jaccard similarity matrix between two item lists.

        Args:
            user_songs: Items defining the rows of the matrix.
            all_songs: Items defining the columns of the matrix.

        Returns:
            ``np.matrix`` of shape ``(len(user_songs), len(all_songs))`` whose
            entry ``[j, i]`` is the Jaccard similarity between the user sets
            of ``user_songs[j]`` and ``all_songs[i]``.
        """
        # User set of every row item, computed once up front.
        user_songs_users = [self.get_item_users(song) for song in user_songs]

        # np.matrix is kept on purpose: generate_top_recommendations relies
        # on sum(axis=0) returning a 2-D (1 x N) result.
        cooccurence_matrix = np.matrix(
            np.zeros(shape=(len(user_songs), len(all_songs))), float)

        for i, song in enumerate(all_songs):
            users_i = self.get_item_users(song)
            for j, users_j in enumerate(user_songs_users):
                # Set intersection: users shared by both items.
                users_intersection = users_i.intersection(users_j)
                if users_intersection:
                    # Set union: users that touched either item.
                    users_union = users_i.union(users_j)
                    cooccurence_matrix[j, i] = (
                        float(len(users_intersection)) / float(len(users_union)))
                # Otherwise the entry keeps its initial value of 0.
        return cooccurence_matrix

    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a co-occurrence matrix into at most 10 ranked recommendations.

        Args:
            user: Value stored in the ``user_id`` column of the result.
            cooccurence_matrix: Matrix from ``construct_cooccurence_matrix``.
            all_songs: Items corresponding to the matrix columns.
            user_songs: Items corresponding to the matrix rows; they are
                excluded from the recommendations.

        Returns:
            DataFrame with columns ``user_id``, ``song``, ``score`` and
            ``rank``, or ``-1`` (after printing a message) when nothing can
            be recommended.
        """
        print('Non zero values in cooccurence_matrix:%d' % np.count_nonzero(cooccurence_matrix))

        # Average similarity of each candidate over all row items. With zero
        # rows this is 0/0 = NaN; those entries are filtered out below, so
        # the resulting floating-point warning is suppressed.
        with np.errstate(divide='ignore', invalid='ignore'):
            user_sim_scores = cooccurence_matrix.sum(axis=0) / float(cooccurence_matrix.shape[0])
        user_sim_scores = np.array(user_sim_scores)[0].tolist()

        # Highest score first; ties fall back to the higher column index.
        sort_index = sorted(
            ((score, idx) for idx, score in enumerate(user_sim_scores)),
            reverse=True)

        columns = ['user_id', 'song', 'score', 'rank']
        max_recommendations = 10
        already_known = set(user_songs)
        rows = []
        for score, idx in sort_index:
            if len(rows) >= max_recommendations:
                break
            song = all_songs[idx]
            if np.isnan(score) or song in already_known:
                continue
            rows.append([user, song, score, len(rows) + 1])

        if not rows:
            print('The current user has no songs for training the item similarity based recommendation model.')
            return -1
        return pd.DataFrame(rows, columns=columns)

    def create(self, train_data, user_id, item_id):
        """Attach the training frame and the names of its user/item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    def recommend(self, user):
        """Recommend items for ``user`` based on the items in the user's rows.

        Returns the result of ``generate_top_recommendations``.
        """
        user_songs = self.get_user_items(user)
        print('No. of unique songs for the user: %d' % len(user_songs))
        all_songs = self.get_all_items_train_data()
        print('No. of unique songs in the training set: %d' % len(all_songs))
        # Method name was previously misspelled here ("contruct_...").
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        return self.generate_top_recommendations(
            user, cooccurence_matrix, all_songs, user_songs)

    def get_similar_items(self, item_list):
        """Return items similar to those in ``item_list``.

        Works like ``recommend`` but starts from an explicit item list; the
        ``user_id`` column of the result is the empty string.
        """
        user_songs = item_list
        all_songs = self.get_all_items_train_data()
        print('No. of unique songs in the training set: %d' % len(all_songs))
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        user = ''
        return self.generate_top_recommendations(
            user, cooccurence_matrix, all_songs, user_songs)
# Re-split using only the song-subset rows (30% test, fixed seed); this
# rebinds the train_data/test_data names assigned earlier in the file.
train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_sub, test_size = 0.3, random_state = 42)
is_model = item_similarity_recommender_py()
# Users are read from the 'user' column and items from the 'title' column.
is_model.create(train_data, 'user', 'title')
# Use the user found in the 8th training row as the demo user.
user_id = list(train_data.user)[7]
user_items = is_model.get_user_items(user_id)
is_model.recommend(user_id)
No. of unique songs for the user: 19
No. of unique songs in the training set: 997
Non zero values in cooccurence_matrix:18876