實驗環境:mac mini 8G 6核 i5;
實驗數據:MovieLens數據集;
實驗對比:多進程方式與循環方式
實驗結果:多進程方式花費131.77949714660645s;循環方式花費259.33885288238525s
兩次所求的物品之間的餘弦相似度矩陣完全相同
多進程方式
import math
import functools
import numpy as np
import pandas as pd
from multiprocessing import Pool
from sklearn.preprocessing import Normalizer
def get_co_occurrence(interests, n=0):
    """Build a weighted item co-occurrence matrix.

    Each user contributes 1 / log(1 + |items|) to every ordered pair of
    distinct items in their interest list, so very active users are
    down-weighted relative to users with short lists.

    :param interests: iterable of per-user records; each record is
        indexable by 'movie_id' and yields that user's list of 1-based
        movie ids (e.g. the per-user DataFrames from a groupby)
    :param n: total number of distinct items; the matrix is n x n
    :return: (n, n) float ndarray of weighted co-occurrence counts
    """
    # The matrix must be float: the per-user increment is fractional,
    # and writing a float back into an int array truncates each update
    # toward zero, silently losing the weighting.  (The original
    # np.int alias is also removed in NumPy >= 1.24.)
    co_occurrence = np.zeros((n, n), dtype=float)
    for record in interests:
        movie_ids = record['movie_id']
        # Popular-user penalty: long interest lists contribute less per pair.
        increment = 1 / math.log(1 + len(movie_ids) * 1.0)
        for i in movie_ids:
            for j in movie_ids:
                if i == j:
                    continue
                co_occurrence[i - 1][j - 1] += increment
    return co_occurrence
if __name__ == '__main__':
    r_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table('/Users/weicai/Desktop/cai/analysis/ml-1m/ratings.dat', sep='::',
                            header=None, names=r_names, engine='python')
    # Only (user, movie) pairs matter for co-occurrence counting.
    del ratings['rating']
    del ratings['timestamp']
    ratings_group = ratings.groupby('user_id')
    user_interests = [value for _, value in ratings_group]
    # Partial application fixes n; 3952 is the movie count in MovieLens 1M.
    get_co_occurrence_new = functools.partial(get_co_occurrence, n=3952)
    # Split the users into one near-equal chunk per worker.  The original
    # hard-coded slices gave five workers 658 users each and the sixth
    # the remaining ~2750, so five processes sat idle while one did the
    # bulk of the work.
    n_workers = 6
    chunk_size = math.ceil(len(user_interests) / n_workers)
    chunks = [user_interests[i:i + chunk_size]
              for i in range(0, len(user_interests), chunk_size)]
    with Pool(n_workers) as p:
        co_occurrence_ist = p.map(get_co_occurrence_new, chunks)
    # Partial matrices are additive; row-normalize (unit L2 norm) so the
    # dot product of two item rows is their cosine similarity.
    co_occurrence_nor = Normalizer().fit_transform(sum(co_occurrence_ist))
    np.save('co_occurrence_nor1', co_occurrence_nor)
循環方式
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
def get_co_occurrence(interests, n=0):
    """Build an item co-occurrence matrix by counting shared users.

    :param interests: iterable of (key, group) pairs, e.g. a pandas
        GroupBy; each group is indexable by 'movie_id' and yields that
        user's list of 1-based movie ids
    :param n: total number of distinct items; the matrix is n x n
    :return: (n, n) integer ndarray where cell (i, j) counts the users
        who interacted with both item i+1 and item j+1
    """
    # np.int was deprecated and removed in NumPy >= 1.24; the builtin
    # int yields the same platform-default integer dtype.
    co_occurrence = np.zeros((n, n), dtype=int)
    for _, group in interests:
        movie_ids = group['movie_id']
        for i in movie_ids:
            for j in movie_ids:
                if i == j:
                    continue
                co_occurrence[i - 1][j - 1] += 1
    return co_occurrence
if __name__ == '__main__':
    r_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table('/Users/weicai/Desktop/cai/analysis/ml-1m/ratings.dat', sep='::',
                            header=None, names=r_names, engine='python')
    # Only (user, movie) pairs matter for co-occurrence counting.
    ratings = ratings.drop(columns=['rating', 'timestamp'])
    # One group per user; 3952 is the movie count in MovieLens 1M.
    co_occurrence = get_co_occurrence(ratings.groupby('user_id'), 3952)
    # Row-normalize (unit L2 norm) so the dot product of two item rows
    # is their cosine similarity, then persist the result.
    normalized = Normalizer().fit_transform(co_occurrence)
    np.save('co_occurrence_nor2', normalized)