ItemCF算法(利用多進程加速)

實驗環境:mac mini 8G 6核 i5;
實驗數據:MovieLens數據集;
實驗對比:多進程方式,利用循環方式
實驗結果:多進程方式花費131.77949714660645s;循環方式花費259.33885288238525s
兩次所求的物品之間的餘弦相似度矩陣完全相同

多進程方式

import math
import functools
import numpy as np
import pandas as pd

from multiprocessing import Pool
from sklearn.preprocessing import Normalizer

def get_co_occurrence(interests, n=0):
    """
    Build the IUF-weighted item co-occurrence matrix.

    :param interests: iterable of per-user records; each record's
        'movie_id' entry holds the list of 1-based item ids the user rated
    :param n: total number of items (the matrix is n x n)
    :return: (n, n) float ndarray; cell [i-1][j-1] accumulates
        1 / log(1 + |I_u|) over every user u who rated both i and j (i != j)
    """
    # dtype must be float: the per-user increment 1/log(1+len) is
    # fractional, and an integer matrix truncates every in-place addition
    # toward zero. (np.int is also removed from modern NumPy.)
    co_occurrence = np.zeros((n, n), dtype=np.float64)
    for record in interests:
        items = record['movie_id']
        # IUF (inverse user frequency): very active users contribute less
        # to each pair they co-rate.
        increment = 1.0 / math.log(1.0 + len(items))
        for i in items:
            for j in items:
                if i == j:
                    continue
                # item ids are 1-based, matrix indices 0-based
                co_occurrence[i - 1][j - 1] += increment
    return co_occurrence


if __name__ == '__main__':
    # MovieLens 1M ratings: one 'user::movie::rating::timestamp' row per line.
    r_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table('/Users/weicai/Desktop/cai/analysis/ml-1m/ratings.dat', sep='::',
                            header=None, names=r_names, engine='python')

    # Only (user, item) pairs are needed for co-occurrence counting.
    del ratings['rating']
    del ratings['timestamp']
    # One frame per user, each holding that user's rated movie ids.
    user_interests = [group for _, group in ratings.groupby('user_id')]
    # Bind n (3952 = number of movies in ml-1m) so Pool.map can pass a
    # single chunk argument to each worker.
    get_co_occurrence_new = functools.partial(get_co_occurrence, n=3952)
    # Split users evenly across workers instead of hard-coding slice
    # boundaries; summing the partial matrices is order-independent, so
    # the result is identical regardless of chunking.
    n_workers = 6
    chunk_size = math.ceil(len(user_interests) / n_workers)
    chunks = [user_interests[k:k + chunk_size]
              for k in range(0, len(user_interests), chunk_size)]
    with Pool(n_workers) as p:
        partial_matrices = p.map(get_co_occurrence_new, chunks)
    # Row-wise L2 normalization: dot products of rows are then cosine
    # similarities between items.
    co_occurrence_nor = Normalizer().fit_transform(sum(partial_matrices))
    np.save('co_occurrence_nor1', co_occurrence_nor)

循環方式

import numpy as np
import pandas as pd

from sklearn.preprocessing import Normalizer


def get_co_occurrence(interests, n=0):
    """
    Build the raw (unweighted) item co-occurrence matrix.

    :param interests: iterable of (user_id, record) pairs — e.g. a pandas
        groupby — where record['movie_id'] is the list of 1-based item
        ids that user rated
    :param n: total number of items (the matrix is n x n)
    :return: (n, n) int ndarray; cell [i-1][j-1] counts the users who
        rated both item i and item j (i != j)
    """
    # np.int was deprecated and removed from modern NumPy; use an
    # explicit integer dtype instead.
    co_occurrence = np.zeros((n, n), dtype=np.int64)
    for _, record in interests:
        items = record['movie_id']
        for i in items:
            for j in items:
                if i == j:
                    continue
                # item ids are 1-based, matrix indices 0-based
                co_occurrence[i - 1][j - 1] += 1
    return co_occurrence


if __name__ == '__main__':
    # MovieLens 1M ratings: one 'user::movie::rating::timestamp' row per line.
    r_names = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table('/Users/weicai/Desktop/cai/analysis/ml-1m/ratings.dat', sep='::',
                            header=None, names=r_names, engine='python')

    # Only (user, item) pairs are needed for co-occurrence counting.
    del ratings['rating']
    del ratings['timestamp']
    # The groupby object yields (user_id, frame) tuples, which is exactly
    # the shape get_co_occurrence expects. 3952 = number of movies in ml-1m.
    co_occurrence = get_co_occurrence(ratings.groupby('user_id'), 3952)
    # Row-wise L2 normalization: dot products of rows are then cosine
    # similarities between items.
    co_occurrence_nor = Normalizer().fit_transform(co_occurrence)
    np.save('co_occurrence_nor2', co_occurrence_nor)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章