Using Matrix Factorization for Recommendation

Overall Architecture

  1. Read the data
  2. Train the model

Step-by-Step Details

Generating Training Data

Two intermediate files are generated from Hadoop:
(1) docid, pv, cl
(2) mid, cl_docid_duration_dict, pv_docid_set
The training data is then produced by the following scripts:
get_docid_doctype_videotime_by_kv.py
gen_mid_docid.py
gen_train_data.py
The output is rows of mid, docid, score. The key part is how score is computed, shown below:

def evaluate_score(docid_duration, docid_doctype_videotime_dict):
    # docid_duration looks like "docid:duration"
    items0 = docid_duration.split(':')
    if len(items0) != 2:
        return 0
    docid = items0[0]
    duration = int(items0[1]) if items0[1] != '0' else 5
    doctype, videotime = docid_doctype_videotime_dict[docid]
    score = 0  # default score for unknown doctypes
    if doctype == 0:
        # doctype 0: cap duration at 180 seconds
        duration = 180 if duration > 180 else duration
        score = 0.5 + 0.5 * (duration / 180.0)
    if doctype == 1:
        # doctype 1: clamp videotime to [60, 360] and cap duration at videotime
        videotime = 60 if videotime < 60 else (360 if videotime > 360 else videotime)
        duration = videotime if duration > videotime else duration
        score = 0.5 + 0.5 * (duration * 1.0 / videotime) * (0.5 + 0.1 * (videotime - 60) / 60.0)
    if doctype == 2:
        # doctype 2: clamp videotime to [10, 360] and cap duration at videotime
        videotime = 10 if videotime < 10 else (360 if videotime > 360 else videotime)
        duration = videotime if duration > videotime else duration
        score = 0.5 + 0.5 * (duration * 1.0 / videotime)
    return score
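
To show where this score function fits, here is a minimal sketch of what gen_train_data.py might do. The file layout, field order, and the "docid1:duration1,docid2:duration2" format of cl_docid_duration_dict are assumptions based on the description above, not the actual script:

def gen_train_data(mid_docid_filepath, output_filepath, docid_doctype_videotime_dict):
    # hypothetical sketch: turn each user's click dict into (mid, docid, score) rows
    with open(mid_docid_filepath) as fr, open(output_filepath, 'w') as fw:
        for line in fr:
            items = line.strip().split('\t')
            if len(items) < 2:
                continue
            mid, cl_docid_duration_dict = items[0], items[1]
            # assumed format: "docid1:duration1,docid2:duration2"
            for docid_duration in cl_docid_duration_dict.split(','):
                docid = docid_duration.split(':')[0]
                if docid not in docid_doctype_videotime_dict:
                    continue
                score = evaluate_score(docid_duration, docid_doctype_videotime_dict)
                if score <= 0:
                    continue
                fw.write('\t'.join([mid, docid, str(score)]) + '\n')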

BM25 matrix factorization

Read the mid, docid, score file produced in the previous step and build three lists: userid_list, itemid_list and score_list (a loading sketch follows below).
Then train the model to obtain the item and user embeddings.
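
A minimal sketch of this loading step, assuming the mid, docid, score file is tab-separated and that raw mid/docid strings are mapped to contiguous integer ids (the file path and the mid2id/docid2id helpers are hypothetical names, not from the original scripts):

userid_list, itemid_list, score_list = [], [], []
mid2id, docid2id = {}, {}

def load_train_data(train_data_filepath):
    global max_userid, max_itemid
    with open(train_data_filepath) as fr:
        for line in fr:
            items = line.strip().split('\t')
            if len(items) != 3:
                continue
            mid, docid, score = items
            # assign contiguous integer ids so the matrix index space has no gaps
            userid = mid2id.setdefault(mid, len(mid2id))
            itemid = docid2id.setdefault(docid, len(docid2id))
            userid_list.append(userid)
            itemid_list.append(itemid)
            score_list.append(float(score))
    max_userid = len(mid2id) - 1
    max_itemid = len(docid2id) - 1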

from scipy import sparse
import implicit
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender, TFIDFRecommender, bm25_weight)

def train_model():
    global userid_list, itemid_list, score_list, max_userid, max_itemid, csr_item_user_mat, model
    # build the user-item interaction matrix, then transpose it to item-user
    coo_user_item_mat = sparse.coo_matrix((score_list, (userid_list, itemid_list)), shape=(max_userid + 1, max_itemid + 1))
    coo_item_user_mat = coo_user_item_mat.T
    # re-weight the interactions with BM25 before factorization
    coo_item_user_mat = bm25_weight(coo_item_user_mat, K1=100, B=0.8)
    csr_item_user_mat = coo_item_user_mat.tocsr()
    model = implicit.als.AlternatingLeastSquares(factors=100, iterations=10, calculate_training_loss=True)
    model.fit(csr_item_user_mat)


def output_embedding(user_embedding_filepath, item_embedding_filepath):
    global model
    # dump one embedding per line: internal integer id, then comma-separated factors
    fw0 = open(user_embedding_filepath, 'w')
    fw1 = open(item_embedding_filepath, 'w')
    for i, emb in enumerate(model.user_factors):
        fw0.write(str(i) + '\t' + ','.join(map(str, emb)) + '\n')
    for i, emb in enumerate(model.item_factors):
        fw1.write(str(i) + '\t' + ','.join(map(str, emb)) + '\n')
    fw0.close()
    fw1.close()
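
As a quick sanity check on the learned embeddings, here is a hedged example of finding the items most similar to a given item directly from model.item_factors with numpy (the function name and topn parameter are my own; item ids are the internal integer ids, not raw docids):

import numpy as np

def most_similar_items(itemid, topn=10):
    global model
    item_factors = np.asarray(model.item_factors)
    # cosine similarity between the query item and all items
    norms = np.linalg.norm(item_factors, axis=1) + 1e-8
    sims = item_factors.dot(item_factors[itemid]) / (norms * norms[itemid])
    # skip position 0, which is the query item itself
    return np.argsort(-sims)[1:topn + 1]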

A few points worth noting here (a small example follows this list):

  1. sparse.coo_matrix()
    Calling .tocsr() on a COO matrix converts it to compressed sparse row format; duplicate entries are summed together.
    Reference
  2. bm25_weight
    We could implement the matrix factorization ourselves with numpy or hand-written formulas, but that would be slow. The faster route here is the implicit library, together with its bm25_weight function for re-weighting the interactions.
    Reference
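
A small example of both points, on made-up toy data, just to illustrate the behavior:

import numpy as np
from scipy import sparse
from implicit.nearest_neighbours import bm25_weight

# 1. coo_matrix: duplicate (row, col) entries are summed when converting to CSR
rows = [0, 0, 1]
cols = [2, 2, 0]
vals = [1.0, 2.0, 5.0]
mat = sparse.coo_matrix((vals, (rows, cols)), shape=(2, 3))
print(mat.tocsr().toarray())   # entry (0, 2) becomes 1.0 + 2.0 = 3.0

# 2. bm25_weight: re-weights the interaction matrix before factorization
weighted = bm25_weight(mat.tocsr(), K1=100, B=0.8)
print(weighted.toarray())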

lightfm

import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset

def train_model():
    global mid_list, docid_list, score_list, dataset, _id_user_mapping, _id_item_mapping, model
    dataset = Dataset()
    dataset.fit(users=np.unique(mid_list), items=np.unique(docid_list))
    print('len(dataset._user_id_mapping):', len(dataset._user_id_mapping))
    print('len(dataset._item_id_mapping):', len(dataset._item_id_mapping))
    # inverse mappings: internal integer id -> raw mid / docid
    _id_user_mapping = {v: k for k, v in dataset._user_id_mapping.items()}
    _id_item_mapping = {v: k for k, v in dataset._item_id_mapping.items()}

    train_interactions, train_weights = dataset.build_interactions((mid_list[i], docid_list[i], score_list[i]) for i in range(len(score_list)))
    model = LightFM(no_components=100, loss='warp')
    print('model.fit start ...')
    # my_epochs is assumed to be defined at module level
    model.fit(interactions=train_interactions, sample_weight=train_weights, epochs=my_epochs, num_threads=20, verbose=True)
    print('model.fit end ...')

    item_biases, item_representations = model.get_item_representations()
    print(item_biases.shape)
    print(item_representations.shape)

    user_biases, user_embeddings = model.get_user_representations()
    print(user_biases.shape)
    print(user_embeddings.shape)
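
To write these embeddings out in the same format as output_embedding above, here is a minimal sketch that uses the inverse mappings built in train_model to recover raw mids and docids (the function name and file paths are placeholders):

def output_lightfm_embedding(user_embedding_filepath, item_embedding_filepath):
    global model, _id_user_mapping, _id_item_mapping
    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
    # one embedding per line: raw id, then comma-separated factors
    with open(user_embedding_filepath, 'w') as fw0:
        for i, emb in enumerate(user_embeddings):
            fw0.write(str(_id_user_mapping[i]) + '\t' + ','.join(map(str, emb)) + '\n')
    with open(item_embedding_filepath, 'w') as fw1:
        for i, emb in enumerate(item_embeddings):
            fw1.write(str(_id_item_mapping[i]) + '\t' + ','.join(map(str, emb)) + '\n')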