推薦系統筆記-05-用戶基於內容召回集

1、相關環境

hadoop-2.10.0

hive-3.1.2

hbase-2.2.2

spark-2.4.4

2、相關表結構

HBase

alter 'multiple_recall', {NAME=>'content', TTL=>2592000, VERSIONS=>9999}

3、相關Python實現

# -*- coding:utf-8 -*-

import os
import sys
import numpy as np
from datetime import datetime
# Project root: the directory two levels above this file. It is put first on
# sys.path so that the project-local packages imported below can be resolved.
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
# Debug output of the effective module search path. The parenthesised
# single-argument form prints the same text on Python 2 and also parses on
# Python 3.
print(sys.path)
from offline import BaseSparkSession

# Python 2 workaround: make UTF-8 the interpreter-wide default string encoding.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # sys.setdefaultencoding is removed from the sys module at start-up on
    # Python 2; reloading the module makes it available again.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Local Windows development paths for the Python interpreter, Hadoop and Spark.
# Raw strings keep every backslash literal instead of relying on sequences such
# as "\d" or "\s" not being recognised as escapes.
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'


class UserRecallBasedContent(BaseSparkSession):
    """Spark job that builds content-based recall lists for users.

    For every clicked article in the Hive table ``portal.t_user_behaviour`` the
    job reads the article's entries from the HBase table
    ``article_similarity``, drops everything already stored for that user and
    channel in ``history_recall``, and writes the remaining article ids to
    ``multiple_recall`` (column family ``content``) and to ``history_recall``.
    """

    def __init__(self):
        # Session settings; presumably consumed by
        # BaseSparkSession.create_spark_session() -- confirm in the offline
        # package.
        self.SPARK_APP_NAME = 'user_recall_based_content'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Generate the user recall results.
    def gen_user_recall(self, top_k=5):
        """Compute the recall lists and store them in HBase.

        :param top_k: maximum number of similar articles kept per clicked
            article before history filtering. Defaults to 5, the value that
            was previously hard-coded.
        :return: None. Results are written to the HBase tables
            ``multiple_recall`` and ``history_recall``.
        """
        self.spark_session.sql("use portal")
        # User article-click behaviour.
        sql = "select user_id, article_id, channel_id from t_user_behaviour where click_flag = 1"
        user_article_click_behaviour_df = self.spark_session.sql(sql)
        user_article_click_behaviour_df.show()

        # Store the recall data of one partition in HBase.
        def insert_user_recall(partition):
            # Runs on the executors, so the imports and the HBase client are
            # created here rather than captured from the driver.
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            import ast
            import json
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)

            for row in partition:
                # Fetch the similar-article entries of the clicked article.
                article_id = row.article_id
                art_sim_art_result = hbase_utils.read_rows("article_similarity",
                                                           [b"{}".format(article_id)],
                                                           columns=[b"sim"])
                if not art_sim_art_result:
                    continue

                # Sort by the stored value, highest first, and keep the top K.
                # NOTE(review): if read_rows returns the values as strings this
                # ordering is lexicographic, not numeric -- confirm.
                sorted_result = sorted(art_sim_art_result[0][1].items(), key=lambda item: item[1], reverse=True)
                # Column names look like "<family>:<article id>"; keep the id.
                art_sim_art_list = [art_sim[0].split(":")[1] for art_sim in sorted_result][:top_k]
                print("{} sim arts {}".format(article_id, art_sim_art_list))

                # Fetch the recall history of this user and channel.
                history_recall_list = hbase_utils.read_cells("history_recall",
                                                             "u:{}".format(row.user_id).encode(),
                                                             "channel:{}".format(row.channel_id).encode())
                history_recommend_list = []
                for history_recall in history_recall_list:
                    # The cells are list literals written with json.dumps
                    # below. literal_eval parses them like eval() did, but it
                    # cannot execute code if a cell holds something else.
                    history_recommend_list.extend(ast.literal_eval(history_recall))

                # Remove articles that were already recalled earlier.
                recommend_list = list(set(art_sim_art_list) - set(history_recommend_list))

                # Store the new recall list and append it to the history.
                if recommend_list:
                    hbase_utils.insert("multiple_recall",
                                       "u:{}".format(row.user_id).encode(),
                                       {"content:{}".format(row.channel_id).encode(): json.dumps(recommend_list).encode()})
                    hbase_utils.insert("history_recall",
                                       "u:{}".format(row.user_id).encode(),
                                       {"channel:{}".format(row.channel_id).encode(): json.dumps(recommend_list).encode()})

        user_article_click_behaviour_df.foreachPartition(insert_user_recall)


if __name__ == '__main__':
    # Script entry point: create the job (which opens the Spark session) and
    # run the recall generation once.
    UserRecallBasedContent().gen_user_recall()

 

發佈了119 篇原創文章 · 獲贊 68 · 訪問量 34萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章