推薦系統筆記-03-用戶畫像

1、相關環境

hadoop-2.10.0

hive-3.1.2

hbase-2.2.2

spark-2.4.4

2、相關表結構

Hive

-- Raw user operation log: one row per operation a user performed on an article.
-- OP_TYPE encodes the operation (1 display, 2 click, 3 collect, 4 share).
-- Stored as comma-delimited text under the LOCATION given below.
-- OP_TIME is kept as a string (the Python job parses it as '%Y-%m-%d %H:%M:%S').
CREATE TABLE T_USER_OP_LOG(
USER_ID BIGINT,
ARTICLE_ID STRING,
CHANNEL_ID INT,
OP_TYPE INT COMMENT '1 display 2 click 3 collect 4 share',
OP_TIME STRING,
OP_DURATION INT,
ALGO INT
)
COMMENT 'user operation log table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/user_op_log';

-- Aggregated behaviour table filled by the Python job's gen_user_behaviour():
-- the job groups the operation log by (user_id, article_id) and inserts one row per group.
-- The four *_FLAG columns are written as 0/1 values by that job.
-- Stored as comma-delimited text under the LOCATION given below.
CREATE TABLE T_USER_BEHAVIOUR(
USER_ID BIGINT,
ARTICLE_ID STRING,
CHANNEL_ID INT,
DISPLAY_FLAG INT,
CLICK_FLAG INT,
COLLECT_FLAG INT,
SHARE_FLAG INT,
OP_TIME STRING,
OP_DURATION INT
)
COMMENT 'user behaviour table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/user_behaviour';

HBase

# HBase shell: create the 'user_profile' table with two column families.
# The Python job writes basic attributes under 'b' and preference weights under 'p'.
create 'user_profile', 'b', 'p'

3、相關Python實現

# -*- coding:utf-8 -*-

import os
import sys
import numpy as np
from datetime import datetime
# BASE_PATH is the parent of the directory containing this file; it is put at
# the front of sys.path before the project import that follows
# (presumably so that the `offline` package resolves -- confirm project layout).
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(BASE_PATH))
# Prints the resulting module search path (Python 2 print statement).
print sys.path
from offline import BaseSparkSession

# Python 2 idiom: switch the interpreter-wide default codec to UTF-8.
# reload(sys) is needed because site.py removes sys.setdefaultencoding at startup.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Local Windows tool locations exported for PySpark / Hadoop.
# Raw strings keep every backslash literal; the previous plain literals only
# worked because none of the path segments happened to form a valid escape
# sequence (a segment starting with e.g. "t" or "n" would have been corrupted).
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'


class UserProfileGenerator(BaseSparkSession):
    """Offline Spark job that builds user profiles.

    Each stage is a separate method:

    * ``gen_user_behaviour`` folds ``t_user_op_log`` into one row per
      (user, article) and inserts it into ``t_user_behaviour``.
    * ``gen_user_basic_profile`` copies ``t_user`` attributes into the ``b``
      column family of the HBase ``user_profile`` table.
    * ``gen_user_preference_profile`` writes per-channel topic weights into
      the ``p`` column family of the same table.
    """

    # HBase connection settings handed to HBaseUtils on the executors.
    # Subclasses may override them; the defaults are the previously
    # hard-coded values.
    HBASE_HOST = "192.168.0.1"
    HBASE_PORT = 9090
    HBASE_POOL_SIZE = 5

    def __init__(self):
        # Session settings; presumably read by the inherited
        # create_spark_session() -- confirm against BaseSparkSession.
        self.SPARK_APP_NAME = 'user_profile_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    def gen_user_behaviour(self):
        """Generate user behaviour data from the raw operation log.

        Every log row is turned into display/click/collect/share flags
        according to its ``op_type``; rows are then merged per
        (user_id, article_id) and inserted into ``t_user_behaviour``.
        Rows whose ``op_type`` is not 1-4 are dropped.
        """
        self.spark_session.sql("use portal")
        user_op_log_df = self.spark_session.sql("select * from t_user_op_log")

        # op_type -> (display_flag, click_flag, collect_flag, share_flag).
        # A click also counts as a display; collect and share also count as
        # a click.
        flags_by_op_type = {
            1: (1, 0, 0, 0),
            2: (1, 1, 0, 0),
            3: (1, 1, 1, 0),
            4: (1, 1, 0, 1),
        }

        def convert_to_behaviour(partition):
            for row in partition:
                flags = flags_by_op_type.get(row.op_type)
                if flags is None:
                    continue
                yield ((row.user_id, row.article_id, row.channel_id)
                       + flags
                       + (row.op_time, row.op_duration))

        user_behaviour_df = user_op_log_df.rdd.mapPartitions(convert_to_behaviour) \
            .toDF(["user_id", "article_id", "channel_id", "display_flag", "click_flag",
                   "collect_flag", "share_flag", "op_time", "op_duration"])
        user_behaviour_df.show()

        # Merge all behaviour rows of the same user/article pair.
        # createOrReplaceTempView replaces registerTempTable, which has been
        # deprecated since Spark 2.0.
        user_behaviour_df.createOrReplaceTempView("tmp_user_behaviour")
        sql = "select user_id, article_id, min(channel_id) channel_id, max(display_flag) display_flag, " \
              "max(click_flag) click_flag, max(collect_flag) collect_flag, max(share_flag) share_flag," \
              "max(op_time) op_time, max(op_duration) op_duration from tmp_user_behaviour group by user_id, article_id"
        user_behaviour_df = self.spark_session.sql(sql)
        user_behaviour_df.show()
        user_behaviour_df.write.insertInto("t_user_behaviour")

    def gen_user_basic_profile(self):
        """Generate the basic user profile.

        Reads every row of ``t_user`` and stores name, gender, age, mobile
        phone and e-mail under row key ``u:<id>`` in the ``b`` column family
        of the HBase ``user_profile`` table.
        """
        self.spark_session.sql("use portal")
        user_df = self.spark_session.sql("select * from t_user")
        user_df.show()

        # Copy the settings into plain locals so the closure shipped to the
        # executors does not capture ``self`` (and with it the Spark session).
        hbase_host = self.HBASE_HOST
        hbase_port = self.HBASE_PORT
        hbase_pool_size = self.HBASE_POOL_SIZE

        def insert_user_profile(partition):
            # Runs on the executor: extend sys.path there before the
            # project-local import.
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host=hbase_host, port=hbase_port, size=hbase_pool_size)

            for row in partition:
                basic_info = {
                    "b:name": row.name,
                    "b:gender": str(row.gender),
                    "b:age": str(row.age),
                    "b:mobile_phone": row.mobile_phone,
                    "b:email": row.email
                }
                hbase_utils.insert("user_profile", "u:{}".format(row.id).encode(), basic_info)

        user_df.foreachPartition(insert_user_profile)

    def gen_user_preference_profile(self):
        """Generate the user preference profile.

        Joins clicked/collected/shared behaviour rows with the article
        topics, explodes the topics to one row each and writes a weight per
        (channel, topic) to column ``p:<channel_id>:<topic>`` of row
        ``u:<user_id>`` in the HBase ``user_profile`` table.

        weight = time decay * (click*5 + collect*3 + share*4 + duration weight),
        formatted with four decimals.
        """
        self.spark_session.sql("use portal")
        # Attach the article topic words to each behaviour row.
        sql = "select ub.*, ap.topics from t_user_behaviour ub " \
              "left join t_article_profile ap on ub.article_id = ap.article_id " \
              "where (ub.click_flag != 0 or ub.collect_flag != 0 or ub.share_flag != 0) and ap.topics is not null"
        user_behaviour_df = self.spark_session.sql(sql)
        user_behaviour_df.show()

        import pyspark.sql.functions as F
        user_behaviour_df = user_behaviour_df.withColumn("topic", F.explode("topics")).drop("topics")
        user_behaviour_df.show()

        # Plain locals keep ``self`` out of the closure sent to the executors.
        hbase_host = self.HBASE_HOST
        hbase_port = self.HBASE_PORT
        hbase_pool_size = self.HBASE_POOL_SIZE

        def insert_user_profile(partition):
            # Runs on the executor: extend sys.path there before the
            # project-local import.
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host=hbase_host, port=hbase_port, size=hbase_pool_size)

            # One reference time per partition so every row is decayed
            # against the same instant.
            now = datetime.now()

            for row in partition:
                elapsed = now - datetime.strptime(row.op_time, '%Y-%m-%d %H:%M:%S')
                # Clamp at zero: an op_time later than ``now`` (e.g. clock
                # skew) would otherwise feed log() a value <= 0 and store
                # nan / -0.0 as the weight.
                age_days = max(elapsed.days, 0)
                time_exp = 1.0 / (np.log(age_days + 1) + 1)
                # Longer reads (op_duration above 1000) count double; a
                # missing duration counts as a short read.
                long_read = row.op_duration is not None and row.op_duration > 1000
                duration_weight = 2 if long_read else 1
                weight = time_exp * (row.click_flag * 5 + row.collect_flag * 3 + row.share_flag * 4 + duration_weight)

                # NOTE(review): several articles of one user can share the
                # same channel/topic and therefore the same cell; presumably
                # a later insert replaces an earlier one instead of adding to
                # it -- confirm this is intended.
                hbase_utils.insert("user_profile", "u:{}".format(row.user_id).encode(),
                    {"p:{}:{}".format(row.channel_id, row.topic).encode(): b'%0.4f' % weight})

        user_behaviour_df.foreachPartition(insert_user_profile)


if __name__ == '__main__':
    # Script entry point: build the generator once, then run the three
    # stages in order (behaviour table, basic profile, preference profile).
    generator = UserProfileGenerator()
    for stage in (generator.gen_user_behaviour,
                  generator.gen_user_basic_profile,
                  generator.gen_user_preference_profile):
        stage()

人生偌只如初見

發佈了119 篇原創文章 · 獲贊 68 · 訪問量 34萬+

私信關注

推薦系統筆記-03-用戶畫像

使用c#強大的表達式樹實現對象的深克隆之解決循環引用的問題

痞子衡嵌入式：恩智浦i.MX RT1xxx系列MCU啓動那些事（12.A）- uSDHC eMMC啓動時間(RT1170)

GPT-4o 引領人機交互新風向，向量數據庫賽道沸騰了

企業大模型如何成爲自己數據的“百科全書”？

本地SSL證書過期輸入命令在IIS自動生成

基於Ubuntu-22.04安裝K8s-v1.28.2實驗（二）使用kube-vip實現集羣VIP訪問

.NET週刊【5月第2期 2024-05-12】

Python學習筆記-WXPY語音信息識別

推薦系統筆記-01-文章畫像

Python學習筆記-WXPY初識

Activiti學習筆記-整合SpringBoot與簡單使用

ElasticSearch學習筆記-ngram、中文拼音、簡繁體搜索記錄

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結