Recommender System Notes - 01 - Article Profile

1. Environment

hadoop-2.10.0

hive-3.1.2

hbase-2.2.2

spark-2.4.4

2. Hive Table Schemas

CREATE TABLE T_CHANNEL(
ID BIGINT,
EN_NAME STRING,
CH_NAME STRING
)
COMMENT 'channel table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/channel';

CREATE TABLE T_ARTICLE(
ID STRING,
CHANNEL_ID INT,
USER_ID BIGINT,
URL STRING,
TITLE STRING,
CONTENT STRING,
STATUS INT,
CREATE_TIME STRING,
LAST_UPDATE_TIME STRING
)
COMMENT 'article table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article';

CREATE TABLE T_COMPLETE_ARTICLE(
ID STRING,
CHANNEL_ID INT,
CHANNEL_NAME STRING,
USER_ID BIGINT,
TITLE STRING,
CONTENT STRING,
CREATE_TIME STRING,
COMPLETE_CONTENT STRING,
COMPLETE_CONTENT_WORDS ARRAY<STRING>
)
COMMENT 'complete article table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/complete_article';

CREATE TABLE T_INDEX_KEYWORD_IDF(
INDEX INT,
KEYWORD STRING,
IDF DOUBLE
)
COMMENT 'index keyword idf table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/index_keyword_idf';

CREATE TABLE T_ARTICLE_KEYWORD_TFIDF(
ARTICLE_ID STRING,
CHANNEL_ID INT,
KEYWORD STRING,
TFIDF DOUBLE
)
COMMENT 'article keyword tfidf table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_keyword_tfidf';

CREATE TABLE T_ARTICLE_KEYWORD_TEXTRANK(
ARTICLE_ID STRING,
CHANNEL_ID INT,
KEYWORD STRING,
TEXTRANK DOUBLE
)
COMMENT 'article keyword textrank table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_keyword_textrank';

CREATE TABLE T_ARTICLE_PROFILE(
ARTICLE_ID STRING,
CHANNEL_ID INT,
KEYWORDS MAP<STRING, DOUBLE>,
TOPICS ARRAY<STRING>
)
COMMENT 'article profile table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_profile';

3. Python Implementation
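
The script below imports BaseSparkSession from the offline package and a Segmentation helper from recoutils.segment_utils, neither of which is listed in these notes. A minimal sketch of what BaseSparkSession might look like, assuming it only wraps SparkSession construction around the attributes the subclass sets (SPARK_APP_NAME, SPARK_MASTER_URL, SPARK_YARN_QUEUE, ENABLE_HIVE_SUPPORT); the real helper may differ:

# offline/__init__.py -- hypothetical sketch, not the original implementation
from pyspark.sql import SparkSession


class BaseSparkSession(object):

    SPARK_APP_NAME = None
    SPARK_MASTER_URL = 'yarn'
    SPARK_YARN_QUEUE = 'default'
    ENABLE_HIVE_SUPPORT = False

    def create_spark_session(self):
        # Build a SparkSession from the attributes set by the subclass
        builder = SparkSession.builder \
            .appName(self.SPARK_APP_NAME) \
            .master(self.SPARK_MASTER_URL) \
            .config('spark.yarn.queue', self.SPARK_YARN_QUEUE)
        if self.ENABLE_HIVE_SUPPORT:
            builder = builder.enableHiveSupport()
        return builder.getOrCreate()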

# -*- coding:utf-8 -*-

import os
import sys
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
print sys.path
from offline import BaseSparkSession

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'


class ModelUtils(object):

    # Fit and save a CountVectorizer model
    @staticmethod
    def gen_count_vectorizer_model(input_col, output_col, vocab_size, min_df, complete_article_df, hdfs_path):
        from pyspark.ml.feature import CountVectorizer
        count_vectorizer = CountVectorizer(inputCol=input_col, outputCol=output_col, vocabSize=vocab_size, minDF=min_df)
        count_vectorizer_model = count_vectorizer.fit(complete_article_df)
        count_vectorizer_model.write().overwrite().save(hdfs_path)

    # Load the CountVectorizer model
    @staticmethod
    def get_count_vectorizer_model(hdfs_path):
        from pyspark.ml.feature import CountVectorizerModel
        return CountVectorizerModel.load(hdfs_path)

    # Fit and save an IDF model
    @staticmethod
    def gen_idf_model(input_col, output_col, count_vectorizer_result, hdfs_path):
        from pyspark.ml.feature import IDF
        idf = IDF(inputCol=input_col, outputCol=output_col)
        idf_model = idf.fit(count_vectorizer_result)
        idf_model.write().overwrite().save(hdfs_path)

    # Load the IDF model
    @staticmethod
    def get_idf_model(hdfs_path):
        from pyspark.ml.feature import IDFModel
        return IDFModel.load(hdfs_path)
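
# Note on Spark ML's TF-IDF pipeline (assuming standard pyspark.ml.feature behaviour in 2.4):
#   - CountVectorizer produces a sparse term-frequency vector over a learned vocabulary.
#   - IDF rescales it with idf(t) = ln((m + 1) / (df(t) + 1)), where m is the number of
#     documents and df(t) is the number of documents containing term t.
#   - The transformed value for each term is tf(t, d) * idf(t), which is what
#     extract_tfidf_top_words() later reads from complete_content_idf_features.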


class ArticleProfileGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_profile_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Build the complete article data (join articles with channels, segment the text)
    def gen_complete_article(self):
        self.spark_session.sql("use portal")
        sql = "select a.*, c.ch_name channel_name from t_article a inner join t_channel c on a.channel_id = c.id"
        article_df = self.spark_session.sql(sql)

        def completing(partition):
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.segment_utils import Segmentation
            segmentation = Segmentation()
            for article in partition:
                complete_content = article.channel_name + " " + article.title + " " + article.content
                complete_content_words = [word for word in segmentation.segment(complete_content)
                                          if word not in (' ', u'\u3000')]
                yield (article.id, article.channel_id, article.channel_name, article.user_id, article.title,
                       article.content, article.create_time, complete_content, complete_content_words)

        c_article_columns = ["id", "channel_id", "channel_name", "user_id", "title", "content",
                             "create_time", "complete_content", "complete_content_words"]
        article_df = article_df.rdd.mapPartitions(completing).toDF(c_article_columns)
        article_df.write.insertInto("t_complete_article")
        return article_df

    # Fit and save the CountVectorizer and IDF models
    def gen_article_model(self, count_vectorizer_model_path, idf_model_path):
        complete_article_df = self.spark_session.sql("select * from t_complete_article")
        ModelUtils.gen_count_vectorizer_model(input_col="complete_content_words",
                                              output_col="complete_content_count_features",
                                              vocab_size=200000, min_df=1.0,
                                              complete_article_df=complete_article_df,
                                              hdfs_path=count_vectorizer_model_path)
        count_vectorizer_model = ModelUtils.get_count_vectorizer_model(count_vectorizer_model_path)
        count_vectorizer_result = count_vectorizer_model.transform(complete_article_df)

        ModelUtils.gen_idf_model(input_col="complete_content_count_features",
                                 output_col="complete_content_idf_features",
                                 count_vectorizer_result=count_vectorizer_result,
                                 hdfs_path=idf_model_path)

    # Compute article TF-IDF and TextRank keywords
    def calculate_article_tfidf_textrank(self, complete_article_df, count_vectorizer_model_path, idf_model_path):
        # Load the models and compute TF-IDF
        count_vectorizer_model = ModelUtils.get_count_vectorizer_model(count_vectorizer_model_path)
        count_vectorizer_result = count_vectorizer_model.transform(complete_article_df)
        idf_model = ModelUtils.get_idf_model(idf_model_path)
        tfidf_result = idf_model.transform(count_vectorizer_result)

        # Extract the top TF-IDF words for each article
        def extract_tfidf_top_words(partition):
            for row in partition:
                index_tfidf_list = list(
                    zip(row.complete_content_idf_features.indices, row.complete_content_idf_features.values))
                index_tfidf_list = sorted(index_tfidf_list, key=lambda x: x[1], reverse=True)
                index_tfidf_list = index_tfidf_list[:20]
                for index_tfidf in index_tfidf_list:
                    yield row.id, row.channel_id, int(index_tfidf[0]), round(float(index_tfidf[1]), 4)

        article_index_tfidf_df = tfidf_result.rdd.mapPartitions(extract_tfidf_top_words).toDF(
            ["article_id", "channel_id", "index", "tfidf"])
        article_index_tfidf_df.show()
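        # The sparse TF-IDF vector only exposes vocabulary indices, so the
        # CountVectorizer vocabulary and the IDF weights are zipped below into an
        # index -> (keyword, idf) lookup for joining back to readable keywords.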

        # Index-to-keyword mapping
        keyword_idf_list = list(zip(count_vectorizer_model.vocabulary, idf_model.idf.toArray()))
        for i in xrange(len(keyword_idf_list)):
            keyword_idf = keyword_idf_list[i]
            keyword_idf_list[i] = [i, keyword_idf[0], float(keyword_idf[1])]

        index_keyword_idf_df = self.spark_session.sparkContext.parallelize(keyword_idf_list).toDF(["index", "keyword", "idf"])
        index_keyword_idf_df.show()
        index_keyword_idf_df.write.insertInto("t_index_keyword_idf")

        article_keyword_tfidf_df = article_index_tfidf_df.join(index_keyword_idf_df, ["index"], "inner") \
            .select(["article_id", "channel_id", "keyword", "tfidf"])
        article_keyword_tfidf_df.show()
        article_keyword_tfidf_df.write.insertInto('t_article_keyword_tfidf')

        ALLOW_POS = ['n', 'nr', 'ns', 'nt', 'nw', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd', 'f', 's', 't', 'x']
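        # ALLOW_POS above whitelists the part-of-speech tags kept as TextRank candidates
        # (jieba/ICTCLAS-style tags for nouns, verbs, adjectives, adverbs, time/place
        # words, etc.); the exact tag semantics depend on the segmenter behind recoutils.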

        # Extract the top TextRank words for each article
        def extract_textrank_top_words(partition):
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.segment_utils import Segmentation
            for row in partition:
                results = Segmentation.text_rank(row.complete_content, topK=20, allowPOS=ALLOW_POS, withWeight=True)
                for result in results:
                    yield row.id, row.channel_id, result[0], result[1]

        article_keyword_textrank_df = complete_article_df.rdd.mapPartitions(extract_textrank_top_words) \
            .toDF(["article_id", "channel_id", "keyword", "textrank"])
        article_keyword_textrank_df.show()
        article_keyword_textrank_df.write.insertInto('t_article_keyword_textrank')

        return index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df

    # Generate the article profile
    def gen_article_profile(self, index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df):
        index_keyword_idf_df.createOrReplaceTempView("tmp_index_keyword_idf")
        article_keyword_tfidf_df.createOrReplaceTempView("tmp_article_keyword_tfidf")
        article_keyword_textrank_df.createOrReplaceTempView("tmp_article_keyword_textrank")
        self.spark_session.sql("use portal")
        # Keywords
        sql = "select akt.article_id, akt.channel_id, akt.keyword, akt.textrank, iki.idf " \
              "from tmp_article_keyword_textrank akt left join tmp_index_keyword_idf iki on akt.keyword = iki.keyword"
        article_keyword_df = self.spark_session.sql(sql)
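        # The keyword weight combines the article-level TextRank score with the
        # corpus-level IDF, so words that are both central to this article and rare
        # across the corpus rank highest; keywords missing from the IDF vocabulary
        # end up with a null weight because of the left join above.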

        article_keyword_weight_df = article_keyword_df.withColumn("weight", article_keyword_df.textrank * article_keyword_df.idf) \
            .select(["article_id", "channel_id", "keyword", "weight"])

        article_keyword_weight_df.createOrReplaceTempView("tmp_article_keyword_weight")

        sql = "select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, " \
              "collect_list(weight) weights from tmp_article_keyword_weight group by article_id"
        article_keywords_weights_df = self.spark_session.sql(sql)
        article_keywords_weights_df.show()

        def combine_keywords_weights(partition):
            for row in partition:
                yield row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))

        article_keywords_df = article_keywords_weights_df.rdd.mapPartitions(combine_keywords_weights) \
            .toDF(["article_id", "channel_id", "keywords"])
        article_keywords_df.show()

        # Topics: keywords that appear in both the TF-IDF and TextRank results of the same article
        sql = "select akt1.article_id article_id, collect_set(akt1.keyword) topics from tmp_article_keyword_tfidf akt1 " \
              "inner join tmp_article_keyword_textrank akt2 on akt1.article_id = akt2.article_id " \
              "and akt1.keyword = akt2.keyword group by akt1.article_id"
        article_topics_df = self.spark_session.sql(sql)
        article_topics_df.show()

        article_keywords_topics_df = article_keywords_df.join(article_topics_df, ["article_id"]) \
            .select("article_id", "channel_id", "keywords", "topics")
        article_keywords_topics_df.show()
        article_keywords_topics_df.write.insertInto("t_article_profile")


if __name__ == '__main__':
    article_profile_generator = ArticleProfileGenerator()
    complete_article_df = article_profile_generator.gen_complete_article()
    index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df = article_profile_generator \
        .calculate_article_tfidf_textrank(complete_article_df,
                                          "hdfs://192.168.0.1:9000/user/models/count_vectorizer/cv.model",
                                          "hdfs://192.168.0.1:9000/user/models/idf/idf.model")
    article_profile_generator.gen_article_profile(index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df)
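
Once the job has finished, the generated profiles can be spot-checked from a PySpark shell. A small sketch (table and column names are taken from the DDL in section 2; the session settings are placeholders):

# check_article_profile.py -- illustrative only
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('article_profile_check') \
    .enableHiveSupport() \
    .getOrCreate()

spark.sql("use portal")
# KEYWORDS is a MAP<STRING, DOUBLE>, TOPICS an ARRAY<STRING> (see T_ARTICLE_PROFILE)
spark.sql("select article_id, channel_id, keywords, topics "
          "from t_article_profile limit 10").show(truncate=False)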
