1、相關環境
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
2、相關Hive表結構
-- Channel lookup table. The Python job joins it to t_article on
-- t_article.channel_id = t_channel.id and reads ch_name as the channel name.
-- Rows are stored as comma-delimited text under the LOCATION below.
CREATE TABLE t_channel (
    id      BIGINT,
    en_name STRING,
    ch_name STRING
)
COMMENT 'channel table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/channel';
-- Raw article table: one row per article, stored as comma-delimited text.
-- NOTE(review): channel_id is INT here while t_channel.id is BIGINT; the join
--   relies on implicit widening -- confirm this is intentional.
-- NOTE(review): there is no ESCAPED BY clause, so a comma inside url/title/content
--   would presumably split the field on read -- confirm upstream data is comma-free.
CREATE TABLE t_article (
    id               STRING,
    channel_id       INT,
    user_id          BIGINT,
    url              STRING,
    title            STRING,
    content          STRING,
    status           INT,
    create_time      STRING,
    last_update_time STRING
)
COMMENT 'article table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article';
-- Article rows enriched by the Python job (gen_complete_article):
-- complete_content is "channel_name title content" joined with single spaces,
-- and complete_content_words is that text after word segmentation.
-- The job inserts by position, so the column order here must match the order
-- of the tuple it produces.
CREATE TABLE t_complete_article (
    id                     STRING,
    channel_id             INT,
    channel_name           STRING,
    user_id                BIGINT,
    title                  STRING,
    content                STRING,
    create_time            STRING,
    complete_content       STRING,
    complete_content_words ARRAY<STRING>
)
COMMENT 'complete article table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/complete_article';
-- Vocabulary table written by the Python job: one row per CountVectorizer
-- vocabulary entry, holding its position (index), the word itself and the
-- IDF value taken from the fitted IDF model.
-- NOTE(review): "index" is also a HiveQL keyword -- confirm it parses unquoted
--   as a column name on the target Hive version.
CREATE TABLE t_index_keyword_idf (
    index   INT,
    keyword STRING,
    idf     DOUBLE
)
COMMENT 'index keyword idf table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/index_keyword_idf';
-- Per-article TF-IDF keywords written by the Python job: at most the 20
-- highest-scoring vocabulary words of each article, with the score rounded
-- to 4 decimal places.
CREATE TABLE t_article_keyword_tfidf (
    article_id STRING,
    channel_id INT,
    keyword    STRING,
    tfidf      DOUBLE
)
COMMENT 'article keyword tfidf table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_keyword_tfidf';
-- Per-article TextRank keywords written by the Python job, which requests
-- the top 20 weighted keywords of each article's complete_content.
CREATE TABLE t_article_keyword_textrank (
    article_id STRING,
    channel_id INT,
    keyword    STRING,
    textrank   DOUBLE
)
COMMENT 'article keyword textrank table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_keyword_textrank';
-- Final article profile written by the Python job (gen_article_profile):
-- keywords maps each keyword to its weight (textrank * idf), and topics holds
-- the words that appear in both the TF-IDF and the TextRank keyword sets.
CREATE TABLE t_article_profile (
    article_id STRING,
    channel_id INT,
    keywords   MAP<STRING, DOUBLE>,
    topics     ARRAY<STRING>
)
COMMENT 'article profile table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_profile';
3、相關Python實現
# -*- coding:utf-8 -*-
import os
import sys
# Put the project root (the parent of this file's directory) at the front of
# sys.path so the project-local import below resolves.
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
print(sys.path)
from offline import BaseSparkSession

# Python 2 only: force the interpreter-wide default encoding to UTF-8.
# reload(sys) is needed because site.py removes sys.setdefaultencoding at startup.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Local Windows tool locations. Raw strings keep every backslash literal; the
# previous plain literals only worked because none of the sequences used
# (\d, \p, \h, \s) happens to be a recognised escape.
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
class ModelUtils(object):
    """Static helpers that fit Spark ML feature models, save them, and load them back."""

    @staticmethod
    def gen_count_vectorizer_model(input_col, output_col, vocab_size, min_df, complete_article_df, hdfs_path):
        """Fit a CountVectorizer on ``complete_article_df`` and save the fitted model.

        :param input_col: name of the column holding the token arrays
        :param output_col: name of the count-vector column the model will produce
        :param vocab_size: passed to CountVectorizer as ``vocabSize``
        :param min_df: passed to CountVectorizer as ``minDF``
        :param complete_article_df: DataFrame the vectorizer is fitted on
        :param hdfs_path: where the fitted model is saved (existing content is overwritten)
        :return: None
        """
        from pyspark.ml.feature import CountVectorizer
        estimator = CountVectorizer(inputCol=input_col, outputCol=output_col,
                                    vocabSize=vocab_size, minDF=min_df)
        fitted_model = estimator.fit(complete_article_df)
        model_writer = fitted_model.write().overwrite()
        model_writer.save(hdfs_path)

    @staticmethod
    def get_count_vectorizer_model(hdfs_path):
        """Load and return the CountVectorizerModel stored at ``hdfs_path``."""
        from pyspark.ml.feature import CountVectorizerModel
        loaded_model = CountVectorizerModel.load(hdfs_path)
        return loaded_model

    @staticmethod
    def gen_idf_model(input_col, output_col, count_vectorizer_result, hdfs_path):
        """Fit an IDF model on ``count_vectorizer_result`` and save the fitted model.

        :param input_col: name of the count-vector column to read
        :param output_col: name of the column the IDF model will produce
        :param count_vectorizer_result: DataFrame containing ``input_col``
        :param hdfs_path: where the fitted model is saved (existing content is overwritten)
        :return: None
        """
        from pyspark.ml.feature import IDF
        estimator = IDF(inputCol=input_col, outputCol=output_col)
        fitted_model = estimator.fit(count_vectorizer_result)
        model_writer = fitted_model.write().overwrite()
        model_writer.save(hdfs_path)

    @staticmethod
    def get_idf_model(hdfs_path):
        """Load and return the IDFModel stored at ``hdfs_path``."""
        from pyspark.ml.feature import IDFModel
        loaded_model = IDFModel.load(hdfs_path)
        return loaded_model
class ArticleProfileGenerator(BaseSparkSession):
    """Builds article profiles (weighted keywords plus topic words) with Spark and Hive.

    Pipeline, in the order the ``__main__`` section runs it:
    ``gen_complete_article`` -> ``calculate_article_tfidf_textrank`` -> ``gen_article_profile``.
    ``gen_article_model`` fits and saves the CountVectorizer / IDF models that
    ``calculate_article_tfidf_textrank`` later loads.
    """

    def __init__(self):
        # Settings presumably read by BaseSparkSession.create_spark_session()
        # (defined in the project's ``offline`` package) -- verify there.
        self.SPARK_APP_NAME = 'article_profile_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Build the complete article data
    def gen_complete_article(self):
        """Join articles with their channel, segment the text and insert into t_complete_article.

        :return: the DataFrame that was inserted (columns as in ``c_article_columns``)
        """
        self.spark_session.sql("use portal")
        sql = "select a.*, c.ch_name channel_name from t_article a inner join t_channel c on a.channel_id = c.id"
        article_df = self.spark_session.sql(sql)

        def completing(partition):
            # Make the project root importable before importing the segmenter
            # inside the partition function.
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.segment_utils import Segmentation
            segmentation = Segmentation()
            for article in partition:
                # Hive STRING columns are nullable: treat NULL as empty text
                # instead of failing the task with a TypeError on concatenation.
                complete_content = (article.channel_name or "") + " " + (article.title or "") + " " \
                    + (article.content or "")
                complete_content_words = [word for word in segmentation.segment(complete_content) if
                                          word not in [' ', ' ']]
                yield (article.id, article.channel_id, article.channel_name, article.user_id, article.title,
                       article.content, article.create_time, complete_content, complete_content_words)

        c_article_columns = ["id", "channel_id", "channel_name", "user_id", "title", "content",
                             "create_time", "complete_content", "complete_content_words"]
        article_df = article_df.rdd.mapPartitions(completing).toDF(c_article_columns)
        article_df.write.insertInto("t_complete_article")
        return article_df

    # Build the models
    def gen_article_model(self, count_vectorizer_model_path, idf_model_path):
        """Fit the CountVectorizer and IDF models on t_complete_article and save both.

        :param count_vectorizer_model_path: save location of the CountVectorizer model
        :param idf_model_path: save location of the IDF model
        """
        complete_article_df = self.spark_session.sql("select * from t_complete_article")
        ModelUtils.gen_count_vectorizer_model(input_col="complete_content_words",
                                              output_col="complete_content_count_features",
                                              vocab_size=200000, min_df=1.0,
                                              complete_article_df=complete_article_df,
                                              hdfs_path=count_vectorizer_model_path)
        # The IDF model is fitted on the output of the vectorizer that was just saved.
        count_vectorizer_model = ModelUtils.get_count_vectorizer_model(count_vectorizer_model_path)
        count_vectorizer_result = count_vectorizer_model.transform(complete_article_df)
        ModelUtils.gen_idf_model(input_col="complete_content_count_features",
                                 output_col="complete_content_idf_features",
                                 count_vectorizer_result=count_vectorizer_result,
                                 hdfs_path=idf_model_path)

    # Compute article TFIDF and TEXTRANK
    def calculate_article_tfidf_textrank(self, complete_article_df, count_vectorizer_model_path, idf_model_path):
        """Compute per-article TF-IDF and TextRank keywords and insert them into Hive.

        Inserts into t_index_keyword_idf, t_article_keyword_tfidf and
        t_article_keyword_textrank.

        :param complete_article_df: DataFrame as returned by ``gen_complete_article``
        :param count_vectorizer_model_path: location of the saved CountVectorizer model
        :param idf_model_path: location of the saved IDF model
        :return: tuple ``(index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df)``
        """
        # Load the models and compute TFIDF
        count_vectorizer_model = ModelUtils.get_count_vectorizer_model(count_vectorizer_model_path)
        count_vectorizer_result = count_vectorizer_model.transform(complete_article_df)
        idf_model = ModelUtils.get_idf_model(idf_model_path)
        tfidf_result = idf_model.transform(count_vectorizer_result)

        # Extract the top TFIDF words
        def extract_tfidf_top_words(partition):
            for row in partition:
                index_tfidf_list = list(
                    zip(row.complete_content_idf_features.indices, row.complete_content_idf_features.values))
                # Highest score first; keep at most 20 entries per article.
                index_tfidf_list = sorted(index_tfidf_list, key=lambda x: x[1], reverse=True)
                index_tfidf_list = index_tfidf_list[:20]
                for index_tfidf in index_tfidf_list:
                    yield row.id, row.channel_id, int(index_tfidf[0]), round(float(index_tfidf[1]), 4)

        article_index_tfidf_df = tfidf_result.rdd.mapPartitions(extract_tfidf_top_words).toDF(
            ["article_id", "channel_id", "index", "tfidf"])
        article_index_tfidf_df.show()

        # Map vocabulary index -> keyword -> idf
        keyword_idf_list = [
            [index, keyword, float(idf)]
            for index, (keyword, idf) in enumerate(zip(count_vectorizer_model.vocabulary, idf_model.idf.toArray()))
        ]
        index_keyword_idf_df = self.spark_session.sparkContext.parallelize(keyword_idf_list) \
            .toDF(["index", "keyword", "idf"])
        index_keyword_idf_df.show()
        index_keyword_idf_df.write.insertInto("t_index_keyword_idf")

        # Replace the vocabulary index with the actual keyword.
        article_keyword_tfidf_df = article_index_tfidf_df.join(index_keyword_idf_df, ["index"], "inner") \
            .select(["article_id", "channel_id", "keyword", "tfidf"])
        article_keyword_tfidf_df.show()
        article_keyword_tfidf_df.write.insertInto('t_article_keyword_tfidf')

        # Part-of-speech tags passed to Segmentation.text_rank as allowPOS.
        ALLOW_POS = ['n', 'nr', 'ns', 'nt', 'nw', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd', 'f', 's', 't', 'x']

        # Extract the top TEXTRANK words
        def extract_textrank_top_words(partition):
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            from recoutils.segment_utils import Segmentation
            for row in partition:
                results = Segmentation.text_rank(row.complete_content, topK=20, allowPOS=ALLOW_POS, withWeight=True)
                for result in results:
                    yield row.id, row.channel_id, result[0], result[1]

        article_keyword_textrank_df = complete_article_df.rdd.mapPartitions(extract_textrank_top_words) \
            .toDF(["article_id", "channel_id", "keyword", "textrank"])
        article_keyword_textrank_df.show()
        article_keyword_textrank_df.write.insertInto('t_article_keyword_textrank')
        return index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df

    # Build the article profile
    def gen_article_profile(self, index_keyword_idf_df, article_keyword_tfidf_df, article_keyword_textrank_df):
        """Combine the keyword tables into one profile row per article and insert into t_article_profile.

        ``keywords`` maps each TextRank keyword to ``textrank * idf``; ``topics`` is the
        set of words that are both a TF-IDF and a TextRank keyword of the same article.

        :param index_keyword_idf_df: vocabulary DataFrame (index, keyword, idf)
        :param article_keyword_tfidf_df: DataFrame (article_id, channel_id, keyword, tfidf)
        :param article_keyword_textrank_df: DataFrame (article_id, channel_id, keyword, textrank)
        """
        # createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
        index_keyword_idf_df.createOrReplaceTempView("tmp_index_keyword_idf")
        article_keyword_tfidf_df.createOrReplaceTempView("tmp_article_keyword_tfidf")
        article_keyword_textrank_df.createOrReplaceTempView("tmp_article_keyword_textrank")
        self.spark_session.sql("use portal")

        # Keywords
        sql = "select akt.article_id, akt.channel_id, akt.keyword, akt.textrank, iki.idf " \
              "from tmp_article_keyword_textrank akt left join tmp_index_keyword_idf iki on akt.keyword = iki.keyword"
        article_keyword_df = self.spark_session.sql(sql)
        article_keyword_weight_df = article_keyword_df \
            .withColumn("weight", article_keyword_df.textrank * article_keyword_df.idf) \
            .select(["article_id", "channel_id", "keyword", "weight"])
        article_keyword_weight_df.createOrReplaceTempView("tmp_article_keyword_weight")

        # Collect (keyword, weight) as pairs. collect_list skips NULLs, and the left
        # join above leaves weight NULL for keywords missing from the vocabulary, so
        # two separate collect_list columns can end up with different lengths and
        # zip() would then attach weights to the wrong keywords.
        sql = "select article_id, min(channel_id) channel_id, " \
              "collect_list(struct(keyword, weight)) keyword_weights " \
              "from tmp_article_keyword_weight group by article_id"
        article_keywords_weights_df = self.spark_session.sql(sql)
        article_keywords_weights_df.show()

        def combine_keywords_weights(partition):
            for row in partition:
                # pair[0] is the keyword, pair[1] its weight; keywords without a weight are dropped.
                keywords = dict((pair[0], pair[1]) for pair in row.keyword_weights if pair[1] is not None)
                yield row.article_id, row.channel_id, keywords

        article_keywords_df = article_keywords_weights_df.rdd.mapPartitions(combine_keywords_weights) \
            .toDF(["article_id", "channel_id", "keywords"])
        article_keywords_df.show()

        # Topic words: words present in both the TFIDF and the TEXTRANK keywords of
        # the same article. The article_id condition keeps another article's
        # TEXTRANK keywords from counting as a co-occurrence.
        sql = "select akt1.article_id article_id, collect_set(akt1.keyword) topics " \
              "from tmp_article_keyword_tfidf akt1 inner join tmp_article_keyword_textrank akt2 " \
              "on akt1.article_id = akt2.article_id and akt1.keyword = akt2.keyword group by akt1.article_id"
        article_topics_df = self.spark_session.sql(sql)
        article_topics_df.show()

        article_keywords_topics_df = article_keywords_df.join(article_topics_df, ["article_id"]) \
            .select("article_id", "channel_id", "keywords", "topics")
        article_keywords_topics_df.show()
        article_keywords_topics_df.write.insertInto("t_article_profile")
if __name__ == '__main__':
    # Locations of the previously saved models. gen_article_model() is not
    # invoked here, so both models must already exist at these paths.
    cv_model_path = "hdfs://192.168.0.1:9000/user/models/count_vectorizer/cv.model"
    idf_model_path = "hdfs://192.168.0.1:9000/user/models/idf/idf.model"

    generator = ArticleProfileGenerator()
    # Step 1: build and store the complete (segmented) articles.
    articles_df = generator.gen_complete_article()
    # Step 2: compute and store the TF-IDF / TextRank keywords.
    keyword_frames = generator.calculate_article_tfidf_textrank(articles_df, cv_model_path, idf_model_path)
    idf_df, tfidf_df, textrank_df = keyword_frames
    # Step 3: merge everything into the article profile table.
    generator.gen_article_profile(idf_df, tfidf_df, textrank_df)