Computing Similar Articles with TF-IDF

%spark_recommend.pyspark
import re

import jieba
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import ArrayType, StringType

dp='2020-04-20'
start_date='2020-04-20'
end_date='2020-04-20'

STOP_WORDS = set(['的', '之', '吗', '了'])

def is_chinese(content):
    """Check whether the token is pure Chinese."""
    return re.compile('^[\u4e00-\u9fa5]*$').match(content) is not None

def cut_post(content):
    """Segment the content with jieba, keeping only Chinese tokens that are not stop words."""
    return [i for i in jieba.cut(content) if is_chinese(i) and i not in STOP_WORDS]
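# A quick sanity check of the tokenizer (the exact split depends on jieba's
# dictionary; stop words such as '的' are dropped):
# print(cut_post('今天的天气很好'))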

spark = SparkSession.builder \
    .appName("cold_start_decay_group_1") \
    .enableHiveSupport() \
    .getOrCreate()
    
tb_post = 'dwd.dwd_post_fact'

# Step 1: take posts from the last 600 days
# Step 2: compute hot posts: more than 2 clicks in the last 30 days (not implemented in this snippet; see the sketch below)
# Step 3: union the hot posts with the C-type posts, then distinct (see the sketch below)
# Step 4: register the word-segmentation UDF (segment, drop non-Chinese tokens, drop stop words), returning ArrayType(StringType()).
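# Steps 2-3 are not shown in this snippet. A minimal sketch, assuming a
# hypothetical click-log table dwd.dwd_post_click_fact (post_id, click_date)
# and a hypothetical DataFrame c_type_posts holding the C-type posts:
# hot = (spark.table('dwd.dwd_post_click_fact')
#        .filter("click_date >= date_sub('%s', 30)" % dp)
#        .groupBy('post_id').agg(F.count('*').alias('clicks'))
#        .filter('clicks > 2'))
# candidate_ids = hot.select('post_id').union(c_type_posts.select('post_id')).distinct()
# The df built below could then be inner-joined to candidate_ids on post_id.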

df = spark.table(tb_post)
df = df.filter(df.post_type.isin([2, 5, 8])).filter(df.display_yn == 1)
df = df.filter('post_video_yn = 0')
df = df.filter('content is not null')

df = df.filter(df.create_date >=start_date)
df = df.selectExpr('post_id', 'content', 'create_date', 'post_video_yn','tags.name tags', 'uid')
assert df.count() > 1000, 'Source table %s looks broken: there should be more than 1000 posts' % tb_post

post = df.distinct()
spark.udf.register('cut_udf', cut_post, ArrayType(StringType()))
post = post.selectExpr('post_id', 'create_date', 'cut_udf(content) as words')
word_cnt = post.select(F.explode('words').alias('word')).distinct().count()
print('Number of distinct words after deduplication: %d' % word_cnt)
# post.write.mode('overwrite').saveAsTable('tmp.post_words3') 
post.show()

The processed data is what post.show() prints above.
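From here, a common way to get to "similar articles" is to turn the words column into TF-IDF vectors with Spark ML and compare posts by cosine similarity. The sketch below is one possible continuation rather than the original job: the numFeatures value, the 0.3 similarity threshold and the pairwise self-join are illustrative assumptions (for a large corpus an approximate LSH join would usually replace the self-join).

from pyspark.ml.feature import HashingTF, IDF, Normalizer
from pyspark.sql.types import DoubleType

# Term frequency: hash each word list into a fixed-size sparse vector.
# numFeatures should be at least on the order of the distinct word count printed above.
tf = HashingTF(inputCol='words', outputCol='tf', numFeatures=1 << 18)
post_tf = tf.transform(post)

# Inverse document frequency, then L2-normalise so that the dot product of
# two vectors equals their cosine similarity.
idf_model = IDF(inputCol='tf', outputCol='tfidf').fit(post_tf)
post_tfidf = Normalizer(inputCol='tfidf', outputCol='norm_tfidf', p=2.0) \
    .transform(idf_model.transform(post_tf))

# Pairwise cosine similarity via a self-join (quadratic in the number of posts;
# acceptable for a small candidate set, otherwise switch to an approximate join).
dot = F.udf(lambda a, b: float(a.dot(b)), DoubleType())
pairs = (post_tfidf.alias('a')
         .join(post_tfidf.alias('b'), F.col('a.post_id') < F.col('b.post_id'))
         .select(F.col('a.post_id').alias('post_a'),
                 F.col('b.post_id').alias('post_b'),
                 dot('a.norm_tfidf', 'b.norm_tfidf').alias('cosine')))
pairs.filter('cosine > 0.3').orderBy(F.desc('cosine')).show(20)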
