Computing Similar Articles with TF-IDF

%spark_recommend.pyspark
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import ArrayType, StringType
import jieba
import re

dp='2020-04-20'
start_date='2020-04-20'
end_date='2020-04-20'

STOP_WORDS = set(['的', '之', '嗎', '了'])

def is_chinese(content):
    """Check whether a token consists only of Chinese characters."""
    return re.compile('^[\u4e00-\u9fa5]*$').match(content) is not None

def cut_post(content):
    """Tokenize with jieba, keeping only Chinese tokens that are not stop words."""
    return [i for i in jieba.cut(content) if is_chinese(i) and i not in STOP_WORDS]
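
# Illustrative only (the actual segmentation depends on jieba's dictionary):
# cut_post('我們的天氣') should drop the stop word '的' and return the
# remaining Chinese tokens, e.g. something like ['我們', '天氣'].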

spark = SparkSession.builder \
    .appName("cold_start_decay_group_1") \
    .enableHiveSupport() \
    .getOrCreate()
    
tb_post = 'dwd.dwd_post_fact'

# Step 1: take posts from the last 600 days
# Step 2: compute hot posts: more than 2 clicks in the last 30 days
# Step 3: union the hot posts with the class-C posts, then distinct
# Step 4: register the tokenization UDF (tokenize -> drop non-Chinese -> drop stop words), returning ArrayType(StringType())
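# Steps 2 and 3 are not implemented in the snippet below; a minimal sketch,
# assuming a hypothetical 30-day click aggregate tmp.post_click_30d with
# columns (post_id, click_cnt) and a hypothetical post_class column that
# marks class-C posts:
# hot = spark.table('tmp.post_click_30d').filter('click_cnt > 2').select('post_id')
# c_posts = spark.table(tb_post).filter("post_class = 'C'").select('post_id')
# candidate_ids = hot.union(c_posts).distinct()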

df = spark.table(tb_post)
df = df.filter(df.post_type.isin([2, 5, 8])).filter(df.display_yn == 1)
df = df.filter('post_video_yn = 0')
df = df.filter('content is not null')

df = df.filter(df.create_date >= start_date)
df = df.selectExpr('post_id', 'content', 'create_date', 'post_video_yn', 'tags.name tags', 'uid')
assert df.count() > 1000, 'Data source problem in table %s: there should be more than 1000 posts' % tb_post

post = df.distinct()
spark.udf.register('cut_udf', cut_post, ArrayType(StringType()))  # return type per step 4
post = post.selectExpr('post_id', 'create_date', 'cut_udf(content) as words')
word_cnt = post.select(F.explode('words').alias('word')).distinct().count()
print('Distinct word count after dedup: %d' % word_cnt)
# post.write.mode('overwrite').saveAsTable('tmp.post_words3') 
post.show()

The processed data is printed by post.show() above.
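
The snippet stops at tokenization, while the title promises TF-IDF similarity. Below is a minimal sketch of that missing step using pyspark.ml, continuing from the post DataFrame above; the vocabSize, the 0.6 similarity threshold, and the brute-force self-join are illustrative assumptions, not the original pipeline.

from pyspark.ml.feature import CountVectorizer, IDF, Normalizer

# Term frequencies: bag-of-words counts over the tokenized posts.
cv_model = CountVectorizer(inputCol='words', outputCol='tf', vocabSize=1 << 18).fit(post)
tf_df = cv_model.transform(post)

# Re-weight counts by inverse document frequency.
tfidf_df = IDF(inputCol='tf', outputCol='tfidf').fit(tf_df).transform(tf_df)

# L2-normalize so a dot product between two vectors equals their cosine similarity.
normed = Normalizer(inputCol='tfidf', outputCol='norm', p=2.0).transform(tfidf_df)

# Pairwise cosine similarity via a dot-product UDF over a self-join.
# This is O(n^2) and may require spark.sql.crossJoin.enabled; for a large
# corpus an approximate method such as BucketedRandomProjectionLSH would
# be preferable.
dot_udf = F.udf(lambda a, b: float(a.dot(b)), 'double')
left = normed.selectExpr('post_id as pid_a', 'norm as vec_a')
right = normed.selectExpr('post_id as pid_b', 'norm as vec_b')
similar = (left.join(right, F.col('pid_a') < F.col('pid_b'))
               .withColumn('cos_sim', dot_udf('vec_a', 'vec_b'))
               .filter('cos_sim > 0.6'))  # 0.6 is an assumed threshold
similar.show()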
