ItemCF-jaccard相似度計算相似item

from pyspark.sql import SparkSession, functions as F
import heapq
from pyspark.sql.types import StructType, StructField, LongType,FloatType, ArrayType, IntegerType

spark = SparkSession.builder.appName("search_statistics").getOrCreate()

# Load the diary click log; the first CSV row is treated as the header.
log = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("日記點擊日誌.csv")
)
print("現有的dataframe結構")
log.printSchema()
log.show(3)

# Cast the three id columns to integers.
# NOTE(review): assumes uid / target_id / device_id all hold integer-like
# values that fit in a 32-bit int -- confirm against the real data.
log = (
    log
    .withColumn("uid", F.col("uid").cast(IntegerType()))
    .withColumn("target_id", F.col("target_id").cast(IntegerType()))
    .withColumn("device_id", F.col("device_id").cast(IntegerType()))
)
print("格式轉換後的dataframe")
log.printSchema()

print("去重前size %d" % log.count())
# Keep only the three id columns (target_id is renamed to group_id) and
# drop duplicate rows.
log = log.select(
    'uid',
    F.col('target_id').alias('group_id'),
    'device_id',
).distinct()
print("去重後size %d" % log.count())
log.show()

# Load the device_ids of known cheating users (the blacklist).
bad_device = spark.read.format("csv").option("header","true").load("作弊用戶device_id.csv")
# Cast device_id to integer so it has the same type as log.device_id in the join below.
bad_device = bad_device.withColumn("device_id", bad_device["device_id"].cast(IntegerType()))
# Label fixed: the original printed "作用用戶" (typo) instead of "作弊用戶" (cheating users).
print("作弊用戶size %d" % bad_device.count())

# A left_anti join keeps only the log rows whose device_id has no match in the
# blacklist; rows with uid == 0 are dropped as well.
df = log.join(bad_device, 'device_id', 'left_anti').filter('uid!=0')
print("log大小 %d  df大小 %d" % (log.count(), df.count()))

# Alternative that was tried and left disabled: instead of dropping rows with
# uid == 0, use -device_id as a substitute uid.
# df =df.select(F.when(F.col('uid') == 0, F.col('device_id') * -1).otherwise(F.col('uid')).alias('uid'), 'group_id')

# Collect the distinct uids of every group_id into one array column, then keep
# only the groups with more than 5 distinct uids (too few viewers make the
# similarity unreliable).
# NOTE(review): the original comment said "fewer than 5" but the filter is
# strictly "> 5" -- confirm which threshold is intended.
df = (
    df.groupBy('group_id')
    .agg(F.collect_set('uid').alias('uids'))
    .filter(F.size('uids') > 5)
)
df.show()

# Pull the aggregated result to the driver as a pandas DataFrame so the
# pairwise similarity can be computed locally.
pdf = df.select('group_id', 'uids').toPandas()
# Plain Python ints, in the same row order as uid_sets below.
group_ids = list(map(int, pdf['group_id'].tolist()))
# One set of uids per row, aligned with group_ids.
uid_sets = pdf['uids'].apply(set)



# Custom UDF return element type: one (group_id, score) struct per
# recommended item.
TOP_N_TYPE = (
    StructType()
    .add('group_id', LongType())
    .add("score", FloatType())
)
def jaccard_similarity(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    :param a: {set} first set
    :param b: {set} second set
    :return: {float} similarity in [0.0, 1.0]; 0.0 when both sets are empty

    The result is always a float (the original returned the int ``0`` for two
    empty sets), so the value has one consistent type and matches the
    FloatType ``score`` field declared for the UDF result.
    """
    union_size = len(a | b)
    # The union is empty only when both inputs are empty; avoid dividing by zero.
    if union_size == 0:
        return 0.0
    # Alternative normalisation that was considered and left disabled:
    #   len(a & b) / sqrt(len(a) * len(b))
    return len(a & b) / union_size
def item_recommend(group_ids, uid_sets, n_rec, group_id, uids):
    """Return the ``n_rec`` posts most similar to the current post.

    Similarity is the Jaccard similarity between the set of users who viewed
    the current post and the set of users who viewed each other post.

    :param group_ids: {list} every group_id, in the same order as ``uid_sets``;
        used to attach an id to each similarity score
    :param uid_sets: {pandas.core.series.Series} one set of viewer uids per
        post, aligned with ``group_ids``
    :param n_rec: {int} number of most-similar posts to keep
    :param group_id: id of the current post; it is excluded from the result
    :param uids: {list} uids of the users who viewed the current post
    :return: {list} ``[(group_id, jaccard_similarity), ...]`` ordered from the
        highest score down, at most ``n_rec`` entries
    """
    current_viewers = set(uids)  # uids arrives as a list

    def score_against_current(other_viewers):
        return jaccard_similarity(other_viewers, current_viewers)

    # One score per post, in the same order as group_ids.
    scores = uid_sets.apply(score_against_current).tolist()

    # Pair every id with its score and leave out the current post itself.
    candidates = (
        pair for pair in zip(group_ids, scores) if pair[0] != group_id
    )
    # Heap-based selection of the n_rec highest-scoring pairs.
    return heapq.nlargest(n_rec, candidates, key=lambda pair: pair[1])



# Number of most-similar diaries to recommend for every diary.
n_rec = 100

# Wrap item_recommend as a Spark UDF. The lambda's parameters (left of the
# colon) are the UDF's column inputs; the expression on the right is the
# function evaluated for each row. group_ids, uid_sets and n_rec are taken
# from the enclosing module scope.
jaccard_item_rec = F.udf(
    lambda group_id, uids: item_recommend(group_ids, uid_sets, n_rec, group_id, uids),
    returnType=ArrayType(TOP_N_TYPE),
)

# For every group_id, compute its array of (group_id, score) recommendations.
df_sim = df.select(
    'group_id',
    jaccard_item_rec('group_id', 'uids').alias('recs'),
)
df_sim.show()

輸入數據表:一個是item的瀏覽日誌表(日記點擊日誌.csv),另一個是作弊用戶的device_id表(作弊用戶device_id.csv)。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章