基於關聯規則分析的推薦算法(Apriori)-附python代碼實現

關於關聯規則分析算法的原理,請參見《基於關聯規則分析的推薦算法》一文;這裏只是基於以上理論,給出實現的代碼:

#!/usr/bin/env python
# coding: utf-8

# File Name: Apriori_update.py
# Author   : john
# Mail     : [email protected]
# Created Time: 2019/1/7 11:17
# Describe : 基於關聯規則的推薦系統

import math
import collections
import pandas as pd
import re
def load_data(file_path):
    """Load the user -> items table from a comma-separated text file.

    Every non-blank line is read as ``user,item1,item2,...``: the first
    field becomes the dictionary key and the remaining fields become that
    user's item list.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        dict mapping each user (str) to a list of item names (list of str).
        When the same user appears on several lines, the last line wins.
    """
    dataSet = dict()
    # ``with`` closes the file even when a line fails to parse; the previous
    # hand-written close() was mis-indented with a tab.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split(",")
            # A blank line splits to [''] and would otherwise register an
            # empty-named user with no items, inflating the user count.
            if fields == [""]:
                continue
            dataSet[fields[0]] = fields[1:]

    return dataSet

def cut_tree(data_count, data_num, support):
    """Split counted itemsets by whether they reach the support threshold.

    Args:
        data_count: dict mapping an itemset key to its occurrence count.
        data_num: Number the counts are divided by to obtain the support
            ratio (the caller passes the number of users).
        support: Minimum ratio ``count / data_num`` an itemset must reach.

    Returns:
        Tuple ``(kept, pruned)`` of two plain dicts with the same key/count
        pairs as ``data_count``: ``kept`` holds entries whose ratio is
        ``>= support``, ``pruned`` those whose ratio is ``< support``.
    """
    kept = {}
    pruned = {}
    for name, count in data_count.items():
        ratio = count * 1.0 / data_num
        if ratio >= support:
            kept[name] = count
        elif ratio < support:
            pruned[name] = count
    return kept, pruned



def Combinations(data, k):
    """Return every k-element combination of ``data``, built recursively.

    Combinations keep the relative order of ``data`` and are produced in
    positional (lexicographic-by-index) order.

    Args:
        data: Sliceable sequence of items.
        k: Number of elements per combination.

    Returns:
        List of lists, one per combination; empty when ``k`` exceeds
        ``len(data)``.
    """
    starts = range(len(data) - k + 1)

    # Base case: each reachable element on its own.
    if not k > 1:
        return [[data[pos]] for pos in starts]

    combos = []
    for pos in starts:
        head = data[pos]
        # Extend the current element with every (k-1)-combination of the
        # elements that follow it.
        for tail in Combinations(data[pos + 1:], k - 1):
            combos.append([head] + tail)
    return combos

def move_cut(data, data_cut, K):
    """Build the K-item candidate itemsets, minus those containing a pruned itemset.

    Args:
        data: dict whose keys are the itemsets that survived the previous
            pruning round, each key being item names joined by "、". The
            values are not used here.
        data_cut: dict whose keys are the itemsets pruned in the previous
            round, in the same key format. The values are not used here.
        K: Number of items per candidate.

    Returns:
        List of candidates, each a list of K item names. When no K-item
        combination can be formed from the items in ``data``, the ``data``
        dict itself is returned unchanged (callers rely on this).
    """
    items = set()
    for key in data:
        items.update(key.split("、"))

    # Sorting makes the candidate order reproducible; plain set iteration
    # order is arbitrary.
    candidates = Combinations(sorted(items), K)
    if len(candidates) == 0:
        return data

    pruned_sets = [set(key.split("、")) for key in data_cut]

    # Build a new list rather than calling remove() on the list being
    # iterated: removing in place shifts the remaining elements and the one
    # right after each removal is never examined.
    return [
        candidate
        for candidate in candidates
        if not any(pruned.issubset(candidate) for pruned in pruned_sets)
    ]


def num_count(dataSet, data):
    """Count how many users hold every item of each candidate itemset.

    Also prints a trace to stdout: each user's item list, and one line per
    (candidate, user) match.

    Args:
        dataSet: dict mapping a user to an iterable of that user's items.
        data: Iterable of candidate itemsets, each an iterable of item names.

    Returns:
        collections.OrderedDict mapping each matched candidate, keyed by its
        item names joined with "、", to the number of users whose items
        contain it. Candidates matched by no user are absent. Keys appear in
        the order they were first matched.
    """
    # Build each candidate's set once instead of once per user.
    candidates = [(set(list(candidate)), candidate) for candidate in data]

    data_list = collections.OrderedDict()
    for user, phone in dataSet.items():
        phone = list(phone)
        # Single-argument print() calls emit the same text on Python 2 and 3.
        print('phone: %s' % (phone,))
        for needed, candidate in candidates:
            if needed.issubset(phone):
                print(str(needed) + ":" + str(phone))
                keys = "、".join(list(candidate))
                data_list[keys] = data_list.get(keys, 0) + 1

    return data_list


def first_num_count(dataSet):
    """Count how many times each single item occurs across all users.

    This is the one-item (k=1) counterpart of ``num_count``.

    Args:
        dataSet: dict mapping a user to an iterable of that user's items.

    Returns:
        dict mapping each item name to its total number of occurrences, in
        order of first appearance.
    """
    tally = {}
    for items in dataSet.values():
        for item in items:
            tally[item] = tally.get(item, 0) + 1
    return tally

## Script entry point: run Apriori on a small sample and print the derived rules
if __name__ == '__main__':

    # Thresholds used throughout the run (previously repeated as literals).
    min_support = 0.5
    min_confidence = 0.75

    # Sample user -> items table; for real data build it with
    # load_data(<path>) instead.
    dataSet = {'A':['Mix3','XR','mate20'],'B':['Mix3','P20','nexs'],'C':['Mix3','P20','nexs','mate20'],'D':['P20','nexs']}
    print("用戶-物品倒排列表: ", dataSet)

    # Occurrence count of every single item.
    data_count = first_num_count(dataSet)
    print("第1次剪枝前拓展項計數: ", data_count)

    # Number of users; the denominator of every support ratio.
    data_num = len(dataSet)
    print(data_num)

    # Prune the one-item sets by support.
    data, data_cut = cut_tree(data_count, data_num, min_support)
    print("第1次剪枝後拓展項計數: ", data)

    # Grow the itemsets one item at a time, starting with pairs. ``data``
    # always holds the frequent itemsets of the last level that produced any.
    K = 2
    while True:
        # K-item candidates, minus those containing a pruned itemset.
        candidates = move_cut(data, data_cut, K)
        print("第%d次拓展初始集合: %s" % (K, candidates))

        # move_cut hands back its input dict (not a list) when no K-item
        # combination exists; treat that as "no candidates".
        if isinstance(candidates, dict):
            candidates = []

        # Number of users containing each candidate.
        data_count = num_count(dataSet, candidates)
        print("第%d次剪枝前拓展項計數: %s" % (K, data_count))

        if len(data_count) == 0:  # nothing left to extend
            print(">>>>>拓展結束")
            break

        # Prune the candidates that miss the support threshold.
        kept, pruned = cut_tree(data_count, data_num, min_support)
        print("第%d次剪枝後拓展項計數: %s" % (K, kept))
        print("第%d次被剪枝數據: %s" % (K, pruned))

        if not kept:
            # No K-item set is frequent: keep the previous level's result
            # instead of replacing it with an empty dict.
            print(">>>>>拓展結束")
            break

        data, data_cut = kept, pruned
        K += 1

    print('最後的拓展項集爲: %s' % (data,))

    # NOTE: when several itemsets survive at the final level, only the last
    # one in ``data`` is turned into rules below.
    phone = []
    for key, value in data.items():
        phone = key.split("、")
        num = value

    # Non-empty proper subsets of the final itemset.
    print("phone: ", phone)
    subsets = []
    for size in range(1, len(phone)):
        subsets += Combinations(phone, size)

    print("非空子集:", subsets)

    # Confidence of "subset -> rest of itemset":
    # count(itemset) / count(subset). Subsets held by no user are skipped.
    conf_data = {}
    subset_by_key = {}  # printable key -> the subset list it stands for
    for subset in subsets:
        needed = set(subset)
        count = sum(1 for items in dataSet.values() if needed.issubset(list(items)))
        if count:
            key = str(subset)
            conf_data[key] = float(num) / count
            subset_by_key[key] = subset
    print('各子集置信度: %s' % (conf_data,))

    # Keep only the subsets that reach the confidence threshold.
    new_conf_data = {key: conf for key, conf in conf_data.items() if conf >= min_confidence}

    print('符合置信度的項集: %s' % (new_conf_data,))

    # Lift needs support(X), support(Y) and support(X and Y). For every
    # retained subset X, collect [X, Y, X and Y] with Y = itemset - X.
    dim_conf_gather = []
    for key in new_conf_data:
        antecedent = subset_by_key[key]
        consequent = list(set(phone) - set(antecedent))
        dim_conf_gather.append([antecedent, consequent, phone])
    print('所有項集的集合: %s' % (dim_conf_gather,))

    # User counts for each [X, Y, X and Y] triple.
    list_data_count = []
    for conf_gather in dim_conf_gather:
        list_data_count.append(num_count(dataSet, conf_gather))
    print(list_data_count)

    # lift(X -> Y) = support(X and Y) / (support(X) * support(Y)). Counts are
    # looked up by key so the result does not depend on the order in which
    # num_count happened to match the three itemsets.
    lift = {}
    total = float(len(dataSet))
    for conf_gather, counts in zip(dim_conf_gather, list_data_count):
        key_X, key_Y, key_XY = ["、".join(part) for part in conf_gather]
        support_X = counts.get(key_X, 0) / total
        support_Y = counts.get(key_Y, 0) / total
        support_XY = counts.get(key_XY, 0) / total
        if support_X == 0 or support_Y == 0:
            continue  # lift is undefined without both supports
        lift[key_X] = support_XY / (support_X * support_Y)

    # A lift above 1 means buying X raises the likelihood of buying Y.
    for name, value in lift.items():
        if value > 1:
            print('由於{0}大於1,所以購買了{1}的用戶,很可能會購買{2}'.format(value, name, list(set(phone) - set(name.split("、")))))

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章