# 關於關聯規則分析算法的原理,請參見《基於關聯規則分析的推薦算法》一文;這裏僅基於該理論給出實現代碼:
#!/usr/bin/env python
# coding: utf-8
# File Name: Apriori_update.py
# Author : john
# Mail : [email protected]
# Created Time: 2019/1/7 11:17
# Describe : 基於關聯規則的推薦系統
import math
import collections
import pandas as pd
import re
def load_data(file_path):
    """Load the user -> items table from a comma-separated text file.

    Every non-blank line is read as ``user,item1,item2,...``: the first
    field is the user and the remaining fields are that user's items.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        dict mapping each user to the list of the remaining fields on its
        line (the user itself is not part of the value). If a user appears
        on several lines, the last line wins.
    """
    dataSet = {}
    # `with` guarantees the handle is closed even if reading/decoding raises.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise register a bogus '' user and
                # inflate the user count that support is computed against.
                continue
            fields = line.split(",")
            dataSet[fields[0]] = fields[1:]
    return dataSet
def cut_tree(data_count, data_num, support):
    """Split itemset counts into those that meet a support threshold and the rest.

    Args:
        data_count: Mapping of itemset key -> number of users containing it.
        data_num: Total number of users (the denominator of the support ratio).
        support: Minimum support; an itemset is kept when
            ``count / data_num >= support``.

    Returns:
        Tuple ``(data, data_cut)`` of plain dicts: ``data`` holds the entries
        that reach the threshold, ``data_cut`` the pruned ones. Both keep the
        iteration order of ``data_count``.

    Raises:
        ZeroDivisionError: If ``data_num`` is 0 and ``data_count`` is not empty.
    """
    data = {}
    data_cut = {}
    total = float(data_num)
    for phone, num in data_count.items():
        # One ratio per itemset, routed to exactly one of the two buckets.
        bucket = data if num / total >= support else data_cut
        bucket[phone] = num
    return data, data_cut
def Combinations(data, k):
    """Return every k-item combination of ``data`` as a list of lists.

    Items are combined by position, so the result follows the order of
    ``data`` (for ``['a', 'b', 'c']`` and ``k=2``:
    ``[['a', 'b'], ['a', 'c'], ['b', 'c']]``).

    Args:
        data: Sequence of items to choose from.
        k: Number of items per combination.

    Returns:
        List of lists, one per combination. Empty when ``k`` is larger than
        ``len(data)`` or when ``k`` is smaller than 1.
    """
    # Local import: the module header does not import itertools.
    from itertools import combinations

    if k < 1:
        # No non-empty itemset can be formed.
        return []
    return [list(combo) for combo in combinations(data, k)]
def move_cut(data, data_cut, K):
    """Build the candidate K-item itemsets for the next expansion round.

    The distinct items found in the keys of ``data`` are combined K at a
    time, and every combination that contains an itemset pruned in the
    previous round (a key of ``data_cut``) is dropped.

    Args:
        data: Mapping whose keys are itemsets that met the support
            threshold, with items joined by "、".
        data_cut: Mapping whose keys are the itemsets that were pruned,
            with items joined by "、".
        K: Number of items per candidate.

    Returns:
        List of candidate itemsets (each a list of items, items sorted).
        NOTE: when no K-item combination can be formed at all, ``data``
        itself is returned unchanged instead of a list; callers use that
        to detect the end of the expansion.
    """
    # Local import: the module header does not import itertools.
    from itertools import combinations

    phone = set()
    for key in data:
        phone.update(key.split("、"))

    # Sorting makes the candidate order (and therefore the "、"-joined keys
    # built from the candidates later on) reproducible between runs.
    data_list = [list(combo) for combo in combinations(sorted(phone), K)] if K >= 1 else []
    if not data_list:
        return data

    phone_move = [set(key.split("、")) for key in data_cut]
    # Build a new list rather than removing from the list being iterated:
    # in-place removal skips the element after each removed candidate.
    return [
        candidate
        for candidate in data_list
        if not any(pruned.issubset(candidate) for pruned in phone_move)
    ]
def num_count(dataSet, data):
    """Count, for each itemset, how many users own every item in it.

    A trace line is printed for every user and for every (itemset, user)
    match.

    Args:
        dataSet: Mapping of user -> iterable of that user's items.
        data: Iterable of itemsets; each itemset is an iterable of items.

    Returns:
        collections.OrderedDict mapping the "、"-joined itemset to the number
        of users whose items contain the whole itemset. Keys appear in the
        order of their first match; an itemset no user fully owns gets no
        key at all.
    """
    data_list = collections.OrderedDict()
    for phone in dataSet.values():
        phone = list(phone)
        print('phone:', phone)
        # Build the user's item set once instead of once per itemset.
        owned = set(phone)
        for i in data:
            if owned.issuperset(i):
                print(str(set(i)) + ":" + str(phone))
                keys = "、".join(i)
                data_list.setdefault(keys, 0)
                data_list[keys] += 1
    return data_list
def first_num_count(dataSet):
    """Count how many users own each single item (the k=1 itemset counts).

    Args:
        dataSet: Mapping of user -> iterable of that user's items.

    Returns:
        dict mapping item -> number of users whose list contains it, in
        first-seen order.
    """
    counts = collections.Counter()
    for items in dataSet.values():
        # An item repeated within one user's list still counts as one user,
        # matching how num_count treats larger itemsets. dict.fromkeys
        # de-duplicates while keeping the first-seen order.
        counts.update(list(dict.fromkeys(items)))
    return dict(counts)
## Script entry point: mine frequent itemsets level by level, then derive
## recommendation rules (confidence + lift) from the final itemsets.
if __name__ == '__main__':
    MIN_SUPPORT = 0.5       # minimum share of users that must own an itemset
    MIN_CONFIDENCE = 0.75   # minimum confidence for a rule X -> rest

    # To run on real data instead of the sample: dataSet = load_data(file_path)
    dataSet = {'A':['Mix3','XR','mate20'],'B':['Mix3','P20','nexs'],'C':['Mix3','P20','nexs','mate20'],'D':['P20','nexs']}
    print("用戶-物品倒排列表: ", dataSet)

    # Per-item user counts (level 1).
    data_count = first_num_count(dataSet)
    print("第1次剪枝前拓展項計數: ", data_count)

    # Number of users: the denominator of every support ratio.
    data_num = len(dataSet)
    print(data_num)

    # Level-1 pruning by support.
    data, data_cut = cut_tree(data_count, data_num, MIN_SUPPORT)
    print("第1次剪枝後拓展項計數: ", data)

    # Grow the itemsets one item at a time, starting with pairs.
    K = 2
    while True:
        # Candidates are kept in their own name so that `data` always holds
        # the last level of itemsets that met the support threshold.
        candidates = move_cut(data, data_cut, K)
        print("第%d次拓展初始集合: %s" % (K, candidates))
        if isinstance(candidates, dict):
            # move_cut hands back `data` itself when no K-item combination
            # can be formed: nothing left to expand.
            print(">>>>>拓展結束")
            break

        data_count = num_count(dataSet, candidates)
        print("第%d次剪枝前拓展項計數: %s" % (K, data_count))
        if len(data_count) == 0:
            print(">>>>>拓展結束")
            break

        survivors, pruned = cut_tree(data_count, data_num, MIN_SUPPORT)
        print("第%d次剪枝後拓展項計數: %s" % (K, survivors))
        print("第%d次被剪枝數據: %s" % (K, pruned))
        if not survivors:
            # Every candidate fell below the threshold; the previous level
            # stays the final result.
            print(">>>>>拓展結束")
            break
        data, data_cut = survivors, pruned
        K += 1

    print('最後的拓展項集爲:', data)

    user_total = float(len(dataSet))
    # Derive rules from every final itemset, not only the last one.
    for key, num in data.items():
        phone = key.split("、")
        print("phone: ", phone)

        # All non-empty proper subsets of the itemset (rule antecedents).
        subsets = []
        for size in range(1, len(phone)):
            subsets += Combinations(phone, size)
        print("非空子集:", subsets)

        # confidence(X -> rest) = count(whole itemset) / count(X)
        confidences = []
        for antecedent in subsets:
            wanted = set(antecedent)
            count = sum(1 for items in dataSet.values() if wanted.issubset(items))
            confidences.append((antecedent, float(num) / count))
        print('各子集置信度:', dict((str(a), c) for a, c in confidences))

        # Keep only the antecedents whose rule is confident enough.
        strong = [(a, c) for a, c in confidences if c >= MIN_CONFIDENCE]
        print('符合置信度的項集:', dict((str(a), c) for a, c in strong))

        # For lift we need support(X), support(Y) and support(X and Y),
        # where Y is the rest of the itemset once X is taken out.
        dim_conf_gather = []
        for antecedent, _ in strong:
            consequent = [p for p in phone if p not in antecedent]
            dim_conf_gather.append([antecedent, consequent, phone])
        print('所有項集的集合:', dim_conf_gather)

        list_data_count = [num_count(dataSet, triple) for triple in dim_conf_gather]
        print(list_data_count)

        # lift = support(X and Y) / (support(X) * support(Y)); the counts are
        # looked up by key so the result does not depend on insertion order.
        lift = {}
        for (antecedent, consequent, both), counts in zip(dim_conf_gather, list_data_count):
            support_X = counts.get("、".join(antecedent), 0) / user_total
            support_Y = counts.get("、".join(consequent), 0) / user_total
            support_XY = counts.get("、".join(both), 0) / user_total
            if support_X == 0 or support_Y == 0:
                continue
            lift["、".join(antecedent)] = support_XY / (support_X * support_Y)

        for name, value in lift.items():
            if value > 1:
                bought = name.split("、")
                rest = [p for p in phone if p not in bought]
                print('由於{0}大於1,所以購買了{1}的用戶,很可能會購買{2}'.format(value, name, rest))