【Machine Learning in Action】使用Apriori算法进行关联分析

apriori.py

# -*- coding: utf-8 -*-                                               
import numpy                                                          
# Load the sample data.
def loadDataSet():
    """Return a small hard-coded list of transactions used by the demo.

    Each inner list is one transaction and each integer in it is one item.
    A fresh list is built on every call.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
# Turn every distinct item of the data set into its own one-item set.
def creatC1(dataSet):
    """Build the candidate 1-itemsets (C1) from a list of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable
            items that can be ordered against one another.

    Returns:
        A list with one single-item frozenset per distinct item, ordered
        by item.  frozenset is used so the itemsets can be dict keys.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    unique_items = set()
    for transaction in dataSet:
        unique_items.update(transaction)
    # Return a real list rather than a lazy ``map`` object: callers loop
    # over the candidates once per transaction, and a one-shot iterator
    # would be empty after the first transaction on Python 3.
    return [frozenset([item]) for item in sorted(unique_items)]
# D is the data set (the transactions), Ck the candidate itemsets and
# minSupport the minimum support threshold.
def scanD(D, Ck, minSupport):
    """Count how often each candidate occurs and keep the frequent ones.

    Args:
        D: iterable of transactions; each transaction is any iterable of
            items (it is only passed to ``frozenset.issubset``).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``.  ``retList`` lists the
        candidates whose support is >= minSupport, in reverse order of
        first occurrence.  ``supportData`` maps every candidate that
        occurred in at least one transaction to its support.
    """
    # Materialize both inputs: D needs len() and Ck is re-scanned for every
    # transaction, neither of which works with a one-shot iterator.
    D = list(D)
    Ck = list(Ck)
    # ssCnt maps an itemset to the number of transactions containing it.
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []  # itemsets that meet the minimum support
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Appending and reversing once yields the same order as inserting every
    # hit at index 0, without the per-insert list shift.
    retList.reverse()
    return retList, supportData
# Module-level demo, executed whenever this module is run or imported:
# compute the frequent 1-itemsets of the sample data at 50% minimum support.
D = loadDataSet()
C1 = creatC1(D)
L1, suppData0 = scanD(D, C1, minSupport=0.5)
def aprioriGen(Lk, k):
    """Create the candidate k-itemsets Ck from the (k-1)-itemsets in Lk.

    Two itemsets are joined when their first k-2 items, taken in sorted
    order, are identical.  For k == 2 the prefix is empty, so every pair
    of itemsets is joined.

    Args:
        Lk: indexable sequence of frozensets, each holding k-1 items that
            can be ordered against one another.
        k: size of the itemsets to generate.

    Returns:
        A list with the union of every joined pair, in pair order.
    """
    # Sort BEFORE slicing.  Set iteration order is arbitrary, so slicing the
    # unsorted items can pick a different "prefix" for two sets that really
    # do share their k-2 smallest items, silently dropping a candidate.
    # Computing the prefixes once also avoids re-sorting inside the loop.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
# dataSet is the list of transactions, minSupport the minimum support.
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of dataSet with the Apriori algorithm.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable
            items.
        minSupport: minimum fraction of transactions that must contain an
            itemset for it to count as frequent.

    Returns:
        A tuple ``(L, supportData)``.  ``L[i]`` is the list of frequent
        itemsets of size i+1; the last entry of ``L`` is always an empty
        list, because the search stops once a level yields nothing.
        ``supportData`` maps every itemset that was counted at least once
        to its support.
    """
    # Build a concrete list of sets: it is measured with len() and scanned
    # again for every itemset size, which a lazy ``map`` object cannot do.
    D = [set(transaction) for transaction in dataSet]
    C1 = list(creatC1(D))
    # Frequent 1-itemsets and their supports.
    L1, supportData = scanD(D, C1, minSupport)
    # L collects L1, L2, L3, ...
    L = [L1]
    k = 2
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        # Drop the candidates in Ck that do not reach the minimum support.
        Lk, supk = scanD(D, Ck, minSupport)
        supportData.update(supk)
        L.append(Lk)
        k += 1
    return L, supportData
                                                                      
                                                                      
                                                                      
'''                                                                   
以上部分生成了满足最小支持度的频繁项目集合                                                 
'''                                                                   
                                                                      
                                                                      
'''                                                                   
下面的内容是从频繁项集中挖掘关联规则                                                    
'''                                                                   
# minConf is the minimum confidence threshold; supportData holds the
# support of every frequent itemset.
def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets in L.

    Args:
        L: list of lists of frozensets; ``L[i]`` holds the frequent
            itemsets with i+1 items.
        supportData: dict mapping itemsets (frozensets) to their support.
        minConf: minimum confidence a rule needs to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # Start at index 1: the itemsets in L[0] have a single item and cannot
    # be split into an antecedent and a consequent.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Itemsets with more than two items: grow the consequents.
                # NOTE(review): rulesFromConseq merges H1 into larger
                # consequents before scoring, so rules with a one-item
                # consequent do not appear to be evaluated for these
                # itemsets - confirm whether that is intended.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList,
                                minConf)
            else:
                # Two-item itemsets: score the one-item consequents directly.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList  # rules together with their confidence
# Compute rule confidences.
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Score the rules ``freqSet - conseq --> conseq`` for each conseq in H.

    Every rule whose confidence is at least minConf is printed, appended
    to ``br1`` and its consequent is kept.

    Args:
        freqSet: the frequent itemset (frozenset) the rules are built from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping itemsets to support; it must contain
            freqSet and every ``freqSet - conseq``.
        br1: list that receives ``(antecedent, consequent, confidence)``
            tuples; it is modified in place.
        minConf: minimum confidence for a rule to be kept.

    Returns:
        The list of consequents whose rule met minConf.

    Raises:
        KeyError: if a needed itemset is missing from supportData.
    """
    prunedH = []
    for conseq in H:
        # Set difference, not arithmetic: the items left once the
        # consequent is removed form the antecedent.
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # A single pre-formatted argument prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print('%s --> %s conf: %s' % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
# Generate further candidate rules from a set of consequents.
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively try larger consequents for the itemset freqSet.

    The consequents in H (all assumed to have the size of ``H[0]``) are
    merged into consequents one item larger, scored with calcConf, and the
    survivors are fed back in as long as more than one of them remains.
    Accepted rules are appended to ``br1``; nothing is returned.
    """
    conseq_size = len(H[0])
    # Stop once a larger consequent would leave no item for the antecedent.
    if len(freqSet) <= conseq_size + 1:
        return
    candidates = aprioriGen(H, conseq_size + 1)
    survivors = calcConf(freqSet, candidates, supportData, br1, minConf)
    # At least two survivors are needed to merge into a bigger consequent.
    if len(survivors) > 1:
        rulesFromConseq(freqSet, survivors, supportData, br1, minConf)



test.py:在这个文件中运行下面的代码并查看结果

# -*- coding: utf-8 -*-
import numpy
import apriori

# Mine the sample data: frequent itemsets first, then the rules with a
# confidence of at least 0.5.
dataSet = apriori.loadDataSet()
L, suppData = apriori.apriori(dataSet)
# Uncomment to inspect the intermediate results:
# print(L)
# print(suppData)  # suppData maps each counted itemset to its support
rules = apriori.generateRules(L, suppData, 0.5)
# Parenthesized single-argument form works as both the Python 2 print
# statement and the Python 3 print function.
print(rules)


运行结果:

frozenset([3]) --> frozenset([1]) conf: 0.666666666667
frozenset([1]) --> frozenset([3]) conf: 1.0
frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([3]) --> frozenset([2]) conf: 0.666666666667
frozenset([2]) --> frozenset([3]) conf: 0.666666666667
frozenset([5]) --> frozenset([3]) conf: 0.666666666667
frozenset([3]) --> frozenset([5]) conf: 0.666666666667
frozenset([5]) --> frozenset([2, 3]) conf: 0.666666666667
frozenset([3]) --> frozenset([2, 5]) conf: 0.666666666667
frozenset([2]) --> frozenset([3, 5]) conf: 0.666666666667
[(frozenset([3]), frozenset([1]), 0.6666666666666666), (frozenset([1]), frozenset([3]), 1.0), (frozenset([5]), frozenset([2]), 1.0), (frozenset([2]), frozenset([5]), 1.0), (frozenset([3]), frozenset([2]), 0.6666666666666666), (frozenset([2]), frozenset([3]), 0.6666666666666666), (frozenset([5]), frozenset([3]), 0.6666666666666666), (frozenset([3]), frozenset([5]), 0.6666666666666666), (frozenset([5]), frozenset([2, 3]), 0.6666666666666666), (frozenset([3]), frozenset([2, 5]), 0.6666666666666666), (frozenset([2]), frozenset([3, 5]), 0.6666666666666666)]



参考资料:【美】Peter Harrington.《Machine Learning in Action》

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章