相關分析
函數關係:變量之間存在確定的一一對應的依賴關係(如 y=2x,給定 x 就能精確算出 y)
相關關係:影響不存在方向性,比如身高越高體重越重,但不能說身高增加1cm體重增加2kg
相關分析不具有傳遞性,A和C相關,B和C相關,A和B不一定相關
相關係數的顯著性檢驗
# 1. Pairwise correlation of each predictor with max_speed
#    [each entry is a (correlation coefficient, p-value) tuple].
correlation = [
    stats.pearsonr(car_corr['max_speed'], car_corr[col])
    for col in car_corr[['weight', 'circle', 'horsepower']].columns
]
# 1.2 p-values only: univariate linear-regression F-test of each
#     predictor column against max_speed (returns F statistics, p-values).
from sklearn.feature_selection import f_regression
F,P_value=f_regression(car_corr[['weight','circle','horsepower']],car_corr['max_speed'])
# 2. Correlation matrix of the DataFrame [coefficients only, no p-values].
#    Column names normalized to the lowercase spelling used elsewhere in
#    this file ('Max_speed'/'Weight' would raise KeyError otherwise).
car_corr[['weight', 'circle', 'horsepower', 'max_speed']].corr()
# 2.2 Same matrix via numpy: each argument is one variable (one row of input).
np.corrcoef((car_corr['weight'], car_corr['circle'],
             car_corr['horsepower'], car_corr['max_speed']))
偏相關分析
發動機作爲汽車的心臟,對各項指標有影響。因此,在研究其他指標和最高時速指標之間的相關關係時,會不知不覺在變量之間加入發動機相關指標,對所研究的變量產生影響,而由於相關關係的不可傳遞性,這種影響往往會導致錯誤的結論
剔除其他變量影響之後再進行相關分析
def partial_corr(x, y, partical=None):
    """First-order partial correlation between x and y, controlling for one variable.

    Parameters
    ----------
    x, y : 1-D array-likes, the two variables whose relationship is examined.
    partical : 1-D array-like, the single control variable whose influence is
        removed before correlating x and y. (Name kept for caller
        compatibility; the original used a crash-prone mutable default `[]`.)

    Returns
    -------
    (r, prob) : partial correlation coefficient and two-sided p-value from a
        t-test with n - 3 degrees of freedom.
    """
    if partical is None:
        raise ValueError("a control variable (partical=...) must be supplied")
    # Plain Pearson correlations; the per-pair p-values are not needed here.
    xy, _ = stats.pearsonr(x, y)
    xp, _ = stats.pearsonr(x, partical)
    yp, _ = stats.pearsonr(y, partical)
    n = len(x)
    df = n - 3  # n minus the two variables minus one control variable
    r = (xy - xp * yp) / (np.sqrt(1 - xp * xp) * np.sqrt(1 - yp * yp))
    if abs(r) == 1:
        # Perfect partial correlation: t would be infinite, p-value is 0.
        prob = 0.0
    else:
        t = (r * np.sqrt(df)) / np.sqrt(1 - r * r)
        prob = (1 - stats.t.cdf(abs(t), df)) * 2
    return r, prob
# Partial correlation of weight and circle with max_speed, controlling for
# horsepower. The original called the misspelled `partical_corr` with a
# `partial=` keyword the function does not accept, and used 'max speed'
# instead of the 'max_speed' spelling used elsewhere in this file.
pcorrelation = [
    partial_corr(car_corr[col], car_corr['max_speed'],
                 partical=car_corr['horsepower'])
    for col in car_corr[['weight', 'circle']].columns
]
點二列相關分析
一個連續變量一個分類變量
# Point-biserial correlation: one continuous variable vs one dichotomous one.
stats.pointbiserialr(scorebygender['gender'],scorebygender['score'])
# the first argument must be coded as 0/1 boolean data
非參數相關分析
spearman 相關係數、Kendall 相關係數、hoeffding 相關係數
# Spearman rank correlation matrix (and p-values) over all columns.
rho, p = stats.spearmanr(graduate)
# Kendall's tau of every remaining column against the 'Tutor' column.
# The original `graduate[[],[],[]].columns()` is an invalid indexer and
# calls the (non-callable) `columns` Index; iterate real column names instead.
kt = [
    stats.kendalltau(graduate[col], graduate['Tutor'])
    for col in graduate.columns.drop('Tutor')
]
關聯分析
數據變成0,1型
# Separator used to join item names into rule strings, e.g. 'A-->B'.
sign = '-->'


class Apriori(object):
    """Association-rule mining with the Apriori algorithm.

    Expects a 0/1-encoded DataFrame: one row per transaction, one column
    per item (1 = the item occurs in that transaction).
    """

    def __init__(self, minsupport=0.1, minconfidence=0.4):
        # Thresholds are exclusive: a rule must strictly exceed them.
        self.minsupport = minsupport
        self.minconfidence = minconfidence

    def link(self, x, sign):
        """Join frequent (k-1)-itemsets into candidate k-itemsets.

        `x` is a list of itemset strings joined by `sign`; two itemsets
        are linked when they share the same first k-2 items and differ
        in the last one.
        """
        x = list(map(lambda i: sorted(i.split(sign)), x))
        l = len(x[0])
        r = []
        for i in range(len(x)):
            for j in range(i, len(x)):
                if x[i][:l - 1] == x[j][:l - 1] and x[i][l - 1] != x[j][l - 1]:
                    r.append(x[i][:l - 1] + sorted([x[j][l - 1], x[i][l - 1]]))
        return r

    def apriori(self, data):
        """Mine frequent itemsets and return the rules as a DataFrame.

        The result is indexed by rule strings ('A-->B' means A implies B;
        the last item is the consequent) with 'support' and 'confidence'
        columns, sorted by confidence then support, descending.
        """
        final = pd.DataFrame(index=['support', 'confidence'])
        # Support of the individual items.
        support_series = 1.0 * data.sum() / len(data)
        column = list(support_series[support_series > self.minsupport].index)
        while len(column) > 1:
            # Candidate k-itemsets from the frequent (k-1)-itemsets.
            column = self.link(column, sign)
            # Row-wise product of the 0/1 columns flags transactions that
            # contain the whole itemset.
            sf = lambda i: data[i].prod(axis=1, numeric_only=True)
            data_2 = pd.DataFrame(list(map(sf, column)),
                                  index=[sign.join(i) for i in column]).T
            support_series_2 = 1.0 * data_2.sum() / len(data)
            # Keep only the frequent candidates.
            column = list(
                support_series_2[support_series_2 > self.minsupport].index)
            # Series.append was removed in pandas 2.0 -- use pd.concat.
            support_series = pd.concat([support_series, support_series_2])
            # Candidate rules: move each item of the itemset to the end,
            # making it the consequent.
            column2 = []
            for i in column:
                i = i.split(sign)
                for j in range(len(i)):
                    column2.append(i[:j] + i[j + 1:] + i[j:j + 1])
            # Original had a typo (`cofidence_series`) and a statement
            # broken across two lines without continuation -- both fixed.
            confidence_series = pd.Series(
                index=[sign.join(i) for i in column2], dtype=float)
            for i in column2:
                # confidence(A-->B) = support(A,B) / support(A)
                confidence_series[sign.join(i)] = (
                    support_series[sign.join(sorted(i))]
                    / support_series[sign.join(i[:len(i) - 1])])
            for i in confidence_series[
                    confidence_series > self.minconfidence].index:
                final[i] = 0.0
                # .loc instead of chained assignment, which may hit a copy.
                final.loc['confidence', i] = confidence_series[i]
                final.loc['support', i] = support_series[
                    sign.join(sorted(i.split(sign)))]
        # DataFrame.sort was removed -- sort_values is the replacement.
        final = final.T.sort_values(['confidence', 'support'], ascending=False)
        return final
# Mine association rules from the 0/1-encoded transaction table `mpb`
# (defined elsewhere), using the default thresholds.
rule=Apriori()
rule.apriori(mpb)
FP-growth 要求的數據格式:項出現時記爲其原名字,不出現時該行對應位置爲 NaN
from fp_growth import find_frequent_itemsets as ffi

# minimum_support is an absolute transaction count: 10% of the number of
# rows. (The original line had unbalanced parentheses -- a SyntaxError --
# and misplaced the *0.1 multiplier inside the len() call.)
minimum_support = int(len(mpb_fpg) * 0.1)
for itemset in ffi(array(mpb_fpg), minimum_support=minimum_support):
    if nan in itemset:
        # Skip itemsets that contain missing values.
        continue
    if len(itemset) == 1:
        # Single items are trivial -- skip them.
        continue
    print(itemset[::-1])