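定义总平方和分解公式:$SST = SSA + SSE$,其中 $SST=\sum_{i=1}^{k}\sum_{j=1}^{n_i}(y_{ij}-\bar{y})^2$(总平方和),$SSA=\sum_{i=1}^{k}n_i(\bar{y}_i-\bar{y})^2$(组间平方和),$SSE=\sum_{i=1}^{k}\sum_{j=1}^{n_i}(y_{ij}-\bar{y}_i)^2$(组内平方和)。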
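利用检验统计量F定义检验方法:$F=\dfrac{SSA/(k-1)}{SSE/(n-k)}=\dfrac{MSA}{MSE}\sim F(k-1,\,n-k)$;当 $F>F_{\alpha}(k-1,\,n-k)$(等价地,P值小于 $\alpha$)时,拒绝"各水平均值相等"的原假设。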
'''实现单因素方差分析'''
# 导入相关包
import pandas as pd
import numpy as np
import math
import scipy
from scipy import stats
# 自定义函数
def SST(Y):
    """Return the total sum of squares of ``Y`` about its own mean.

    Computes ``sum((y - mean(Y)) ** 2)`` over all elements of ``Y``.

    Args:
        Y: Array-like of numeric observations (list, NumPy array or
            pandas Series).

    Returns:
        The total sum of squares as a NumPy float. A NaN anywhere in ``Y``
        makes the result NaN.
    """
    # Convert once to a float array so both the mean and the reduction are
    # single vectorized calls instead of a Python-level loop via builtin sum.
    values = np.asarray(Y, dtype=float)
    return np.sum(np.square(values - values.mean()))
def SSA(data, x_name, y_name):
    """Return the between-group sum of squares.

    For each level of ``x_name`` the squared distance between the group mean
    of ``y_name`` and the overall mean of ``y_name`` is weighted by the
    number of observations in the group, and the weighted terms are summed.

    Args:
        data: pandas DataFrame holding the factor and response columns.
        x_name: Name of the grouping (factor) column.
        y_name: Name of the numeric response column.

    Returns:
        The between-group sum of squares as a NumPy float.
    """
    grand_mean = data[y_name].mean()
    # Select the response column *before* aggregating: aggregating the whole
    # frame would try to average every other column too and fails as soon as
    # the frame carries an unrelated non-numeric column.
    per_group = data.groupby(x_name)[y_name].agg(['mean', 'count'])
    weighted = per_group['count'] * (per_group['mean'] - grand_mean) ** 2
    # Reduce on the raw array so a NaN term is not silently skipped.
    return np.sum(weighted.to_numpy(dtype=float))
def SSE(data, x_name, y_name):
    """Return the within-group (error) sum of squares.

    Every observation of ``y_name`` is compared with the mean of the
    ``x_name`` group it belongs to; the squared differences are summed.

    Args:
        data: pandas DataFrame holding the factor and response columns.
            It is not modified.
        x_name: Name of the grouping (factor) column.
        y_name: Name of the numeric response column.

    Returns:
        The within-group sum of squares as a NumPy float.
    """
    # transform('mean') yields, row by row, the mean of that row's group.
    # This replaces the mean -> dict -> map round trip and avoids assigning a
    # helper column onto a slice of ``data``. Only the response column is
    # aggregated, so unrelated non-numeric columns cannot break the mean.
    group_means = data.groupby(x_name)[y_name].transform('mean')
    residuals = (data[y_name] - group_means).to_numpy(dtype=float)
    return np.sum(np.square(residuals))
def one_way_anova(data, x_name, y_name, alpha=0.05):
    """Build a one-way ANOVA summary table for ``y_name`` grouped by ``x_name``.

    Args:
        data: pandas DataFrame holding the factor and response columns.
        x_name: Name of the grouping (factor) column.
        y_name: Name of the numeric response column.
        alpha: Significance level used for the critical F value.

    Returns:
        A pandas DataFrame with three rows (between groups, within groups,
        total) listing sums of squares, degrees of freedom, mean squares,
        the F statistic, its p-value, the critical F value and R^2. Cells
        that do not apply to a row hold the placeholder string '_'.
    """
    n_obs = len(data)                      # total number of observations
    n_levels = len(data[x_name].unique())  # number of factor levels
    df_between = n_levels - 1
    df_within = n_obs - n_levels

    ss_total = SST(data[y_name])
    ss_between = SSA(data, x_name, y_name)
    ss_within = SSE(data, x_name, y_name)

    # Mean squares are the sums of squares scaled by their degrees of freedom.
    ms_between = ss_between / df_between
    ms_within = ss_within / df_within
    f_stat = ms_between / ms_within

    f_dist = scipy.stats.f
    p_value = f_dist.sf(f_stat, df_between, df_within)
    f_critical = f_dist.isf(alpha, dfn=df_between, dfd=df_within)

    # Share of the total variation accounted for by the factor.
    r_square = ss_between / ss_total

    blank = '_'
    columns = {
        '差异源': ['组间', '组内', '总和'],
        '平方和SS': [ss_between, ss_within, ss_total],
        '自由度df': [df_between, df_within, n_obs - 1],
        '均方MS': [ms_between, ms_within, blank],
        'F值': [f_stat, blank, blank],
        'P值': [p_value, blank, blank],
        'F临界值': [f_critical, blank, blank],
        'R^2': [r_square, blank, blank],
    }
    return pd.DataFrame(columns)
实例测试结果:
# Example run: load the sample workbook and analyse response 'Y' by factor 'X'
# at the 5% significance level.
d1 = pd.read_excel(r'C:/Users/LHL/Desktop/方差分析.xlsx')
one_way_anova(data=d1, x_name='X', y_name='Y', alpha=0.05)