Data download link: http://www.statsci.org/data/general/cofreewy.html
First, use R's stepwise regression to select the ordinary linear model with the smallest AIC, fitted by least squares:
w = read.table("cofreewy.txt", header = TRUE)  # assumed file name for the downloaded data
a = lm(CO ~ ., w)                              # full model with all predictors
summary(a)
b = step(a, direction = 'backward')            # backward stepwise selection by AIC
summary(b)
shapiro.test(b$res)                            # normality test on the residuals
Next, reproduce the selection in Python by exhaustively searching all subsets of predictors for the smallest AIC:
from __future__ import division
import numpy as np
import scipy as sci
import scipy.stats        # needed so that sci.stats.shapiro is available
import pandas as pd
import copy               # used below when building the engineered features
import itertools
from math import log
from sklearn import linear_model
from sklearn.gaussian_process import GaussianProcess   # scikit-learn < 0.18 API
from sklearn.cross_validation import LeaveOneOut       # scikit-learn < 0.18 API
co_file = open("cofreewy.txt", "r")
lines_list = co_file.readlines()
co_file.close()
# The first line holds the tab-separated column names; the rest are data rows.
column_names = [name.strip() for name in lines_list[0].split("\t")]
rows = [[field.strip() for field in line.split("\t")] for line in lines_list[1:]]
data_frame = pd.DataFrame(np.array(rows), columns = column_names, dtype = np.float64)
y = data_frame.loc[:, ["CO"]].values.ravel()   # response: CO concentration
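Equivalently, the parsing above collapses to a single pandas call (a sketch; it assumes the same tab-separated layout with a header row):
# Equivalent one-liner (numeric dtypes are inferred automatically).
data_frame = pd.read_csv("cofreewy.txt", sep="\t")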
def generate_AIC(RSquare, y, k):
    # AIC for a least-squares fit, up to an additive constant:
    # AIC = 2k + n * log(RSS / n), with RSS = (1 - R^2) * SST.
    SST = np.linalg.norm(y - y.mean()) ** 2
    RSS = (1 - RSquare) * SST
    return 2 * k + y.shape[0] * log(RSS / y.shape[0])
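As a quick sanity check (a sketch on synthetic data; `_x` and `_y_toy` are illustrative, not from the data set), an informative predictor should get a much smaller AIC than a pure-noise one. Note that R's step counts the intercept in the penalty, so its AIC values differ from these by a constant 2 but rank models identically:
# Toy check on synthetic data (hypothetical values): column 0 drives the
# response, column 1 is noise, so the first AIC should be far smaller.
_rng = np.random.RandomState(0)
_x = _rng.rand(40, 2)
_y_toy = 3.0 * _x[:, 0] + 0.1 * _rng.randn(40)
for _j in (0, 1):
    _m = linear_model.LinearRegression().fit(_x[:, _j:_j + 1], _y_toy)
    print _j, generate_AIC(_m.score(_x[:, _j:_j + 1], _y_toy), _y_toy, 1)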
def generate_AIC_combinations():
    # Fit an OLS model for every non-empty subset of predictors and record
    # its AIC. Note: this removes "CO" from column_names in place, so
    # column_names holds only the predictors afterwards.
    column_names.remove("CO")
    conclusion_dict = dict()
    for i in range(len(column_names)):
        for names in itertools.combinations(column_names, i + 1):
            array_data = data_frame.loc[:, list(names)].values
            clf = linear_model.LinearRegression()
            clf.fit(array_data, y)
            RSquare = clf.score(array_data, y)
            AIC_value = generate_AIC(RSquare, y, len(names))
            conclusion_dict[names] = (AIC_value, clf)
    return conclusion_dict
# Scan every subset for the smallest AIC.
smallest_aic = np.inf
smallest_aic_names = None
smallest_aic_clf = None
for k, v in generate_AIC_combinations().items():
    if v[0] < smallest_aic:
        smallest_aic = v[0]
        smallest_aic_names = list(k)   # as a list, for pandas .loc indexing
        smallest_aic_clf = v[1]
print "The samllest aic is :" + str(smallest_aic)
print "The correspond paras are :" + str(smallest_aic_names)
y_hat = smallest_aic_clf.predict(data_frame.loc[:, smallest_aic_names].values)
res = y - y_hat
print "The shapiro Test value :" + str(sci.stats.shapiro(res))
print "The score of The smallest_aic model :" + str(smallest_aic_clf.score(data_frame.loc[:, smallest_aic_names].values, y))
# Now fit a Gaussian process on the whole data, and test the conclusion
# by the leave-one-out method.
# mix_lm is the model built by the book.
new_data_dict = copy.deepcopy(dict(data_frame))
# Engineered features from the book's model: a quadratic wind term and
# harmonic terms for the hour-of-day cycle.
new_data_dict["Wind_sq"] = (data_frame.loc[:, ["Wind"]].values ** 2).ravel()
new_data_dict["Hour_cos_0"] = (np.cos(data_frame.loc[:, ["Hour"]].values * 2 * np.pi / 24)).ravel()
new_data_dict["Hour_cos_1"] = (np.cos(data_frame.loc[:, ["Hour"]].values * 4 * np.pi / 24)).ravel()
new_data_frame = pd.DataFrame(new_data_dict)
new_names = ["Traffic", "Wind", "Wind_sq", "Hour_cos_0", "Hour_cos_1"]
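These columns correspond to the mean structure CO ≈ β0 + β1·Traffic + β2·Wind + β3·Wind² + β4·cos(2π·Hour/24) + β5·cos(4π·Hour/24). A quick full-data check of that model (illustrative only; `_mix_full` is a hypothetical helper name, and the leave-one-out comparison below is the real evaluation):
# Fit the book-style model on all observations and report its R^2.
_mix_full = linear_model.LinearRegression()
_mix_full.fit(new_data_frame.loc[:, new_names].values, y)
print "Full-data R^2 of the mix model: " + str(_mix_full.score(new_data_frame.loc[:, new_names].values, y))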
gp_res_square_sum = 0
lm_res_square_sum = 0
mix_lm_res_square_sum = 0
loo = LeaveOneOut(len(y))
for train, test in loo:
    # Gaussian process on all predictors (column_names now excludes CO).
    gp = GaussianProcess()
    gp_clf = gp.fit(data_frame.loc[train, column_names].values, y[train])
    res = y[test] - gp_clf.predict(data_frame.loc[test, column_names].values)
    gp_res_square_sum += res[0] ** 2
    # The smallest-AIC linear model.
    lm = linear_model.LinearRegression()
    lm_clf = lm.fit(data_frame.loc[train, smallest_aic_names].values, y[train])
    res = y[test] - lm_clf.predict(data_frame.loc[test, smallest_aic_names].values)
    lm_res_square_sum += res[0] ** 2
    # The book's model with the engineered features.
    mix_lm = linear_model.LinearRegression()
    mix_lm_clf = mix_lm.fit(new_data_frame.loc[train, new_names].values, y[train])
    res = y[test] - mix_lm_clf.predict(new_data_frame.loc[test, new_names].values)
    mix_lm_res_square_sum += res[0] ** 2
print "The residual of gaussian_process :" + str(gp_res_square_sum)
print "The residual of linear_model :" + str(lm_res_square_sum)
print "The residual of mix_linear_model :" + str(mix_lm_res_square_sum)
We fit a Gaussian process ("probabilistic interpolation") in the hope of obtaining a better model, and compare it against the better linear model obtained via AIC in《複雜數據統計方法》(Statistical Methods for Complex Data). The comparison uses leave-one-out cross-validation on the residual sums of squares:

The residual of linear_model: 6.60246279043
The residual of mix_linear_model: 1.66428991979

Gaussian process regression is used here mainly because of the small sample size of the problem. (The "probabilistic interpolation" takes a fairly long time to run.)