# 本項目利用boston房價數據集練習簡單的線性迴歸,若預測效果不夠理想,可進一步進行非線性迴歸嘗試。
# -*- coding: utf-8 -*-
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
import warnings
warnings.filterwarnings('ignore')
# Load the Boston housing dataset (506 observations per the OLS summary
# below; 13 numeric features, target = median home value MEDV).
# NOTE(review): sklearn removed load_boston in version 1.2 — this line
# requires scikit-learn < 1.2; confirm the pinned version.
boston=datasets.load_boston()
x=boston.data    # raw feature matrix (numpy array)
y=boston.target  # target vector
print(x.shape)
print(boston.DESCR)  # dataset description text
# Disabled practice snippet: simple 1-D linear regression on 5 hand-made
# points, kept inside a module-level string so it never executes.
'''
#練習
clf = linear_model.LinearRegression()
x=np.array([2,3,5,7,6]).reshape(-1,1)
y=np.array([6,10,14.5,21,18.5])
print(plt.scatter(x,y,color='blue'))
clf.fit(x,y) #訓練模型
b,a=clf.coef_, clf.intercept_
print(b,a)
x=[[4]]
print(clf.predict(x))
print(plt.plot(x, a+b*x, color = 'red'))
'''
# Boston house prices: fit an OLS model on all 13 features.
x=pd.DataFrame(boston.data, columns=boston.feature_names)
y=pd.DataFrame(boston.target,columns=['MEDV'])
# Eyeball the two strongest-looking predictors; print() emits the
# PathCollection repr — plt.show() is never called, so no window opens.
print(plt.scatter(x['RM'],y,color='blue'))
print(plt.scatter(x['LSTAT'],y,color='blue'))
import statsmodels.api as sm
# statsmodels OLS has no implicit intercept; add_constant prepends a
# column of ones to the training set so the model fits an intercept term.
x_add1=sm.add_constant(x)
model=sm.OLS(y,x_add1).fit()
print(model.summary())
# Recorded output of the fit above, kept verbatim for reference.
'''
#輸出結果
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 36.4595 5.103 7.144 0.000 26.432 46.487
CRIM -0.1080 0.033 -3.287 0.001 -0.173 -0.043
ZN 0.0464 0.014 3.382 0.001 0.019 0.073
INDUS 0.0206 0.061 0.334 0.738 -0.100 0.141
CHAS 2.6867 0.862 3.118 0.002 0.994 4.380
NOX -17.7666 3.820 -4.651 0.000 -25.272 -10.262
RM 3.8099 0.418 9.116 0.000 2.989 4.631
AGE 0.0007 0.013 0.052 0.958 -0.025 0.027
DIS -1.4756 0.199 -7.398 0.000 -1.867 -1.084
RAD 0.3060 0.066 4.613 0.000 0.176 0.436
TAX -0.0123 0.004 -3.280 0.001 -0.020 -0.005
PTRATIO -0.9527 0.131 -7.283 0.000 -1.210 -0.696
B 0.0093 0.003 3.467 0.001 0.004 0.015
LSTAT -0.5248 0.051 -10.347 0.000 -0.624 -0.425
==============================================================================
Omnibus: 178.041 Durbin-Watson: 1.078
Prob(Omnibus): 0.000 Jarque-Bera (JB): 783.126
Skew: 1.521 Prob(JB): 8.84e-171
Kurtosis: 8.281 Cond. No. 1.51e+04
'''
# Drop the attribute with the largest p-value (INDUS, p=0.738 >= 0.05).
# NOTE(review): the original comment claimed "remove the 2 attributes with
# P>=0.05", but only INDUS is dropped — AGE (p=0.958) also fails the 0.05
# threshold yet is kept, as the recorded output below confirms. Decide
# whether AGE should be dropped too and the model refit.
x.drop('INDUS',axis=1,inplace=True)
# Refit OLS on the reduced feature set (intercept column re-added).
x_add1=sm.add_constant(x)
model=sm.OLS(y,x_add1).fit()
print(model.summary())
# Recorded output of the refit, kept verbatim for reference.
'''
輸出結果
OLS Regression Results
==============================================================================
Dep. Variable: MEDV R-squared: 0.741
Model: OLS Adj. R-squared: 0.734
Method: Least Squares F-statistic: 117.3
Date: Sat, 02 May 2020 Prob (F-statistic): 6.42e-136
Time: 12:30:27 Log-Likelihood: -1498.9
No. Observations: 506 AIC: 3024.
Df Residuals: 493 BIC: 3079.
Df Model: 12
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 36.3639 5.091 7.143 0.000 26.361 46.366
CRIM -0.1084 0.033 -3.304 0.001 -0.173 -0.044
ZN 0.0459 0.014 3.368 0.001 0.019 0.073
CHAS 2.7164 0.856 3.173 0.002 1.034 4.399
NOX -17.4295 3.681 -4.735 0.000 -24.662 -10.197
RM 3.7970 0.416 9.132 0.000 2.980 4.614
AGE 0.0007 0.013 0.053 0.958 -0.025 0.027
DIS -1.4896 0.195 -7.648 0.000 -1.872 -1.107
RAD 0.2999 0.064 4.710 0.000 0.175 0.425
TAX -0.0118 0.003 -3.489 0.001 -0.018 -0.005
PTRATIO -0.9471 0.130 -7.308 0.000 -1.202 -0.692
B 0.0093 0.003 3.461 0.001 0.004 0.015
LSTAT -0.5235 0.051 -10.361 0.000 -0.623 -0.424
==============================================================================
Omnibus: 178.124 Durbin-Watson: 1.079
Prob(Omnibus): 0.000 Jarque-Bera (JB): 784.481
Skew: 1.521 Prob(JB): 4.49e-171
Kurtosis: 8.287 Cond. No. 1.50e+04
==============================================================================
'''
print(model.params)  # final fitted coefficients of the reduced model
'''
最終結果:
const      36.363882
CRIM       -0.108419
ZN          0.045932
CHAS        2.716403
NOX       -17.429527
RM          3.797021
AGE         0.000697
DIS        -1.489643
RAD         0.299875
TAX        -0.011784
PTRATIO    -0.947065
B           0.009282
LSTAT      -0.523467
Adj. R-squared = 0.734,擬合效果一般般
'''