03_Decision Tree Case 2: Boston Housing Rental Price Prediction

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
def notEmpty(s):
    return s != ''
mpl.rcParams['font.sans-serif'] = [u'simHei']  # font able to render CJK labels
mpl.rcParams['axes.unicode_minus'] = False
## Suppress convergence warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
names = ['CRIM','ZN', 'INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
path = "datas/boston_housing.data"
## The file's formatting is not uniform, so each line is first read as a single field and then split into individual values
fd = pd.read_csv(path, header=None)
data = np.empty((len(fd), 14))
for i, d in enumerate(fd.values):
    d = map(float, filter(notEmpty, d[0].split(' ')))
    data[i] = list(d)
x, y = np.split(data, (13,), axis=1)  # first 13 columns are the features, the last column is the target
y = y.ravel()
print ("樣本數據量:%d, 特徵個數:%d" % x.shape)
print ("target樣本數據量:%d" % y.shape[0])
樣本數據量:506, 特徵個數:13
target樣本數據量:506
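Since each record sits on a single whitespace-separated line, the same matrix can also be loaded in one call by letting pandas split on runs of whitespace. This is only an equivalent sketch of the loop above, not part of the original post:

# Alternative load: split each line on arbitrary whitespace (assumes the same 506 x 14 layout)
data_alt = pd.read_csv(path, header=None, sep=r'\s+').values
print(data_alt.shape)  # expected: (506, 14)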
# Split the data into training and test sets
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, train_size=0.8, random_state=14)
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print("Training set size: %d, test set size: %d" % (x_train.shape[0], x_test.shape[0]))
Training set size: 404, test set size: 102
# Normalization (min-max scaling)
ss = MinMaxScaler()

x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)

print ("原始數據各個特徵屬性的調整最小值:",ss.min_)
print ("原始數據各個特徵屬性的縮放數據值:",ss.scale_)
原始數據各個特徵屬性的調整最小值: [-7.10352762e-05  0.00000000e+00 -1.68621701e-02  0.00000000e+00
 -7.92181070e-01 -6.82314620e-01 -2.98661174e-02 -1.02719857e-01
 -4.34782609e-02 -3.56870229e-01 -1.34042553e+00 -6.38977636e-03
 -4.90780142e-02]
原始數據各個特徵屬性的縮放數據值: [1.12397589e-02 1.00000000e-02 3.66568915e-02 1.00000000e+00
 2.05761317e+00 1.91607588e-01 1.02986612e-02 9.09347180e-02
 4.34782609e-02 1.90839695e-03 1.06382979e-01 2.53562554e-03
 2.83687943e-02]
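These two attributes are simply the parameters of the affine map that MinMaxScaler applies, X_scaled = X * scale_ + min_. A quick sanity check (not part of the original post):

# Verify the transform by applying it manually to the raw training data
manual = x_train1 * ss.scale_ + ss.min_
print(np.allclose(manual, x_train))  # expected: True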
# Build the model (regression tree)
model = DecisionTreeRegressor(criterion='mae', max_depth=7)  # note: 'mae' is named 'absolute_error' in newer scikit-learn
# Train the model
model.fit(x_train, y_train)
# Predict on the test set
y_test_hat = model.predict(x_test)
# Evaluate the model (R^2 on the test set)
score = model.score(x_test, y_test)
print("Score:", score)
Score: 0.8176247353755538
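To see which inputs the fitted tree actually relies on, its impurity-based feature importances can be inspected. This is a small illustrative addition, using the names list defined above:

# Rank features by the tree's impurity-based importance
importances = pd.Series(model.feature_importances_, index=names).sort_values(ascending=False)
print(importances.head(5))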
# Linear regression
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_test_hat = lr.predict(x_test)
lr_score = lr.score(x_test, y_test)
print("lr:", lr_score)
# Lasso regression with a cross-validated alpha
lasso = LassoCV(alphas=np.logspace(-3, 1, 20))
lasso.fit(x_train, y_train)
lasso_y_test_hat = lasso.predict(x_test)
lasso_score = lasso.score(x_test, y_test)
print("lasso:", lasso_score)
# Ridge regression with a cross-validated alpha
ridge = RidgeCV(alphas=np.logspace(-3, 1, 20))
ridge.fit(x_train, y_train)
ridge_y_test_hat = ridge.predict(x_test)
ridge_score = ridge.score(x_test, y_test)
print("ridge:", ridge_score)
lr: 0.6177265992293741
lasso: 0.6178877460212682
ridge: 0.6209247731652285
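LassoCV and RidgeCV also record which regularization strength the cross-validation settled on; printing it is a small addition for inspection (the values are not shown in the original output):

# Regularization strengths selected from the alphas grid
print("lasso alpha:", lasso.alpha_)
print("ridge alpha:", ridge.alpha_)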
## 7. Plot the results
plt.figure(figsize=(12, 6), facecolor='w')
ln_x_test = range(len(x_test))
plt.plot(ln_x_test, y_test, 'r-', lw=2, label='Actual values')
plt.plot(ln_x_test, lr_y_test_hat, 'b-', lw=2, label='Linear regression, $R^2$=%.3f' % lr_score)
plt.plot(ln_x_test, lasso_y_test_hat, 'y-', lw=2, label='Lasso regression, $R^2$=%.3f' % lasso_score)
plt.plot(ln_x_test, ridge_y_test_hat, 'c-', lw=2, label='Ridge regression, $R^2$=%.3f' % ridge_score)
plt.plot(ln_x_test, y_test_hat, 'g-', lw=4, label='Decision tree regression, $R^2$=%.3f' % score)
plt.xlabel('Sample index')
plt.ylabel('Rental price')
plt.legend(loc='lower right')
plt.grid(True)
plt.title('Boston housing rental price prediction')
plt.show()

[Figure: actual test-set prices vs. the predictions of the linear, Lasso, Ridge, and decision tree models]

# Hyperparameter tuning: compare three pipelines with GridSearchCV
pipes = [
    Pipeline([
            ('mms', MinMaxScaler()),  ## min-max normalization
            ('pca', PCA()),  ## dimensionality reduction
            ('decision', DecisionTreeRegressor(criterion='mse'))  # 'mse' is named 'squared_error' in newer scikit-learn
        ]),
    Pipeline([
            ('mms', MinMaxScaler()),
            ('decision', DecisionTreeRegressor(criterion='mse'))
        ]),
    Pipeline([
            ('decision', DecisionTreeRegressor(criterion='mse'))
        ])
]

# Parameter grids (one per pipeline)
parameters = [
    {
    "pca__n_components": [0.25,0.5,0.75,1],
    "decision__max_depth":  np.linspace(1,20,20).astype(np.int8)
    },
    {
    "decision__max_depth":  np.linspace(1,20,20).astype(np.int8)
    },
    {
    "decision__max_depth":  np.linspace(1,20,20).astype(np.int8)
    }
]
# Reuse the same train/test split
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1

for t in range(3):
    pipe = pipes[t]

    gscv = GridSearchCV(pipe, param_grid=parameters[t])

    gscv.fit(x_train2, y_train2)
    
    print (t,"score值:",gscv.best_score_,"最優參數列表:", gscv.best_params_)
0 score值: 0.39216027888649446 最優參數列表: {'decision__max_depth': 7, 'pca__n_components': 0.75}
1 score值: 0.7421721457495921 最優參數列表: {'decision__max_depth': 9}
2 score值: 0.7394834975342223 最優參數列表: {'decision__max_depth': 7}
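With the default refit=True, each GridSearchCV refits its best configuration on the whole training split, so the last gscv from the loop (the pipeline without scaling or PCA) can be scored on the held-out data directly. A hedged follow-up, not in the original post:

# Evaluate the refit best estimator of the last pipeline on the test split
print("test R^2 of the best pipeline:", gscv.best_estimator_.score(x_test2, y_test2))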
# Check the test R^2 with the chosen parameters (max_depth=4, which the depth sweep below shows gives the best test score)
mms_best = MinMaxScaler()
decision3 = DecisionTreeRegressor(criterion='mse', max_depth=4)

x_train3, x_test3, y_train3, y_test3 = x_train1, x_test1, y_train1, y_test1
x_train3 = mms_best.fit_transform(x_train3, y_train3)
x_test3 = mms_best.transform(x_test3)
decision3.fit(x_train3, y_train3)

print ("正確率:", decision3.score(x_test3, y_test3))
正確率: 0.8435980902870441
# Examine the error rate at different tree depths
x_train4, x_test4, y_train4, y_test4 = x_train1, x_test1, y_train1, y_test1

depths = np.arange(1, 20)
err_list = []
for d in depths:
    clf = DecisionTreeRegressor(criterion='mse', max_depth=d)
    clf.fit(x_train4, y_train4)
    
    score1 = clf.score(x_test4, y_test4)
    err = 1 - score1
    err_list.append(err)
    print ("%d深度,正確率%.5f" % (d, score1))

## Plot
plt.figure(facecolor='w')
plt.plot(depths, err_list, 'ro-', lw=3)
plt.xlabel('Tree depth', fontsize=16)
plt.ylabel('Error rate (1 - $R^2$)', fontsize=16)
plt.grid(True)
plt.title('Underfitting and overfitting as the tree depth changes', fontsize=18)
plt.show()
depth 1, test R^2 0.32761
depth 2, test R^2 0.62189
depth 3, test R^2 0.78241
depth 4, test R^2 0.84360
depth 5, test R^2 0.83827
depth 6, test R^2 0.80707
depth 7, test R^2 0.80470
depth 8, test R^2 0.79568
depth 9, test R^2 0.80153
depth 10, test R^2 0.81297
depth 11, test R^2 0.79145
depth 12, test R^2 0.81686
depth 13, test R^2 0.78466
depth 14, test R^2 0.77404
depth 15, test R^2 0.82007
depth 16, test R^2 0.80494
depth 17, test R^2 0.78672
depth 18, test R^2 0.80029
depth 19, test R^2 0.79174

[Figure: test-set error rate (1 - $R^2$) as a function of tree depth]
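The sweep above selects the depth by scoring on the test set. A hedged alternative sketch, not in the original post, is to pick the depth by cross-validation on the training data only, which mirrors what GridSearchCV did earlier:

# Choose the depth by 5-fold cross-validation on the training split
from sklearn.model_selection import cross_val_score
cv_scores = [cross_val_score(DecisionTreeRegressor(criterion='mse', max_depth=d),
                             x_train4, y_train4, cv=5).mean() for d in depths]
print("depth chosen by 5-fold CV:", depths[int(np.argmax(cv_scores))])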

# Method 3: render the fitted tree directly as an image
from sklearn import tree
from IPython.display import Image  
import pydotplus
dot_data = tree.export_graphviz(decision3, out_file=None,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

[Figure: the exported decision tree diagram]
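If Graphviz and pydotplus are not available, scikit-learn (0.21+) also ships a matplotlib-based tree.plot_tree. A minimal sketch for the same fitted model:

# Matplotlib-only alternative to the Graphviz export
plt.figure(figsize=(16, 8))
tree.plot_tree(decision3, feature_names=names, filled=True, rounded=True)
plt.show()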
