03_決策樹案例二：波士頓房屋租賃價格預測

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model.coordinate_descent import ConvergenceWarning

def notEmpty(s):
    """Report whether *s* is something other than the empty string.

    Suitable as a ``filter`` predicate, e.g. to discard the empty tokens
    that ``str.split(' ')`` yields for runs of consecutive spaces.
    """
    if s == '':
        return False
    return True

# Plot configuration: use the SimHei font for sans-serif text and turn off the
# Unicode minus glyph (presumably so the Chinese labels and negative tick
# values render correctly -- confirm against the fonts installed locally).
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
# Silence ConvergenceWarning messages
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Feature column names; the data file carries one extra trailing column that
# is used as the regression target.
names = ['CRIM','ZN', 'INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
path = "datas/boston_housing.data"
# The data file's formatting is not uniform, so every line is first read as a
# single string field and then parsed row by row.
fd = pd.read_csv(path, header=None)
n_features = len(names)
data = np.empty((len(fd), n_features + 1))
for i, d in enumerate(fd.values):
    # str.split() with no separator splits on any run of whitespace and never
    # yields empty tokens, so no separate empty-token filter is needed and
    # tab-separated values are accepted as well.
    data[i] = [float(token) for token in d[0].split()]
# First len(names) columns are the features, the remaining column the target.
x, y = np.split(data, (n_features,), axis=1)
y = y.ravel()
print("樣本數據量:%d, 特徵個數：%d" % x.shape)
print("target樣本數據量:%d" % y.shape[0])

樣本數據量:506, 特徵個數：13
target樣本數據量:506

# Split the data: 80% of the samples for training, with a fixed random seed.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x, y, train_size=0.8, random_state=14
)
# Working aliases; the *1 names keep the unscaled split for later sections.
x_train, x_test = x_train1, x_test1
y_train, y_test = y_train1, y_test1
print("訓練數據集樣本數目：%d, 測試數據集樣本數目：%d" % (len(x_train), len(x_test)))

訓練數據集樣本數目：404, 測試數據集樣本數目：102

# Min-max scaling: the scaler is fitted on the training split only and then
# applied to both the training and the test split.
ss = MinMaxScaler()
ss.fit(x_train, y_train)

x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

# Show the per-feature offset and scale factor learned by the scaler.
print("原始數據各個特徵屬性的調整最小值:", ss.min_)
print("原始數據各個特徵屬性的縮放數據值:", ss.scale_)

原始數據各個特徵屬性的調整最小值: [-7.10352762e-05  0.00000000e+00 -1.68621701e-02  0.00000000e+00
 -7.92181070e-01 -6.82314620e-01 -2.98661174e-02 -1.02719857e-01
 -4.34782609e-02 -3.56870229e-01 -1.34042553e+00 -6.38977636e-03
 -4.90780142e-02]
原始數據各個特徵屬性的縮放數據值: [1.12397589e-02 1.00000000e-02 3.66568915e-02 1.00000000e+00
 2.05761317e+00 1.91607588e-01 1.02986612e-02 9.09347180e-02
 4.34782609e-02 1.90839695e-03 1.06382979e-01 2.53562554e-03
 2.83687943e-02]

# Build the regression tree (depth limited to 7) and train it in one step;
# fit() returns the estimator itself.
model = DecisionTreeRegressor(max_depth=7, criterion='mae').fit(x_train, y_train)

# Predictions on the test split (plotted later).
y_test_hat = model.predict(x_test)

# Evaluate the model on the test split.
score = model.score(x_test, y_test)
print("Score：", score)

Score： 0.8176247353755538

# Baseline 1: ordinary linear regression
lr = LinearRegression().fit(x_train, y_train)
lr_score = lr.score(x_test, y_test)
lr_y_test_hat = lr.predict(x_test)
print("lr:", lr_score)

# Baseline 2: Lasso, with alpha picked by cross-validation from 20
# log-spaced candidates between 1e-3 and 1e1
lasso = LassoCV(alphas=np.logspace(-3, 1, 20)).fit(x_train, y_train)
lasso_score = lasso.score(x_test, y_test)
lasso_y_test_hat = lasso.predict(x_test)
print("lasso:", lasso_score)

# Baseline 3: ridge regression over the same alpha candidates
ridge = RidgeCV(alphas=np.logspace(-3, 1, 20)).fit(x_train, y_train)
ridge_score = ridge.score(x_test, y_test)
ridge_y_test_hat = ridge.predict(x_test)
print("ridge:", ridge_score)

lr: 0.6177265992293741
lasso: 0.6178877460212682
ridge: 0.6209247731652285

## 7. Plot the actual values against every model's predictions
plt.figure(facecolor='w', figsize=(12,6))
ln_x_test = range(len(x_test))
# One entry per curve, in drawing order: (values, line format, width, legend label)
curves = [
    (y_test, 'r-', 2, u'實際值'),
    (lr_y_test_hat, 'b-', 2, u'Linear迴歸，$R^2$=%.3f' % lr_score),
    (lasso_y_test_hat, 'y-', 2, u'Lasso迴歸，$R^2$=%.3f' % lasso_score),
    (ridge_y_test_hat, 'c-', 2, u'Ridge迴歸，$R^2$=%.3f' % ridge_score),
    (y_test_hat, 'g-', 4, u'迴歸決策樹預測值，$R^2$=%.3f' % score),
]
for series, line_fmt, width, caption in curves:
    plt.plot(ln_x_test, series, line_fmt, lw=width, label=caption)
plt.xlabel(u'數據編碼')
plt.ylabel(u'租賃價格')
plt.legend(loc='lower right')
plt.grid(True)
plt.title(u'波士頓房屋租賃數據預測')
plt.show()

# Parameter tuning: three candidate pipelines, each ending in an MSE-criterion
# regression tree.
pipes = [
    # scaling + PCA + tree
    Pipeline(steps=[
        ('mms', MinMaxScaler()),  # min-max normalisation
        ('pca', PCA()),  # dimensionality reduction
        ('decision', DecisionTreeRegressor(criterion='mse')),
    ]),
    # scaling + tree
    Pipeline(steps=[
        ('mms', MinMaxScaler()),
        ('decision', DecisionTreeRegressor(criterion='mse')),
    ]),
    # tree only
    Pipeline(steps=[
        ('decision', DecisionTreeRegressor(criterion='mse')),
    ]),
]

# Hyper-parameter grids, one per pipeline and in the same order as `pipes`.
# Tree depths 1..20 are tried for every pipeline.
# NOTE(review): the last `pca__n_components` candidate is the integer 1, not
# the float 1.0 -- confirm which of the two was intended.
parameters = [
    {
        "pca__n_components": [0.25, 0.5, 0.75, 1],
        "decision__max_depth": np.arange(1, 21, dtype=np.int8),
    },
    {"decision__max_depth": np.arange(1, 21, dtype=np.int8)},
    {"decision__max_depth": np.arange(1, 21, dtype=np.int8)},
]
# Use the original (unscaled) split; pipelines that need scaling do it themselves.
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1

# Walk the pipelines and their grids in lockstep instead of indexing both
# lists with a hard-coded count, so the loop stays correct if entries are
# added or removed.
for t, (pipe, param_grid) in enumerate(zip(pipes, parameters)):
    gscv = GridSearchCV(pipe, param_grid=param_grid)

    gscv.fit(x_train2, y_train2)

    # Report the best cross-validated score and the parameters that achieved it.
    print(t, "score值:", gscv.best_score_, "最優參數列表:", gscv.best_params_)

0 score值: 0.39216027888649446 最優參數列表: {'decision__max_depth': 7, 'pca__n_components': 0.75}
1 score值: 0.7421721457495921 最優參數列表: {'decision__max_depth': 9}
2 score值: 0.7394834975342223 最優參數列表: {'decision__max_depth': 7}

# Train a min-max-scaled tree with a fixed depth and report its test score.
# NOTE(review): max_depth=4 is not one of the depths reported by the grid
# search output (7 / 9 / 7); it matches the best depth in the test-set depth
# sweep further down -- confirm which value was intended here.
x_train3, x_test3, y_train3, y_test3 = x_train1, x_test1, y_train1, y_test1

# Fit the scaler on the training split only, then scale both splits.
mms_best = MinMaxScaler()
x_train3 = mms_best.fit(x_train3, y_train3).transform(x_train3)
x_test3 = mms_best.transform(x_test3)

decision3 = DecisionTreeRegressor(max_depth=4, criterion='mse')
decision3.fit(x_train3, y_train3)

print("正確率:", decision3.score(x_test3, y_test3))

正確率: 0.8435980902870441

# Measure the test-set error for tree depths 1 through 19 on the unscaled split.
x_train4, x_test4, y_train4, y_test4 = x_train1, x_test1, y_train1, y_test1

depths = np.arange(1, 20)
err_list = []
for depth in depths:
    depth_tree = DecisionTreeRegressor(max_depth=depth, criterion='mse')
    depth_tree.fit(x_train4, y_train4)

    depth_score = depth_tree.score(x_test4, y_test4)
    # "Error" is defined here as one minus the score.
    err_list.append(1 - depth_score)
    print("%d深度，正確率%.5f" % (depth, depth_score))

## Plot the error recorded for each tree depth
plt.figure(facecolor='w')
plt.plot(depths, err_list, 'ro-', linewidth=3)
plt.title(u'決策樹層次太多導致的擬合問題(欠擬合和過擬合)', fontsize=18)
plt.xlabel(u'決策樹深度', fontsize=16)
plt.ylabel(u'錯誤率', fontsize=16)
plt.grid(True)
plt.show()

1深度，正確率0.32761
2深度，正確率0.62189
3深度，正確率0.78241
4深度，正確率0.84360
5深度，正確率0.83827
6深度，正確率0.80707
7深度，正確率0.80470
8深度，正確率0.79568
9深度，正確率0.80153
10深度，正確率0.81297
11深度，正確率0.79145
12深度，正確率0.81686
13深度，正確率0.78466
14深度，正確率0.77404
15深度，正確率0.82007
16深度，正確率0.80494
17深度，正確率0.78672
18深度，正確率0.80029
19深度，正確率0.79174

# Approach 3: generate the tree image directly
from sklearn import tree
from IPython.display import Image
import pydotplus

# Export the fitted tree `decision3` as DOT source (returned as a string
# because out_file is None), then turn it into a graph object.
dot_data = tree.export_graphviz(
    decision3,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
# Left as a bare expression, as in the original.
# NOTE(review): presumably relies on notebook display of the last expression -- confirm.
Image(graph.create_png())

03_決策樹案例二：波士頓房屋租賃價格預測

linux安裝cuda和cudnn

模擬手機設備：使用 Playwright 實現移動端自動化測試

Mellanox網卡開啓SR-IOV

全面系統的AI學習路徑，幫助普通人也能玩轉AI

HTML 00 Tutorial

uni-app實現上拉加載

vue3編譯優化之“靜態提升”

又是一個月-20240513

flask 如何保證返回json有序

linux服務器設置ssh免密

20200308——多項式迴歸預測工資

20191226_2_淘寶乒乓球商品分析

20200203_knn分類算法

深度之眼_Week2 編程作業1_梯度下降

機器學習作業班_python實現支持向量機

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結