import numpy as np
import matplotlib as mpl
import matplotlib. pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn. linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn. pipeline import Pipeline
from sklearn. model_selection import train_test_split
from sklearn. grid_search import GridSearchCV
from sklearn. tree import DecisionTreeRegressor
from sklearn. pipeline import Pipeline
from sklearn. feature_selection import SelectKBest
from sklearn. feature_selection import chi2
from sklearn. preprocessing import MinMaxScaler
from sklearn. decomposition import PCA
from sklearn. pipeline import Pipeline
from sklearn. model_selection import GridSearchCV
from sklearn. linear_model. coordinate_descent import ConvergenceWarning
def notEmpty(s):
    """Return True when *s* is anything other than the empty string.

    Handy as a ``filter`` predicate for discarding the empty tokens that
    ``str.split(' ')`` yields for runs of consecutive spaces.
    """
    empty_token = ''
    return s != empty_token
# Global matplotlib settings: select the SimHei font (presumably so the CJK
# axis labels and titles used below render) and turn off the Unicode minus
# glyph for axis tick labels.
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
# Suppress ConvergenceWarning messages for the rest of the script.
warnings.filterwarnings('ignore', category=ConvergenceWarning)
# Feature column names; the data file carries one extra trailing column that
# is used as the regression target.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
path = "datas/boston_housing.data"
# The file is read with the default separator and no header; each row is then
# expected to be a single whitespace-separated string in column 0
# (assumption based on how the rows are parsed below -- confirm against the file).
fd = pd.read_csv(path, header=None)
# One row per sample: len(names) feature columns followed by the target column.
data = np.empty((len(fd), len(names) + 1))
for i, d in enumerate(fd.values):
    # str.split() with no argument splits on any run of whitespace and never
    # yields empty tokens, so tabs or repeated spaces are handled uniformly.
    data[i] = [float(token) for token in d[0].split()]
# Split the matrix column-wise into the feature block and the target column.
x, y = np.split(data, (len(names),), axis=1)
y = y.ravel()
print("樣本數據量:%d, 特徵個數:%d" % x.shape)
print("target樣本數據量:%d" % y.shape[0])
# Output:
# 樣本數據量:506, 特徵個數:13
# target樣本數據量:506
# Split the samples with a fixed seed; 80% of them go to the training set.
split_parts = train_test_split(x, y, train_size=0.8, random_state=14)
x_train1, x_test1, y_train1, y_test1 = split_parts
# Working names for the first experiment; the *1 names keep the unscaled
# split so later experiments can start from it again.
x_train, x_test = x_train1, x_test1
y_train, y_test = y_train1, y_test1
print("訓練數據集樣本數目:%d, 測試數據集樣本數目:%d" % (len(x_train), len(x_test)))
# Output:
# 訓練數據集樣本數目:404, 測試數據集樣本數目:102
# Min-max scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
ss = MinMaxScaler()
ss.fit(x_train, y_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)
# Report the per-feature offset (min_) and multiplier (scale_) that were fitted.
print("原始數據各個特徵屬性的調整最小值:", ss.min_)
print("原始數據各個特徵屬性的縮放數據值:", ss.scale_)
# Output:
# 原始數據各個特徵屬性的調整最小值: [-7.10352762e-05 0.00000000e+00 -1.68621701e-02 0.00000000e+00
# -7.92181070e-01 -6.82314620e-01 -2.98661174e-02 -1.02719857e-01
# -4.34782609e-02 -3.56870229e-01 -1.34042553e+00 -6.38977636e-03
# -4.90780142e-02]
# 原始數據各個特徵屬性的縮放數據值: [1.12397589e-02 1.00000000e-02 3.66568915e-02 1.00000000e+00
# 2.05761317e+00 1.91607588e-01 1.02986612e-02 9.09347180e-02
# 4.34782609e-02 1.90839695e-03 1.06382979e-01 2.53562554e-03
# 2.83687943e-02]
# Baseline regression tree (MAE split criterion, depth capped at 7) trained
# on the scaled training split.
model = DecisionTreeRegressor(max_depth=7, criterion='mae').fit(x_train, y_train)
# Keep the test predictions for the comparison plot and report the test score.
y_test_hat = model.predict(x_test)
score = model.score(x_test, y_test)
print("Score:", score)
# Output:
# Score: 0.8176247353755538
# Three linear baselines on the same scaled split. For each one the test
# predictions and the test score are kept for the comparison plot.

# Ordinary least squares.
lr = LinearRegression().fit(x_train, y_train)
lr_y_test_hat = lr.predict(x_test)
lr_score = lr.score(x_test, y_test)
print("lr:", lr_score)

# Lasso, with alpha cross-validated over 20 log-spaced values in [1e-3, 1e1].
lasso = LassoCV(alphas=np.logspace(-3, 1, 20)).fit(x_train, y_train)
lasso_y_test_hat = lasso.predict(x_test)
lasso_score = lasso.score(x_test, y_test)
print("lasso:", lasso_score)

# Ridge, with alpha cross-validated over the same candidate values.
ridge = RidgeCV(alphas=np.logspace(-3, 1, 20)).fit(x_train, y_train)
ridge_y_test_hat = ridge.predict(x_test)
ridge_score = ridge.score(x_test, y_test)
print("ridge:", ridge_score)
# Output:
# lr: 0.6177265992293741
# lasso: 0.6178877460212682
# ridge: 0.6209247731652285
# Overlay the true test targets and each model's test predictions, indexed
# by sample position in the test split.
plt.figure(facecolor='w', figsize=(12, 6))
ln_x_test = range(len(x_test))
# (values, line format, line width, legend label), in drawing order.
curves = [
    (y_test, 'r-', 2, u'實際值'),
    (lr_y_test_hat, 'b-', 2, u'Linear迴歸,$R^2$=%.3f' % lr_score),
    (lasso_y_test_hat, 'y-', 2, u'Lasso迴歸,$R^2$=%.3f' % lasso_score),
    (ridge_y_test_hat, 'c-', 2, u'Ridge迴歸,$R^2$=%.3f' % ridge_score),
    (y_test_hat, 'g-', 4, u'迴歸決策樹預測值,$R^2$=%.3f' % score),
]
for curve_values, curve_fmt, curve_width, curve_label in curves:
    plt.plot(ln_x_test, curve_values, curve_fmt, lw=curve_width, label=curve_label)
plt.title(u'波士頓房屋租賃數據預測')
plt.xlabel(u'數據編碼')
plt.ylabel(u'租賃價格')
plt.grid(True)
plt.legend(loc='lower right')
plt.show()
# Candidate pipelines for the grid search, from most to least preprocessing.
# Step names ('mms', 'pca', 'decision') are the prefixes used by the
# parameter-grid keys, so they must stay as they are.
pipes = [
    # scaling -> PCA -> regression tree
    Pipeline(steps=[
        ('mms', MinMaxScaler()),
        ('pca', PCA()),
        ('decision', DecisionTreeRegressor(criterion='mse')),
    ]),
    # scaling -> regression tree
    Pipeline(steps=[
        ('mms', MinMaxScaler()),
        ('decision', DecisionTreeRegressor(criterion='mse')),
    ]),
    # regression tree on the raw features
    Pipeline(steps=[
        ('decision', DecisionTreeRegressor(criterion='mse')),
    ]),
]
# One parameter grid per pipeline, in the same order as `pipes`.
# Every grid searches tree depths 1..20 (int8 values); only the first grid
# also searches the PCA component setting.
parameters = [
    {
        "pca__n_components": [0.25, 0.5, 0.75, 1],
        "decision__max_depth": np.arange(1, 21, dtype=np.int8),
    },
    {
        "decision__max_depth": np.arange(1, 21, dtype=np.int8),
    },
    {
        "decision__max_depth": np.arange(1, 21, dtype=np.int8),
    },
]
# Start again from the unscaled split: any scaling happens inside the pipelines.
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
# Run a cross-validated grid search for each pipeline with its matching grid
# and report the best score and the best parameter combination.
for t, (pipe, param_grid) in enumerate(zip(pipes, parameters)):
    gscv = GridSearchCV(pipe, param_grid=param_grid)
    gscv.fit(x_train2, y_train2)
    print(t, "score值:", gscv.best_score_, "最優參數列表:", gscv.best_params_)
# Output:
# 0 score值: 0.39216027888649446 最優參數列表: {'decision__max_depth': 7, 'pca__n_components': 0.75}
# 1 score值: 0.7421721457495921 最優參數列表: {'decision__max_depth': 9}
# 2 score值: 0.7394834975342223 最優參數列表: {'decision__max_depth': 7}
# Min-max scaling followed by a depth-4 regression tree, evaluated on the
# held-out test split.
mms_best = MinMaxScaler()
decision3 = DecisionTreeRegressor(max_depth=4, criterion='mse')
y_train3, y_test3 = y_train1, y_test1
# Fit the scaler on the unscaled training split, then reuse it for the test split.
x_train3 = mms_best.fit_transform(x_train1, y_train3)
x_test3 = mms_best.transform(x_test1)
decision3.fit(x_train3, y_train3)
# Printed value is the regressor's score() on the test split.
print("正確率:", decision3.score(x_test3, y_test3))
# Output:
# 正確率: 0.8435980902870441
# Depth sweep on the unscaled split: train one tree per max_depth in 1..19,
# print its test score, and plot 1 - score against depth.
x_train4, x_test4, y_train4, y_test4 = x_train1, x_test1, y_train1, y_test1
depths = np.arange(1, 20)
depth_scores = []
for d in depths:
    clf = DecisionTreeRegressor(max_depth=d, criterion='mse')
    clf.fit(x_train4, y_train4)
    score1 = clf.score(x_test4, y_test4)
    depth_scores.append(score1)
    print("%d深度,正確率%.5f" % (d, score1))
# The plotted "error" is simply the complement of each test score.
err_list = [1 - depth_score for depth_score in depth_scores]
plt.figure(facecolor='w')
plt.plot(depths, err_list, 'ro-', lw=3)
plt.title(u'決策樹層次太多導致的擬合問題(欠擬合和過擬合)', fontsize=18)
plt.xlabel(u'決策樹深度', fontsize=16)
plt.ylabel(u'錯誤率', fontsize=16)
plt.grid(True)
plt.show()
# Output:
# 1深度,正確率0.32761
# 2深度,正確率0.62189
# 3深度,正確率0.78241
# 4深度,正確率0.84360
# 5深度,正確率0.83827
# 6深度,正確率0.80707
# 7深度,正確率0.80470
# 8深度,正確率0.79568
# 9深度,正確率0.80153
# 10深度,正確率0.81297
# 11深度,正確率0.79145
# 12深度,正確率0.81686
# 13深度,正確率0.78466
# 14深度,正確率0.77404
# 15深度,正確率0.82007
# 16深度,正確率0.80494
# 17深度,正確率0.78672
# 18深度,正確率0.80029
# 19深度,正確率0.79174
from sklearn import tree
from IPython. display import Image
import pydotplus
# Render the fitted depth-4 tree: export it as Graphviz DOT text (returned as
# a string because out_file is None), build a graph from it, and wrap the PNG
# bytes in an IPython Image.
export_options = dict(filled=True, rounded=True, special_characters=True)
dot_data = tree.export_graphviz(decision3, out_file=None, **export_options)
graph = pydotplus.graph_from_dot_data(dot_data)
png_bytes = graph.create_png()
# Left as a bare expression so a notebook shows it as the cell result.
Image(png_bytes)