Improving Decision Tree Prediction Performance Step by Step with Feature Selection on the Titanic Dataset

Running code example 59 from the book 《python機器學習及實踐》 (Python Machine Learning and Practice) raises an error. After some debugging I fixed it; the main cause is a difference in Python versions (the book's code was written for an older Python, while mine is Python 3).

The original code, as given in the book:

import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

y = titanic['survived']#extract the 'survived' column as the label vector y
x = titanic.drop(['row.names', 'name', 'survived'], axis=1)#drop row.names, name, survived; they are not used as features
x.info()

x['age'].fillna(x['age'].mean(), inplace=True)#the 'age' column has missing values; fill them with the column mean
x.fillna('UNKNOWN', inplace=True)#fill missing values in all other columns with the string 'UNKNOWN'

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
x_train = vec.fit_transform(x_train.to_dict(orient='record'))
x_test = vec.transform(x_test.to_dict(orient='record'))
print(len(vec.feature_names_))

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(x_train, y_train)
print(dt.score(x_test, y_test))

#import the feature selector from sklearn
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
print(dt.score(x_test_fs, y_test))


from sklearn.model_selection import cross_val_score
import numpy as np
percentiles = range(1, 100, 2)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    x_train_fs = fs.fit_transform(x_train, y_train)
    scores = cross_val_score(dt, x_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())
print(results)

opt = np.where(results == results.max())[0]
print('Optimal number of features %d'%percentiles[opt])

import pylab as pl
pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()

from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
dt.score(x_test_fs, y_test)
Running the original code produces the following error:

Traceback (most recent call last):
  File "D:/Python362/a_機器學習及實戰/features_choose.py", line 51, in <module>
    print('Optimal number of features %d'%percentiles[opt])
TypeError: only integer scalar arrays can be converted to a scalar index
After repeated debugging I pinned the error down to opt = np.where(results == results.max())[0]; the short sketch below reproduces the problem in isolation, and after it comes my modified and annotated version of the code.
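np.where returns a tuple of ndarrays, so np.where(...)[0] is still an array rather than an integer, and under Python 3 with newer NumPy a one-element array is no longer accepted as a scalar index into a range or list. A minimal standalone reproduction (my own sketch, not part of the book's code):

import numpy as np

percentiles = range(1, 100, 2)
results = np.array([0.85, 0.88, 0.86])

opt = np.where(results == results.max())[0]  # array([1]): still an ndarray, not an int
print(type(opt))                             # <class 'numpy.ndarray'>
# percentiles[opt]  # TypeError: only integer scalar arrays can be converted to a scalar index
print(percentiles[opt[0]])                   # 3: indexing with a plain integer works

An equivalent and arguably simpler fix is opt = results.argmax(), which returns the index of the maximum directly. Here is the full modified and annotated script: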

import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

y = titanic['survived']#extract the 'survived' column as the label vector y
x = titanic.drop(['row.names', 'name', 'survived'], axis=1)#drop row.names, name, survived; they are not used as features
x.info()

x['age'].fillna(x['age'].mean(), inplace=True)#the 'age' column has missing values; fill them with the column mean
x.fillna('UNKNOWN', inplace=True)#fill missing values in all other columns with the string 'UNKNOWN'

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
x_train = vec.fit_transform(x_train.to_dict(orient='records'))#note: current pandas only accepts 'records'; the book's orient='record' relied on an old abbreviation
x_test = vec.transform(x_test.to_dict(orient='records'))
print(len(vec.feature_names_))

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(x_train, y_train)
print(dt.score(x_test, y_test))

#import the feature selector from sklearn
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)#keep the top 20% of features ranked by chi2 score
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
print(dt.score(x_test_fs, y_test))


from sklearn.model_selection import cross_val_score
import numpy as np
percentiles = range(1, 100, 2)#candidate percentiles: 1, 3, 5, ..., 99
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    x_train_fs = fs.fit_transform(x_train, y_train)
    scores = cross_val_score(dt, x_train_fs, y_train, cv=5)#5-fold cross-validation on the training set
    results = np.append(results, scores.mean())#record the mean score for this percentile
print(results)
#print(results.max())
#print(results == results.max())  # used for debugging

print(np.where(results == results.max()))#returns a tuple of the form (array([3], dtype=int64),); what we need is the index of results.max(), and 3 is that index
#[3] is itself an array, so the 3 sits at position [0][0] of the tuple; the next line extracts it accordingly
opt = np.where(results == results.max())[0][0]#differs from the original code: per the docs, np.where returns an ndarray or a tuple of ndarrays

print('Optimal number of features %d'%percentiles[opt])

import pylab as pl#the book uses pylab; matplotlib.pyplot is preferred in modern code
pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()

from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)#retrain with the optimal percentile found above
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
print(dt.score(x_test_fs, y_test))#print() added: a bare expression shows nothing when run as a script
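As an optional extra check (my own addition, not in the book), SelectPercentile exposes get_support(), which together with the vectorizer's feature_names_ shows exactly which of the 474 one-hot features survive the chi2 filter. A minimal sketch, assuming the fitted fs and vec objects from the script above are still in scope:

import numpy as np

mask = fs.get_support()              # boolean mask over all vectorized features
names = np.array(vec.feature_names_)
print(names[mask])                   # the feature names kept at percentile=7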

Running the modified script produces the following output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 8 columns):
pclass       1313 non-null object
age          633 non-null float64
embarked     821 non-null object
home.dest    754 non-null object
room         77 non-null object
ticket       69 non-null object
boat         347 non-null object
sex          1313 non-null object
dtypes: float64(1), object(7)
memory usage: 82.1+ KB
474
0.814589665653
0.817629179331
[ 0.85063904  0.85673057  0.87602556  0.88622964  0.86691404  0.86896516
  0.86691404  0.87404659  0.86692435  0.86997526  0.86694496  0.86795506
  0.86692435  0.86791383  0.85981241  0.86284271  0.86791383  0.86487322
  0.86690373  0.858792    0.86588332  0.86386312  0.87302618  0.86691404
  0.86489384  0.86791383  0.87098536  0.87199546  0.86690373  0.87202639
  0.87300557  0.87201608  0.86791383  0.87198516  0.86996496  0.87402597
  0.86996496  0.86894455  0.86691404  0.86688312  0.86692435  0.87198516
  0.86184292  0.86284271  0.8598021   0.8597918   0.86491445  0.85981241
  0.86285302  0.85876108]
Optimal number of features 7
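Note how the numbers line up: the highest cross-validation score is the fourth entry, 0.88622964, so its index is 3, and range(1, 100, 2)[3] is 7. Strictly speaking, the 7 reported as the "optimal number of features" is a percentile, not a feature count.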
The resulting plot is shown below:

[Figure: cross-validated accuracy plotted against the percentile of features selected]