運行《Python機器學習及實踐》代碼59會出錯,本人經過調試對其進行了改進;出錯的主要原因是Python版本不同,我使用的是Python 3。
源代碼:
# Original script as published in the book. NOTE(review): under Python 3 /
# modern NumPy this fails at `percentiles[opt]` below -- see the comments
# on those lines; the article's fixed version follows.
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
y = titanic['survived']#extract the 'survived' column as the label vector y
x = titanic.drop(['row.names', 'name', 'survived'], axis=1)#drop row.names, name, survived: not used as features
x.info()
x['age'].fillna(x['age'].mean(), inplace=True)#'age' has missing values; fill them with the mean of the present values
x.fillna('UNKNOWN', inplace=True)#fill every other missing value with the token 'UNKNOWN'
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
x_train = vec.fit_transform(x_train.to_dict(orient='record'))
x_test = vec.transform(x_test.to_dict(orient='record'))
print(len(vec.feature_names_))
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(x_train, y_train)
print(dt.score(x_test, y_test))
# import the feature selector from sklearn
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
print(dt.score(x_test_fs, y_test))
from sklearn.model_selection import cross_val_score
import numpy as np
percentiles = range(1, 100, 2)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    x_train_fs = fs.fit_transform(x_train, y_train)
    scores = cross_val_score(dt, x_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())
print(results)
# BUG (the article's subject): np.where returns a *tuple* of index arrays,
# so `opt` here is an ndarray, not a scalar index.
opt = np.where(results == results.max())[0]
# TypeError raised here on Python 3: "only integer scalar arrays can be
# converted to a scalar index" -- `percentiles[opt]` needs a scalar.
print('Optimal number of features %d'%percentiles[opt])
import pylab as pl
pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
dt.score(x_test_fs, y_test)
運行源代碼會出現如下錯誤:
Traceback (most recent call last):
File "D:/Python362/a_機器學習及實戰/features_choose.py", line 51, in <module>
print('Optimal number of features %d'%percentiles[opt])
TypeError: only integer scalar arrays can be converted to a scalar index
經過多次調試,錯誤代碼鎖定在opt = np.where(results == results.max())[0]上,以下代碼是本人對其進行修改並註釋的代碼:
# Fixed version for Python 3 / current libraries. Changes vs. the book:
#   * np.where(...) returns a tuple of index arrays; take the scalar best
#     index with np.argmax instead of np.where(...)[0] (the TypeError's cause).
#   * to_dict(orient='records') is the documented spelling; 'record' is
#     rejected by current pandas.
#   * 'age' is imputed by plain assignment rather than chained
#     inplace fillna, which is deprecated in current pandas.
import pandas as pd

# Load the Titanic passenger data (network fetch; host must be reachable).
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
y = titanic['survived']  # the 'survived' column is the label vector
# Drop identifier columns and the label; the remaining columns are features.
x = titanic.drop(['row.names', 'name', 'survived'], axis=1)
x.info()
# 'age' has missing values: impute them with the mean of the present values.
x['age'] = x['age'].fillna(x['age'].mean())
# Every other missing value (all object columns) becomes the token 'UNKNOWN'.
x = x.fillna('UNKNOWN')

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)

# One-hot encode the categorical features via DictVectorizer.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
x_train = vec.fit_transform(x_train.to_dict(orient='records'))
x_test = vec.transform(x_test.to_dict(orient='records'))
print(len(vec.feature_names_))

# Baseline: decision tree trained on all vectorized features.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(x_train, y_train)
print(dt.score(x_test, y_test))

# Feature selection: keep only the top 20% of features by chi2 score.
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
print(dt.score(x_test_fs, y_test))

# Sweep the kept-feature percentile and score each setting with 5-fold CV.
from sklearn.model_selection import cross_val_score
import numpy as np
percentiles = range(1, 100, 2)
results = []
for pct in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=pct)
    x_train_fs = fs.fit_transform(x_train, y_train)
    scores = cross_val_score(dt, x_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())
print(results)

# np.where(results == results.max()) returns a tuple like (array([3]),),
# which cannot index `percentiles` directly (TypeError on Python 3).
# np.argmax yields the scalar index of the best mean CV score in one step
# and avoids the fragile float-equality comparison as well.
opt = int(np.argmax(results))
print('Optimal number of features %d' % percentiles[opt])

import pylab as pl
pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()

# Retrain with the best percentile found above (7% for this dataset/split).
from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
x_train_fs = fs.fit_transform(x_train, y_train)
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
dt.score(x_test_fs, y_test)
最終定位結果如下:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 8 columns):
pclass 1313 non-null object
age 633 non-null float64
embarked 821 non-null object
home.dest 754 non-null object
room 77 non-null object
ticket 69 non-null object
boat 347 non-null object
sex 1313 non-null object
dtypes: float64(1), object(7)
memory usage: 82.1+ KB
474
0.814589665653
0.817629179331
[ 0.85063904 0.85673057 0.87602556 0.88622964 0.86691404 0.86896516
0.86691404 0.87404659 0.86692435 0.86997526 0.86694496 0.86795506
0.86692435 0.86791383 0.85981241 0.86284271 0.86791383 0.86487322
0.86690373 0.858792 0.86588332 0.86386312 0.87302618 0.86691404
0.86489384 0.86791383 0.87098536 0.87199546 0.86690373 0.87202639
0.87300557 0.87201608 0.86791383 0.87198516 0.86996496 0.87402597
0.86996496 0.86894455 0.86691404 0.86688312 0.86692435 0.87198516
0.86184292 0.86284271 0.8598021 0.8597918 0.86491445 0.85981241
0.86285302 0.85876108]
Optimal number of features 7
效果圖如下: