第3章練習題
1. 爲MNIST數據集構建一個分類器,並在測試集上達成超過97%的精度。(提示:KNeighborsClassifier 對這個任務非常有效,只要找到合適的超參數即可,試試對 weights 和 n_neighbors 這兩個超參數進行網格搜索。)
# Exercise 1: grid-search a KNN classifier on MNIST.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier  # was missing: NameError when this cell runs standalone
from sklearn.metrics import accuracy_score

# Search the two hyperparameters that matter most for KNN.
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Bare expressions: notebook cells displaying the search results.
grid_search.best_params_
grid_search.best_score_
# Final check on the held-out test set (GridSearchCV refits on the full
# training set with the best hyperparameters by default).
y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)
2. 寫一個可以將 MNIST 圖片向任意方向(上下左右)移動一個像素的功能。然後,對訓練集中的每張圖片,創建四個位移後的副本(每個方向一個),添加到訓練集。最後,在這個擴展過的訓練集上訓練模型,衡量其在測試集上的精度。你會發現,模型的表現更好了!這種人工擴展訓練集的技術稱爲數據增廣或訓練集擴展。
# `scipy.ndimage.interpolation` is a deprecated namespace (removed in modern
# SciPy); `shift` lives directly in `scipy.ndimage`.
from scipy.ndimage import shift


def shift_image(image, dx, dy):
    """Return a copy of a flattened 28x28 image shifted by (dx, dy) pixels.

    Parameters
    ----------
    image : array-like of shape (784,)
        One flattened MNIST digit.
    dx, dy : int
        Horizontal / vertical shift in pixels (positive = right / down).

    Returns
    -------
    numpy.ndarray of shape (784,)
        The shifted image, flattened again; vacated pixels are zero-filled.
    """
    image = image.reshape((28, 28))
    # shift() takes offsets in (row, col) order, i.e. (dy, dx).
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
測試一下方法shift_image:
# Sanity-check shift_image on one training digit: plot the original next to
# a down-shifted and a left-shifted copy.
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12, 3))
panels = [
    ("Original", image),
    ("Shifted down", shifted_image_down),
    ("Shifted left", shifted_image_left),
]
# 131/132/133 = 1 row, 3 columns, panel index.
for pos, (title, img) in enumerate(panels, start=131):
    plt.subplot(pos)
    plt.title(title, fontsize=14)
    plt.imshow(img.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()
擴展數據集:
# Data augmentation: add four shifted copies (one per direction) of every
# training image. list(...) is the idiomatic (and faster) way to copy an
# iterable, versus a pass-through comprehension.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)
# Back to arrays so the set can be fancy-indexed when shuffling.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)
打散數據集,隨機排列:
# Shuffle so the four shifted copies of each digit are not grouped together;
# one permutation applied to both arrays keeps images and labels aligned.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]
訓練預測:
# Retrain KNN with the grid-searched hyperparameters on the augmented set,
# then score on the untouched test set.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)
3.處理泰坦尼克數據集(數據已提前下載)。
# Load the Titanic data (downloaded beforehand).
TITANIC_PATH = os.path.join("datasets", "titanic")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one CSV file from the Titanic dataset directory into a DataFrame."""
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)


train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")
# Inspect the data: column dtypes / missing counts, then summary statistics.
train_data.info()
train_data.describe()
# Class distributions of the categorical attributes (bare expressions are
# notebook display cells).
train_data["Survived"].value_counts()
train_data["Pclass"].value_counts()
train_data["Sex"].value_counts()
train_data["Embarked"].value_counts()
#自定義特徵選擇器
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed subset of DataFrame columns, for use inside a Pipeline."""

    def __init__(self, attribute_names):
        # Columns to keep when transforming.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return X[self.attribute_names]
from sklearn.pipeline import Pipeline
try:
from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
from sklearn.preprocessing import Imputer as SimpleImputer
# Numeric pipeline: select the numeric columns and fill missing values with
# each column's median.
num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])
num_pipeline.fit_transform(train_data)
#自定義類別型特徵缺失值處理器
#SimpleImputer只能處理數值型,不能處理類別型,所以自定義類別型缺失值處理器
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing categorical values with each column's most frequent value.

    SimpleImputer handles numeric data only, hence this custom transformer.
    Inspired from stackoverflow.com/questions/25239958
    """

    def fit(self, X, y=None):
        # value_counts() is sorted descending, so idxmax() is the modal
        # category of each column; dict keys preserve column order.
        self.most_frequent_ = pd.Series(
            {col: X[col].value_counts().idxmax() for col in X}
        )
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
#自定義編碼器
class MyLabelBinarizer(TransformerMixin):
    """Adapter that lets LabelBinarizer be used as a Pipeline transformer.

    Pipeline calls fit/transform with an extra ``y`` argument, which
    LabelBinarizer does not accept, so this wrapper swallows it.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        # BUG FIX: the original had a stray bare `for` keyword on this line,
        # which is a SyntaxError.
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
# Categorical pipelines: one per attribute so each gets its own binarizer.
cat_pipeline_step1 = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass"])),
    ("cat_encoder", MyLabelBinarizer()),
])
cat_pipeline_step2 = Pipeline([
    ("select_cat", DataFrameSelector(["Sex"])),
    ("cat_encoder", MyLabelBinarizer()),
])
# Embarked has missing values, so impute the most frequent port first.
cat_pipeline_step3 = Pipeline([
    ("select_cat", DataFrameSelector(["Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", MyLabelBinarizer()),
])
# Concatenate numeric and categorical feature blocks side by side.
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline1", cat_pipeline_step1),
    ("cat_pipeline2", cat_pipeline_step2),
    ("cat_pipeline3", cat_pipeline_step3),
])
# NOTE: this rebinds X_train/y_train (previously the MNIST arrays) to the
# preprocessed Titanic data.
X_train = preprocess_pipeline.fit_transform(train_data)
X_train
y_train = train_data["Survived"]
隨機森林
# Random forest: estimate generalization accuracy with 10-fold CV.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()
SVM
# SVM baseline, scored with the same 10-fold CV for a fair comparison.
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()
畫圖
# Compare the two models: individual fold scores as dots plus box plots.
plt.figure(figsize=(8, 4))
plt.plot([1]*10, svm_scores, ".")
plt.plot([2]*10, forest_scores, ".")
# NOTE(review): boxplot's `labels=` was renamed to `tick_labels=` in
# Matplotlib 3.9 — confirm the target Matplotlib version.
plt.boxplot([svm_scores, forest_scores], labels=("SVM","Random Forest"))
plt.ylabel("Accuracy", fontsize=14)
plt.show()
分箱分桶
# Bucket Age into 15-year bins (0, 15, 30, ...) and check survival per bin.
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()
# Total relatives aboard (siblings/spouses + parents/children) and the
# survival rate for each relative count.
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()