第3章練習題
1. 爲MNIST數據集構建一個分類器,並在測試集上達成超過97%的精度。(提示:KNeighborsClassifier 對這個任務非常有效,只要找到合適的超參數即可,試試對 weights 和 n_neighbors 這兩個超參數進行網格搜索。)
# Exercise 1: grid-search a KNN classifier on MNIST.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier  # was missing: NameError when this cell runs standalone
from sklearn.metrics import accuracy_score

# Search the two hyperparameters that matter most for KNN.
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Bare expressions: notebook cells displaying the search results.
grid_search.best_params_
grid_search.best_score_
# Final check on the held-out test set (GridSearchCV refits on the full
# training set with the best hyperparameters by default).
y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)
2. 寫一個可以將 MNIST 圖片向任意方向(上下左右)移動一個像素的功能。然後,對訓練集中的每張圖片,創建四個位移後的副本(每個方向一個),添加到訓練集。最後,在這個擴展過的訓練集上訓練模型,衡量其在測試集上的精度。你會發現,模型的表現更好了!這種人工擴展訓練集的技術稱爲數據增廣或訓練集擴展。
# `scipy.ndimage.interpolation` is a deprecated namespace (removed in modern
# SciPy); `shift` lives directly in `scipy.ndimage`.
from scipy.ndimage import shift


def shift_image(image, dx, dy):
    """Return a copy of a flattened 28x28 image shifted by (dx, dy) pixels.

    Parameters
    ----------
    image : array-like of shape (784,)
        One flattened MNIST digit.
    dx, dy : int
        Horizontal / vertical shift in pixels (positive = right / down).

    Returns
    -------
    numpy.ndarray of shape (784,)
        The shifted image, flattened again; vacated pixels are zero-filled.
    """
    image = image.reshape((28, 28))
    # shift() takes offsets in (row, col) order, i.e. (dy, dx).
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
測試一下方法shift_image:
# Sanity-check shift_image on one training digit: plot the original next to
# a down-shifted and a left-shifted copy.
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12, 3))
panels = [
    ("Original", image),
    ("Shifted down", shifted_image_down),
    ("Shifted left", shifted_image_left),
]
# 131/132/133 = 1 row, 3 columns, panel index.
for pos, (title, img) in enumerate(panels, start=131):
    plt.subplot(pos)
    plt.title(title, fontsize=14)
    plt.imshow(img.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()
擴展數據集:
# Data augmentation: add four shifted copies (one per direction) of every
# training image. list(...) is the idiomatic (and faster) way to copy an
# iterable, versus a pass-through comprehension.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)
# Back to arrays so the set can be fancy-indexed when shuffling.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)
打散數據集,隨機排列:
# Shuffle so the four shifted copies of each digit are not grouped together;
# one permutation applied to both arrays keeps images and labels aligned.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]
訓練預測:
# Retrain KNN with the grid-searched hyperparameters on the augmented set,
# then score on the untouched test set.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)
3.處理泰坦尼克數據集(數據已提前下載)。
# Load the Titanic data (downloaded beforehand).
TITANIC_PATH = os.path.join("datasets", "titanic")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one CSV file from the Titanic dataset directory into a DataFrame."""
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)


train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")
# Inspect the data: column dtypes / missing counts, then summary statistics.
train_data.info()
train_data.describe()
# Class distributions of the categorical attributes (bare expressions are
# notebook display cells).
train_data["Survived"].value_counts()
train_data["Pclass"].value_counts()
train_data["Sex"].value_counts()
train_data["Embarked"].value_counts()
#自定義特徵選擇器
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed subset of DataFrame columns, for use inside a Pipeline."""

    def __init__(self, attribute_names):
        # Columns to keep when transforming.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return X[self.attribute_names]
from sklearn.pipeline import Pipeline
try:
from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
from sklearn.preprocessing import Imputer as SimpleImputer
# Numeric pipeline: select the numeric columns and fill missing values with
# each column's median.
num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])
num_pipeline.fit_transform(train_data)
#自定義類別型特徵缺失值處理器
#SimpleImputer只能處理數值型,不能處理類別型,所以自定義類別型缺失值處理器
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing categorical values with each column's most frequent value.

    SimpleImputer handles numeric data only, hence this custom transformer.
    Inspired from stackoverflow.com/questions/25239958
    """

    def fit(self, X, y=None):
        # value_counts() is sorted descending, so idxmax() is the modal
        # category of each column; dict keys preserve column order.
        self.most_frequent_ = pd.Series(
            {col: X[col].value_counts().idxmax() for col in X}
        )
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
#自定義編碼器
class MyLabelBinarizer(TransformerMixin):
    """Adapter that lets LabelBinarizer be used as a Pipeline transformer.

    Pipeline calls fit/transform with an extra ``y`` argument, which
    LabelBinarizer does not accept, so this wrapper swallows it.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        # BUG FIX: the original had a stray bare `for` keyword on this line,
        # which is a SyntaxError.
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
# Categorical pipelines: one per attribute so each gets its own binarizer.
cat_pipeline_step1 = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass"])),
    ("cat_encoder", MyLabelBinarizer()),
])
cat_pipeline_step2 = Pipeline([
    ("select_cat", DataFrameSelector(["Sex"])),
    ("cat_encoder", MyLabelBinarizer()),
])
# Embarked has missing values, so impute the most frequent port first.
cat_pipeline_step3 = Pipeline([
    ("select_cat", DataFrameSelector(["Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", MyLabelBinarizer()),
])
# Concatenate numeric and categorical feature blocks side by side.
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline1", cat_pipeline_step1),
    ("cat_pipeline2", cat_pipeline_step2),
    ("cat_pipeline3", cat_pipeline_step3),
])
# NOTE: this rebinds X_train/y_train (previously the MNIST arrays) to the
# preprocessed Titanic data.
X_train = preprocess_pipeline.fit_transform(train_data)
X_train
y_train = train_data["Survived"]
隨機森林
# Random forest: estimate generalization accuracy with 10-fold CV.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()
SVM
# SVM baseline, scored with the same 10-fold CV for a fair comparison.
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()
畫圖
# Compare the two models: individual fold scores as dots plus box plots.
plt.figure(figsize=(8, 4))
plt.plot([1]*10, svm_scores, ".")
plt.plot([2]*10, forest_scores, ".")
# NOTE(review): boxplot's `labels=` was renamed to `tick_labels=` in
# Matplotlib 3.9 — confirm the target Matplotlib version.
plt.boxplot([svm_scores, forest_scores], labels=("SVM","Random Forest"))
plt.ylabel("Accuracy", fontsize=14)
plt.show()
分箱分桶
# Bucket Age into 15-year bins (0, 15, 30, ...) and check survival per bin.
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()
# Total relatives aboard (siblings/spouses + parents/children) and the
# survival rate for each relative count.
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()