Hands-On Machine Learning Study Notes (5): Exercises (Chapter 3: Classification)

Chapter 3 Exercises

1. Build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set. (Hint: KNeighborsClassifier works quite well for this task; you just need to find good hyperparameters. Try a grid search over the weights and n_neighbors hyperparameters.)
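
The snippets below assume MNIST is already loaded into X_train, y_train, X_test, y_test. A minimal loading sketch, assuming scikit-learn's fetch_openml helper and the conventional 60,000/10,000 train/test split:

# Loading sketch (assumption: scikit-learn 0.20+; newer versions may return
# a DataFrame unless you pass as_frame=False)
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]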

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)


grid_search.best_params_
grid_search.best_score_


from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)


2. Write a function that can shift an MNIST image one pixel in any direction (up, down, left, or right). Then, for each image in the training set, create four shifted copies (one per direction) and add them to the training set. Finally, train the model on this expanded training set and measure its accuracy on the test set. You should find that the model performs even better! This technique of artificially growing the training set is called data augmentation, or training set expansion.

# scipy.ndimage.interpolation is deprecated; recent SciPy exposes shift directly
from scipy.ndimage import shift

# Shift an image by dx, dy pixels, padding the exposed edge with zeros (black)
def shift_image(image, dx, dy):
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
Test the shift_image function:

# Quick test: shift one training image down and left by 5 pixels
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

import matplotlib.pyplot as plt

# Original image
plt.figure(figsize=(12,3))
plt.subplot(131)
plt.title("Original", fontsize=14)
plt.imshow(image.reshape(28, 28), interpolation="nearest", cmap="Greys")

# Shifted-down image
plt.subplot(132)
plt.title("Shifted down", fontsize=14)
plt.imshow(shifted_image_down.reshape(28, 28), interpolation="nearest", cmap="Greys")

# Shifted-left image
plt.subplot(133)
plt.title("Shifted left", fontsize=14)
plt.imshow(shifted_image_left.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()


Expand the training set:
import numpy as np

X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)
Shuffle the expanded set into a random order:
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]
Train on the expanded set and evaluate:
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)

3. Tackle the Titanic dataset (the data was downloaded in advance).

  
# Load the data
import os
import pandas as pd

TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")


# Inspect the data
train_data.info()


train_data.describe()


# Look at the categorical attributes
train_data["Survived"].value_counts()
train_data["Pclass"].value_counts()
train_data["Sex"].value_counts()
train_data["Embarked"].value_counts()



# Custom feature selector: picks the given columns so a Pipeline can consume a DataFrame
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer
# Numeric pipeline (missing values imputed with the median)
num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ])
num_pipeline.fit_transform(train_data)

# Custom imputer for categorical features.
# The legacy Imputer only handles numeric data, so we define our own
# most-frequent imputer for categorical columns.
# Inspired by stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Fill missing values with the most frequent category in each column
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
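
Note that on scikit-learn 0.20+, SimpleImputer(strategy="most_frequent") also accepts string columns, so the custom class above is mainly needed for older versions. A quick check on a toy DataFrame (illustrative only):

# Sketch: most-frequent imputation on categorical data with SimpleImputer
# (scikit-learn 0.20+; the toy DataFrame here is a made-up example)
toy = pd.DataFrame({"Embarked": ["S", "C", None, "S"]})
SimpleImputer(strategy="most_frequent").fit_transform(toy)  # None becomes "S"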

# Custom encoder: LabelBinarizer's fit/transform signature does not match the
# (X, y) interface Pipeline expects, so wrap it in a transformer
from sklearn.preprocessing import LabelBinarizer

class MyLabelBinarizer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=None):
        self.encoder.fit(x)
        return self
    def transform(self, x, y=None):
        return self.encoder.transform(x)

from sklearn.pipeline import FeatureUnion

# Categorical pipelines
cat_pipeline_step1 = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass"])),
        ("cat_encoder", MyLabelBinarizer()),
    ])
cat_pipeline_step2 = Pipeline([
        ("select_cat", DataFrameSelector(["Sex"])),
        ("cat_encoder", MyLabelBinarizer()),
    ])
cat_pipeline_step3 = Pipeline([
        ("select_cat", DataFrameSelector(["Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", MyLabelBinarizer()),
    ])


preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline1", cat_pipeline_step1),
        ("cat_pipeline2", cat_pipeline_step2),
        ("cat_pipeline3", cat_pipeline_step3),
    ])

X_train = preprocess_pipeline.fit_transform(train_data)
X_train
y_train = train_data["Survived"]
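
For reference, on scikit-learn 0.20+ the same preprocessing can be written more compactly with ColumnTransformer and OneHotEncoder (which accepts string categories directly), making DataFrameSelector, MostFrequentImputer, and MyLabelBinarizer unnecessary. A sketch, equivalent in spirit to the FeatureUnion above:

# Modern equivalent (assumes scikit-learn 0.20+)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder()),   # one-hot encodes string categories
])
preprocess_alt = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), ["Age", "SibSp", "Parch", "Fare"]),
    ("cat", cat_pipeline, ["Pclass", "Sex", "Embarked"]),
])
X_train_alt = preprocess_alt.fit_transform(train_data)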


Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()


SVM

from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()


Plot the results

plt.figure(figsize=(8, 4))
plt.plot([1]*10, svm_scores, ".")
plt.plot([2]*10, forest_scores, ".")
plt.boxplot([svm_scores, forest_scores], labels=("SVM","Random Forest"))
plt.ylabel("Accuracy", fontsize=14)
plt.show()


Binning

# Bucket Age into 15-year bins
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()


# Bucket by number of relatives aboard
# Survival rate grouped by the number of relatives on board (SibSp + Parch)
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()

