第3章练习题
1. 为MNIST数据集构建一个分类器,并在测试集上达成超过97%的精度。(提示:KNeighborsClassifier 对这个任务非常有效,只要找到合适的超参数即可,试试对weights和n_neighbors这两个超参数进行网格搜索。)
# Grid-search the two KNN hyperparameters named in the exercise hint.
from sklearn.model_selection import GridSearchCV
# Candidate values: both weighting schemes crossed with 3, 4 and 5 neighbours.
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]
knn_clf = KNeighborsClassifier()
# cv=5 requests 5-fold cross-validation for every candidate combination.
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Inspect the selected combination and the score it obtained during the search.
grid_search.best_params_
grid_search.best_score_
from sklearn.metrics import accuracy_score
# Predict on the test set with the fitted search object and measure accuracy.
y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)
2. 写一个可以将 MNIST 图片向任意方向(上下左右)移动一个像素的功能。然后,对训练集中的每张图片,创建四个位移后的副本(每个方向一个),添加到训练集。最后,在这个扩展过的训练集上训练模型,衡量其在测试集上的精度。你会发现,模型的表现更好了!这种人工扩展训练集的技术称为数据增广或训练集扩展。
# `scipy.ndimage.interpolation` is a deprecated namespace; `shift` is exposed
# directly by `scipy.ndimage`.
from scipy.ndimage import shift


def shift_image(image, dx, dy):
    """Return a flattened 28x28 image with its content moved by (dx, dy) pixels.

    Args:
        image: Array holding 784 pixel values; it is reshaped to 28x28.
        dx: Horizontal offset in pixels (positive moves the content right).
        dy: Vertical offset in pixels (positive moves the content down).

    Returns:
        A 1-D array containing the shifted image. Pixels vacated by the
        shift are filled with 0.
    """
    image = image.reshape((28, 28))
    # The offsets are given in axis order: rows (dy) first, then columns (dx).
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
测试一下方法shift_image:
# Visual sanity check of shift_image: take one training sample and move it
# 5 pixels down and 5 pixels left.
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12, 3))
# One row of three panels: the original first, then each shifted copy.
for position, title, pixels in (
    (131, "Original", image),
    (132, "Shifted down", shifted_image_down),
    (133, "Shifted left", shifted_image_left),
):
    plt.subplot(position)
    plt.title(title, fontsize=14)
    plt.imshow(pixels.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()
扩展数据集:
# Start from copies of the original samples and labels, kept as Python lists
# so that the shifted images can be appended one at a time.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

# (dx, dy) offsets of one pixel each: right, left, down, up.
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        # A shifted copy keeps the label of the image it was made from.
        y_train_augmented.append(label)

# Convert back to arrays once all copies have been collected.
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)
打散数据集,随机排列:
# Reorder samples and labels with one shared random permutation so that each
# image stays paired with its label.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented, y_train_augmented = (
    X_train_augmented[shuffle_idx],
    y_train_augmented[shuffle_idx],
)
训练预测:
# Train a fresh KNN classifier on the augmented set, reusing the
# hyperparameters selected by the earlier grid search.
knn_clf = KNeighborsClassifier(**grid_search.best_params_).fit(
    X_train_augmented, y_train_augmented
)
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)
3.处理泰坦尼克数据集(数据已提前下载)。
# Load the data
TITANIC_PATH = os.path.join("datasets", "titanic")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one CSV file from the Titanic dataset directory.

    Args:
        filename: Name of the CSV file inside ``titanic_path``.
        titanic_path: Directory that holds the file; defaults to TITANIC_PATH.

    Returns:
        The file contents as returned by ``pd.read_csv``.
    """
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)
# Read the training and test CSV files from the Titanic dataset directory.
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")
# Inspect the data
train_data.info()
train_data.describe()
# Look at how the values of the categorical attributes are distributed
train_data["Survived"].value_counts()
train_data["Pclass"].value_counts()
train_data["Sex"].value_counts()
train_data["Embarked"].value_counts()
# Custom feature selector
from sklearn.base import BaseEstimator, TransformerMixin


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the named columns of its input.

    Args:
        attribute_names: Column names used to index the input in transform.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Do nothing and return self; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names]``."""
        return X[self.attribute_names]
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer  # Scikit-Learn 0.20+
except ImportError:
    # Older releases: expose the preprocessing Imputer under the same name so
    # the rest of the file can use SimpleImputer unconditionally.
    from sklearn.preprocessing import Imputer as SimpleImputer
# Numeric pipeline: select the numeric columns, then fill missing values
# using the "median" strategy.
num_pipeline = Pipeline(
    steps=[
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ],
)
num_pipeline.fit_transform(train_data)
# Custom missing-value imputer for categorical features, used for the
# categorical columns in place of SimpleImputer.
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record, per column of X, the value that occurs most often.

        The result is stored in ``most_frequent_`` as a Series indexed by
        column name. Returns self.
        """
        # value_counts() sorts by descending count, so index[0] is the most
        # common value of the column.
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X],
            index=X.columns,
        )
        return self

    def transform(self, X, y=None):
        """Return X with missing entries replaced by the recorded values."""
        return X.fillna(self.most_frequent_)
# Custom encoder
class MyLabelBinarizer(TransformerMixin):
    """Wrapper around LabelBinarizer whose fit/transform also accept a ``y`` argument.

    All constructor arguments are forwarded unchanged to LabelBinarizer.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on x; ``y`` is accepted and ignored. Returns self."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's transform of x; ``y`` is accepted and ignored."""
        return self.encoder.transform(x)
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
# Categorical pipelines: one per column, each selecting a single attribute
# and encoding it with MyLabelBinarizer.
cat_pipeline_step1 = Pipeline(
    steps=[
        ("select_cat", DataFrameSelector(["Pclass"])),
        ("cat_encoder", MyLabelBinarizer()),
    ],
)
cat_pipeline_step2 = Pipeline(
    steps=[
        ("select_cat", DataFrameSelector(["Sex"])),
        ("cat_encoder", MyLabelBinarizer()),
    ],
)
# "Embarked" additionally goes through MostFrequentImputer before encoding.
cat_pipeline_step3 = Pipeline(
    steps=[
        ("select_cat", DataFrameSelector(["Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", MyLabelBinarizer()),
    ],
)
# Combine the numeric pipeline and the three categorical pipelines into a
# single preprocessing step.
preprocess_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline1", cat_pipeline_step1),
        ("cat_pipeline2", cat_pipeline_step2),
        ("cat_pipeline3", cat_pipeline_step3),
    ],
)
# Fit on the training frame and keep the transformed output as the features.
X_train = preprocess_pipeline.fit_transform(train_data)
X_train
# The target is the "Survived" column.
y_train = train_data["Survived"]
随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest evaluated with 10-fold cross-validation; report the mean score.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(estimator=forest_clf, X=X_train, y=y_train, cv=10)
forest_scores.mean()
SVM
from sklearn.svm import SVC

# SVC evaluated with the same 10-fold cross-validation; report the mean score.
svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10)
svm_scores.mean()
画图
plt.figure(figsize=(8, 4))
# Draw the ten individual fold scores of each model as dots at x=1 (SVM) and
# x=2 (random forest), then overlay a box plot of the same scores.
for x_position, scores in enumerate((svm_scores, forest_scores), start=1):
    plt.plot([x_position] * 10, scores, ".")
plt.boxplot([svm_scores, forest_scores], labels=("SVM","Random Forest"))
plt.ylabel("Accuracy", fontsize=14)
plt.show()
分箱分桶
# Bucket ages into bins of width 15 (floor-divide by 15, then multiply back)
train_data["AgeBucket"] = train_data["Age"] // 15 * 15
# Mean of "Survived" for each age bucket
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()
# Group by the number of relatives
# Number of relatives on board (SibSp + Parch) and the survival rate per group
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()