1.繪製交叉驗證預測。
# Plotting Cross-Validated Predictions
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
# Generate cross-validated estimates for each input data point
from sklearn import linear_model
import matplotlib.pyplot as plt
lr = linear_model.LinearRegression()
boston = datasets.load_boston()
# Load and return the boston house-prices dataset (regression).
# from sklearn.datasets import load_boston
# boston = load_boston()
# print(boston.data.shape)
y = boston.target
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(lr, boston.data, y, cv=10)
fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
結果:
2.保序迴歸。
print(__doc__)
# Author: Nelle Varoquaux <[email protected]>
# Alexandre Gramfort <[email protected]>
# License: BSD
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn.linear_model import LinearRegression
# LinearRegression 普通最小二乘線性迴歸
from sklearn.isotonic import IsotonicRegression
# IsotonicRegression 保序迴歸模型
from sklearn.utils import check_random_state
# check_random_state:Turn seed into a np.random.RandomState instance
n = 100
x = np.arange(n)
rs = check_random_state(0) # seed = 0
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
# Fit IsotonicRegression and LinearRegression models
ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)
lr = LinearRegression()
lr.fit(x[:, np.newaxis], y) # x needs to be 2d for LinearRegression
# plot result
segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y))) # 全1
lc.set_linewidths(0.5 * np.ones(n))
fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
plt.gca().add_collection(lc)
plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
plt.title('Isotonic regression')
plt.show()
結果:
3.流水線:鏈接PCA和線性迴歸。
PCA:主成分分析。
# Pipelining: chaining a PCA and a logistic regression
# PCA進行無監督降維,logistic迴歸進行預測。
# PCA:主成分分析
print(__doc__)
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
# Pipeline of transforms with a final estimator.
from sklearn.model_selection import GridSearchCV
# GridSearchCV:Exhaustive search over specified parameter values for an estimator.
logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
digits = datasets.load_digits() # Load and return the digits dataset (classification).
X_digits = digits.data
y_digits = digits.target
# Plot the PCA spectrum
pca.fit(X_digits)
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
# Prediction
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3) # Return numbers spaced evenly on a log scale.
#Parameters of pipelines can be set using ‘__’ separated parameter names:
estimator = GridSearchCV(pipe,
dict(pca__n_components=n_components,
logistic__C=Cs))
estimator.fit(X_digits, y_digits)
plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
linestyle=':', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()
結果:
4.運用 Pipline 和 GridSearchCV 降維。不知道爲什麼報錯。
# Selecting dimensionality reduction with Pipeline and GridSearchCV
# This example constructs a pipeline that does dimensionality reduction
# followed by prediction with a support vector classifier.
# It demonstrates the use of GridSearchCV and Pipeline to optimize over
# different classes of estimators in a single CV run – unsupervised PCA and NMF
# dimensionality reductions are compared to univariate feature selection during
# the grid search.
# Authors: Robert McGibbon, Joel Nothman
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
# Load and return the digits dataset (classification).
from sklearn.model_selection import GridSearchCV
# Exhaustive search over specified parameter values for an estimator.
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Linear Support Vector Classification.
from sklearn.decomposition import PCA, NMF
# Principal component analysis (PCA)
# Non-Negative Matrix Factorization (NMF)
from sklearn.feature_selection import SelectKBest, chi2
# Select features according to the k highest scores.
# chi2:Compute chi-squared stats between each non-negative feature and class.
print(__doc__)
pipe = Pipeline([
('reduce_dim', PCA()),
('classify', LinearSVC())
])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
{
'reduce_dim': [PCA(iterated_power=7), NMF()],
'reduce_dim__n_components': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
{
'reduce_dim': [SelectKBest(chi2)],
'reduce_dim__k': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']
grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)
mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# select score for best C
mean_scores = mean_scores.max(axis=0)
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
(len(reducer_labels) + 1) + .5)
plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])
plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Digit classification accuracy')
plt.ylim((0, 1))
plt.legend(loc='upper left')
plt.show()
無結果,報錯。
5.多種方法的面部補完。
# Face completion with a multi-output estimators
# This example shows the use of multi-output estimator to complete images.
# The goal is to predict the lower half of a face given its upper half.
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
# Loader for the Olivetti faces data-set from AT&T.
from sklearn.utils.validation import check_random_state
# Turn seed into a np.random.RandomState instance
from sklearn.ensemble import ExtraTreesRegressor
# An extra-trees regressor.
from sklearn.neighbors import KNeighborsRegressor
# Regression based on k-nearest neighbors.
from sklearn.linear_model import LinearRegression
# Ordinary least squares Linear Regression.
from sklearn.linear_model import RidgeCV
# Ridge regression with built-in cross-validation.
# By default, it performs Generalized Cross-Validation,
# which is a form of efficient Leave-One-Out cross-validation.
# Load the faces datasets
data = fetch_olivetti_faces()
targets = data.target
data = data.images.reshape((len(data.images), -1))
train = data[targets < 30]
test = data[targets >= 30] # Test on independent people
# Test on a subset of people
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]
n_pixels = data.shape[1]
# Upper half of the faces
X_train = train[:, :(n_pixels + 1) // 2]
# Lower half of the faces
y_train = train[:, n_pixels // 2:]
X_test = test[:, :(n_pixels + 1) // 2]
y_test = test[:, n_pixels // 2:]
# Fit estimators
ESTIMATORS = {
"Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
random_state=0),
"K-nn": KNeighborsRegressor(),
"Linear regression": LinearRegression(),
"Ridge": RidgeCV(),
}
y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
estimator.fit(X_train, y_train)
y_test_predict[name] = estimator.predict(X_test)
# Plot the completed faces
image_shape = (64, 64)
n_cols = 1 + len(ESTIMATORS)
plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
plt.suptitle("Face completion with multi-output estimators", size=16)
for i in range(n_faces):
true_face = np.hstack((X_test[i], y_test[i]))
if i:
sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
else:
sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
title="true faces")
sub.axis("off")
sub.imshow(true_face.reshape(image_shape),
cmap=plt.cm.gray,
interpolation="nearest")
for j, est in enumerate(sorted(ESTIMATORS)):
completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
if i:
sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)
else:
sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
title=est)
sub.axis("off")
sub.imshow(completed_face.reshape(image_shape),
cmap=plt.cm.gray,
interpolation="nearest")
plt.show()
結果: