Scikit-Learn Official Examples

1. Plotting cross-validated predictions.

# Plotting Cross-Validated Predictions
from sklearn import datasets
from sklearn.model_selection import cross_val_predict 
# Generate cross-validated estimates for each input data point
from sklearn import linear_model
import matplotlib.pyplot as plt

lr = linear_model.LinearRegression()
# Load and return the Boston house-prices dataset (regression).
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this listing needs an older release (or another regression dataset).
boston = datasets.load_boston()
# Equivalent: from sklearn.datasets import load_boston; boston = load_boston()
# print(boston.data.shape)  # (506, 13)
y = boston.target

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(lr, boston.data, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()



Result: (scatter plot of measured vs. cross-validated predicted values, with the y = x reference line)
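Since `predicted` holds one out-of-fold prediction per sample, it can also be scored directly against `y`. A minimal follow-up sketch, assuming the variables from the listing above are still in scope:

from sklearn.metrics import r2_score, mean_squared_error

# Score the cross-validated predictions against the true targets.
print("R^2: %.3f" % r2_score(y, predicted))
print("MSE: %.3f" % mean_squared_error(y, predicted))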



2. Isotonic regression.

print(__doc__)

# Author: Nelle Varoquaux <[email protected]>
#         Alexandre Gramfort <[email protected]>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
# LinearRegression: ordinary least squares linear regression
from sklearn.isotonic import IsotonicRegression
# IsotonicRegression: isotonic (order-preserving) regression model
from sklearn.utils import check_random_state
# check_random_state: turn a seed into a np.random.RandomState instance

n = 100
x = np.arange(n)
rs = check_random_state(0) # seed = 0
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

# Fit IsotonicRegression and LinearRegression models
ir = IsotonicRegression()

y_ = ir.fit_transform(x, y)

lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

# plot result
segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))  # all ones, so every segment gets the same color value
lc.set_linewidths(0.5 * np.ones(n))

fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
plt.gca().add_collection(lc)
plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
plt.title('Isotonic regression')
plt.show()




Result: (red data points, green isotonic fit, blue linear fit, with thin segments linking each point to its fitted value)
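A fitted IsotonicRegression can also be evaluated at new x positions (how points outside the training range are handled is controlled by its `out_of_bounds` parameter). A minimal sketch, assuming `ir` was fitted as above:

import numpy as np

# Interpolate the fitted monotonic function at new positions inside the training range.
x_new = np.array([0.5, 10.5, 50.0, 98.5])
print(ir.predict(x_new))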




3. Pipelining: chaining a PCA and a logistic regression.

PCA: principal component analysis.

# Pipelining: chaining a PCA and a logistic regression
# PCA performs unsupervised dimensionality reduction; logistic regression does the prediction.
# PCA: principal component analysis
print(__doc__)


# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause


import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline 
# Pipeline of transforms with a final estimator.
from sklearn.model_selection import GridSearchCV
# GridSearchCV:Exhaustive search over specified parameter values for an estimator.

logistic = linear_model.LogisticRegression()

pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits() # Load and return the digits dataset (classification).
X_digits = digits.data
y_digits = digits.target

# Plot the PCA spectrum
pca.fit(X_digits)

plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

# Prediction
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3) # Return numbers spaced evenly on a log scale.

# Parameters of pipelines can be set using '__'-separated parameter names
# (see the standalone snippet after this example):

estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))
estimator.fit(X_digits, y_digits)

plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()

Result: (PCA explained-variance spectrum with a vertical line marking the chosen n_components)
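The '__' naming convention used in the grid also works outside GridSearchCV. A minimal sketch, assuming the `pipe`, `X_digits` and `y_digits` defined above (the parameter values here are arbitrary):

# Set nested parameters on the pipeline steps directly, then fit and score.
pipe.set_params(pca__n_components=30, logistic__C=1.0)
pipe.fit(X_digits, y_digits)
print(pipe.score(X_digits, y_digits))

After the grid search, the same '__' names appear as keys in estimator.best_params_.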



4. Selecting dimensionality reduction with Pipeline and GridSearchCV. (This one raised an error for me, and I am not sure why.)

# Selecting dimensionality reduction with Pipeline and GridSearchCV

# This example constructs a pipeline that does dimensionality reduction 
# followed by prediction with a support vector classifier. 
# It demonstrates the use of GridSearchCV and Pipeline to optimize over 
# different classes of estimators in a single CV run – unsupervised PCA and NMF 
# dimensionality reductions are compared to univariate feature selection during 
# the grid search.

# Authors: Robert McGibbon, Joel Nothman

from __future__ import print_function, division

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
# Load and return the digits dataset (classification).
from sklearn.model_selection import GridSearchCV
# Exhaustive search over specified parameter values for an estimator.
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Linear Support Vector Classification.
from sklearn.decomposition import PCA, NMF
# Principal component analysis (PCA)
# Non-Negative Matrix Factorization (NMF)
from sklearn.feature_selection import SelectKBest, chi2
# Select features according to the k highest scores.
# chi2:Compute chi-squared stats between each non-negative feature and class.
print(__doc__)

pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])

N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
# scores are in the order of param_grid iteration, which is alphabetical
mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# select score for best C
mean_scores = mean_scores.max(axis=0)
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)

plt.figure()
COLORS = 'bgrcmyk'
for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
    plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])

plt.title("Comparing feature reduction techniques")
plt.xlabel('Reduced number of features')
plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
plt.ylabel('Digit classification accuracy')
plt.ylim((0, 1))
plt.legend(loc='upper left')
plt.show()

No result; running it raised an error.
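One possible cause, offered only as a guess since the traceback is not shown: with n_jobs=2 the grid search spawns worker processes, and on Windows a plain script then has to protect its entry point with an `if __name__ == '__main__':` guard, otherwise it can fail when the workers re-import the module. A minimal sketch of that structure, reusing the `pipe`, `param_grid` and `load_digits` from the listing above:

def main():
    digits = load_digits()
    grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
    grid.fit(digits.data, digits.target)
    # ... plotting code as above ...

if __name__ == '__main__':
    main()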


5. Face completion with several different estimators.

# Face completion with multi-output estimators
# This example shows the use of multi-output estimators to complete images.
# The goal is to predict the lower half of a face given its upper half.

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces
# Loader for the Olivetti faces data-set from AT&T.
from sklearn.utils.validation import check_random_state
# Turn seed into a np.random.RandomState instance

from sklearn.ensemble import ExtraTreesRegressor
# An extra-trees regressor.
from sklearn.neighbors import KNeighborsRegressor
# Regression based on k-nearest neighbors.
from sklearn.linear_model import LinearRegression
# Ordinary least squares Linear Regression.
from sklearn.linear_model import RidgeCV
# Ridge regression with built-in cross-validation.
# By default, it performs Generalized Cross-Validation, 
# which is a form of efficient Leave-One-Out cross-validation.

# Load the faces datasets
data = fetch_olivetti_faces()
targets = data.target

data = data.images.reshape((len(data.images), -1))
train = data[targets < 30]
test = data[targets >= 30]  # Test on independent people

# Test on a subset of people
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
# Upper half of the faces
X_train = train[:, :(n_pixels + 1) // 2]
# Lower half of the faces
y_train = train[:, n_pixels // 2:]
X_test = test[:, :(n_pixels + 1) // 2]
y_test = test[:, n_pixels // 2:]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)

# Plot the completed faces
image_shape = (64, 64)

n_cols = 1 + len(ESTIMATORS)
plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
plt.suptitle("Face completion with multi-output estimators", size=16)

for i in range(n_faces):
    true_face = np.hstack((X_test[i], y_test[i]))

    if i:
        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1)
    else:
        sub = plt.subplot(n_faces, n_cols, i * n_cols + 1,
                          title="true faces")

    sub.axis("off")
    sub.imshow(true_face.reshape(image_shape),
               cmap=plt.cm.gray,
               interpolation="nearest")

    for j, est in enumerate(sorted(ESTIMATORS)):
        completed_face = np.hstack((X_test[i], y_test_predict[est][i]))

        if i:
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j)

        else:
            sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j,
                              title=est)

        sub.axis("off")
        sub.imshow(completed_face.reshape(image_shape),
                   cmap=plt.cm.gray,
                   interpolation="nearest")

plt.show()


Result: (for each test face, the true image shown next to the completions produced by each estimator)
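The point of the multi-output setup is that each estimator is fitted with a two-dimensional y (one row per face, one column per lower-half pixel), so `predict` returns a full lower half per test face. A minimal standalone sketch with made-up data, just to show the shapes:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.rand(20, 5)   # 20 samples, 5 input features
Y = rng.rand(20, 3)   # 3 target columns per sample (multi-output)

model = LinearRegression().fit(X, Y)
print(model.predict(X[:2]).shape)  # (2, 3): one row of 3 outputs per prediction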




