sklearn 源碼解析 基本線性模型

from __future__ import division
from abc import ABCMeta, abstractmethod: 有關實現抽象類的方法。
import numbers: 抽象基類層次結構,這些類不可被實例化。
import warnings

import numpy as np
import scipy.sparse as sp
from scipy import linalg
from scipy import sparse

from ..externals import six: 版本有關
from ..externals.joblib import Parallel, delayed: 前者用於並行執行任務,後者用於捕獲函數及其調用參數(延遲到並行時才真正調用)
  EX: [delayed(sqrt)(i ** 3) for i in range(10)]
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin:
from..utils import check_array, check_X_y, deprecated, as_float_array
 deprecated(反對) decorator: 用於在裝飾對象被調用時提供警告提示。
from ..utils.validation import FLOAT_DTYPES:
  (numpy.float64, numpy.float32, numpy.float16)
from ..utils import check_random_state:
from ..utils.extmath import safe_sparse_dot:
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
 CSR CSC 爲不同的稀疏矩陣存儲格式,
 mean_variance_axis:指定計算稀疏矩陣沿某一軸方向的mean variance.
from ..utils.fixes import sparse_lsqr
 對於大型 稀疏 線性系統 或等式 找到最小二乘解。
from ..utils.seq_datasets import ArrayDataset, CSRDataset
 數據集類型 分別指定了表示形式爲二維數組及稀疏矩陣
from ..utils.validation import check_is_fitted:
from ..exceptions import NotFittedError
from ..preprocessing.data import normalize as f_normalize
 正則化變換(只正則不中心),默認axis = 1是針對feature的。


def make_datasets(X, y, sample_weight, random_state=None):
    """Wrap ``X``/``y`` in a sequential dataset object.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Training data. A sparse ``X`` is wrapped in a ``CSRDataset``,
        anything else in an ``ArrayDataset``.
    y : array
        Targets, passed through to the dataset constructor.
    sample_weight : array
        Per-sample weights, passed through to the dataset constructor.
    random_state : None, int or RandomState, optional
        Forwarded to ``check_random_state`` to obtain the generator from
        which the dataset seed is drawn.

    Returns
    -------
    dataset : CSRDataset or ArrayDataset
    intercept_decay : number
        ``SPARSE_INTERCEPT_DECAY`` for sparse input, ``1`` otherwise.
    """
    rng = check_random_state(random_state)
    # The lower bound of 1 guarantees the drawn seed is never 0.
    seed = rng.randint(1, np.iinfo(np.int32).max)

    if sp.issparse(X):
        # NOTE(review): assumes X is already in CSR layout, so data/indptr/
        # indices are the three CSR buffers -- confirm against callers.
        dataset = CSRDataset(X.data, X.indptr, X.indices, y, sample_weight,
                             seed=seed)
        intercept_decay = SPARSE_INTERCEPT_DECAY
    else:
        dataset = ArrayDataset(X, y, sample_weight, seed=seed)
        intercept_decay = 1

    return dataset, intercept_decay

 check_random_state:
  返回RandomState實例。(分None RandomState int三種參數返回實例)
 np.random.randint(low, high=None, size=None):
  返回[low, high)範圍的整數隨機數。


python 的 and 具有與 javascript 類似的短路特性:在計算bool值時,當and的第一個條件不被通過時,不會再計算第二個條件。

(如sparse_center_data center_data...)

def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                     sample_weight=None, return_mean=False):
    """Center and optionally scale the data before fitting a linear model.

    Parameters
    ----------
    X : array or scipy sparse matrix (CSR/CSC accepted)
        Design matrix; validated through ``check_array`` with float dtypes.
    y : array
        Targets, 1-D or 2-D.
    fit_intercept : bool
        When false, nothing is centered or scaled and neutral offsets
        (zeros) and scales (ones) are returned.
    normalize : bool, default False
        When true (and ``fit_intercept`` is true), columns of ``X`` are
        additionally scaled and the scale factors are returned.
    copy : bool, default True
        Forwarded to ``check_array``.
    sample_weight : array, number or None
        Weights used for the averages of dense ``X`` and of ``y``. A scalar
        weight is treated as no weighting.
    return_mean : bool, default False
        Only relevant for sparse ``X``: when false, the returned ``X_offset``
        is all zeros instead of the column means.

    Returns
    -------
    X, y, X_offset, y_offset, X_scale
    """
    # A single scalar weight scales every sample equally, so it has no
    # effect on the weighted averages below.
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None

    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)

    if fit_intercept:
        if sp.issparse(X):
            # Sparse X is never centered in place here; only the column
            # statistics are computed.
            X_offset, X_var = mean_variance_axis(X, axis=0)
            if not return_mean:
                X_offset = np.zeros(X.shape[1])

            if normalize:
                # Turn the per-column variance into sqrt(n * var), reusing
                # the variance buffer as the output of the square root.
                X_var *= X.shape[0]
                X_scale = np.sqrt(X_var, X_var)
                del X_var
                # Avoid dividing by zero for constant columns.
                X_scale[X_scale == 0] = 1
                inplace_column_scale(X, 1. / X_scale)
            else:
                X_scale = np.ones(X.shape[1])
        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)
            X -= X_offset
            if normalize:
                X, X_scale = f_normalize(X, axis=0, copy=False,
                                         return_norm=True)
            else:
                X_scale = np.ones(X.shape[1])
        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1])
        X_scale = np.ones(X.shape[1])
        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale


 check_array: 實現將數據形式轉化爲ndarray的過程。


 inplace_column_scale 可以看到稀疏矩陣的歸一化方法是直接用由方差得到的縮放係數(即中心化後該列 L2 範數的倒數):
  1 / sqrt(n * var)


def _rescale_data(X, y, sample_weight):
    """Scale each row of ``X`` and ``y`` by the square root of its weight.

    ``sample_weight`` may be a scalar or one value per row; it is expanded
    to length ``X.shape[0]``, square-rooted and placed on the diagonal of a
    sparse matrix that is left-multiplied onto both ``X`` and ``y``.

    Returns the rescaled ``(X, y)`` pair.
    """
    n_rows = X.shape[0]
    # Multiplying by a ones vector expands a scalar weight to one per row.
    root_weights = np.sqrt(sample_weight * np.ones(n_rows))
    row_scaler = sparse.dia_matrix((root_weights, 0),
                                   shape=(n_rows, n_rows))
    return safe_sparse_dot(row_scaler, X), safe_sparse_dot(row_scaler, y)

 sparse.dia_matrix: 生成對角稀疏矩陣

class LinearModel(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for linear models.

    Subclasses provide ``fit`` and are expected to set ``coef_`` and
    ``intercept_``; prediction is the affine map
    ``X . coef_.T + intercept_``.
    """

    # NOTE(review): ABCMeta is the metaclass and ``abstractmethod`` is
    # imported by this module, so ``fit`` was presumably meant to carry
    # ``@abstractmethod`` -- confirm before adding it.
    def fit(self, X, y):
        """Fit model."""

    def _decision_function(self, X):
        """Return ``X . coef_.T + intercept_`` for a fitted model."""
        check_is_fitted(self, "coef_")
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return safe_sparse_dot(X, self.coef_.T,
                               dense_output=True) + self.intercept_

    def predict(self, X):
        """Predict using the linear model; see ``_decision_function``."""
        return self._decision_function(X)

    # Expose the module-level helper on the class so that ``fit``
    # implementations can call ``self._preprocess_data(...)``.
    _preprocess_data = staticmethod(_preprocess_data)

    def _set_intercept(self, X_offset, y_offset, X_scale):
        """Undo the preprocessing scale on ``coef_`` and set ``intercept_``.

        With ``fit_intercept`` enabled the coefficients are divided by
        ``X_scale`` and the intercept becomes
        ``y_offset - X_offset . coef_.T``; otherwise the intercept is 0.
        """
        if self.fit_intercept:
            self.coef_ = self.coef_ / X_scale
            self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
        else:
            self.intercept_ = 0.

 six.with_metaclass(ABCMeta, BaseEstimator):


class LinearClassifierMixin(ClassifierMixin):
    """Mixin providing prediction helpers for linear classifiers.

    Relies on the host estimator exposing ``coef_``, ``intercept_`` and
    ``classes_`` once fitted.
    """

    def decision_function(self, X):
        """Compute the signed scores ``X . coef_.T + intercept_``.

        Returns
        -------
        scores : array
            1-D when ``coef_`` has a single row, otherwise one column per
            row of ``coef_``.

        Raises
        ------
        NotFittedError
            If ``coef_`` is missing or ``None``.
        ValueError
            If ``X`` does not have ``coef_.shape[1]`` columns.
        """
        if not hasattr(self, 'coef_') or self.coef_ is None:
            raise NotFittedError("This %(name)s instance is not fitted "
                                 "yet" % {"name": type(self).__name__})
        X = check_array(X, accept_sparse='csr')

        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        scores = safe_sparse_dot(X, self.coef_.T,
                                 dense_output=True) + self.intercept_
        # Flatten the single-column case; ``predict`` and
        # ``_predict_proba_lr`` both branch on a 1-D result.
        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict(self, X):
        """Return the entry of ``classes_`` selected by the scores."""
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            # 1-D scores: a positive score picks index 1, otherwise index 0.
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Probability estimates from the logistic function of the scores.

        1-D scores give a two-column array ``[1 - p, p]``. Otherwise each
        row of per-class sigmoid values is normalised to sum to one.
        """
        prob = self.decision_function(X)
        # In-place evaluation of 1 / (1 + exp(-score)).
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            prob /= prob.sum(axis=1).reshape([prob.shape[0], -1])
            return prob

  np.exp, np.reciprocal(倒數變換),這裏多次使用out作爲第二參數簡化。

class SparseCoefMixin(object):
    """Mixin for switching a fitted ``coef_`` between dense and sparse."""

    def densify(self):
        """Replace a sparse ``coef_`` with its dense array; return self.

        A ``coef_`` that is not sparse is left untouched.
        """
        check_is_fitted(
            self, "coef_",
            msg="Estimator, %(name)s, must be fitted before densifying.")
        if not sp.issparse(self.coef_):
            return self
        self.coef_ = self.coef_.toarray()
        return self

    def sparsify(self):
        """Replace ``coef_`` with a CSR matrix built from it; return self."""
        check_is_fitted(
            self, "coef_",
            msg="Estimator, %(name)s, must be fitted before sparsifying.")
        self.coef_ = sp.csr_matrix(self.coef_)
        return self


class LinearRegression(LinearModel, RegressorMixin):
    """Ordinary least squares linear regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        Passed to ``_preprocess_data`` and read by ``_set_intercept``.
    normalize : bool, default False
        Passed to ``_preprocess_data``.
    copy_X : bool, default True
        Passed to ``_preprocess_data`` as ``copy``.
    n_jobs : int, default 1
        Number of jobs for ``Parallel``; only used for sparse ``X`` with a
        2-D ``y``.
    """

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=1):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs

    def fit(self, X, y, sample_weight=None):
        """Fit the linear model and return ``self``.

        Sets ``coef_`` and ``intercept_`` and stores the solver residuals
        on ``_residues``; the dense path also sets ``rank_`` and
        ``singular_``.

        Raises
        ------
        ValueError
            If ``sample_weight`` has more than one dimension.
        """
        n_jobs_ = self.n_jobs
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True, multi_output=True)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=sample_weight)

        if sample_weight is not None:
            # Weighted least squares via rescaling of the rows of X and y.
            X, y = _rescale_data(X, y, sample_weight)

        if sp.issparse(X):
            if y.ndim < 2:
                out = sparse_lsqr(X, y)
                self.coef_ = out[0]
                self._residues = out[3]
            else:
                # The sparse solver is called once per target column, so
                # the columns are solved independently (optionally in
                # parallel) and stacked afterwards.
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(sparse_lsqr)(X, y[:, j].ravel())
                    for j in range(y.shape[1]))
                # np.vstack needs a real sequence, not a generator.
                self.coef_ = np.vstack([out[0] for out in outs])
                self._residues = np.vstack([out[3] for out in outs])
        else:
            self.coef_, self._residues, self.rank_, self.singular_ = \
                linalg.lstsq(X, y)
            self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self

  check_X_y: 僅僅是進行形式檢查。






