sklearn 源碼解析基本線性模型 base.py

from __future__ import division
from abc import ABCMeta, abstractmethod: 有關實現抽象類的方法。
import numbers: 抽象基類層次結構，這些類不可被實例化。
import warnings

import numpy as np
import scipy.sparse as sp
from scipy import linalg
from scipy import sparse

from ..externals import six: 版本有關
from ..externals.joblib import Parallel, delayed:前者操縱多線程，後者用於捕獲
      參數：
  EX:[delayed(sqrt)(i ** 3)for i in range(10)]
   要應用前者必須用後者生成捕獲參數的形式。
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin:
這裏Mixin類是爲了解決多重繼承的問題提出的，目的是爲派生類提供
可用的接口，而又不需要繁複的繼承，即Mixin類無實例化意義但有方法意義。
from..utils import check_array, check_X_y, deprecated, as_float_array
deprecated(反對) decorator: 用於在裝飾對象被調用時提供警告提示。
from ..utils.validation import FLOAT_DTYPES:
(numpy.float64, numpy.float32, numpy.float16)
from ..utils import check_random_state:
對設定的隨機數狀態進行更新。
from ..utils.extmath import safe_sparse_dot:
正確處理稀疏矩陣的點乘函數，可見矩陣乘法的快速計算並非通過定義完成。
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
CSR CSC 爲不同的稀疏矩陣存儲格式，
mean_variance_axis:指定計算稀疏矩陣沿某一軸方向的mean variance.
inplace_column_scale:對數據陣的列實現scale指定的列乘積變換。
from ..utils.fixes import sparse_lsqr
對於大型稀疏線性系統或等式找到最小二乘解。
from ..utils.seq_datasets import ArrayDataset, CSRDataset
數據集類型分別指定了表示形式爲二維數組及稀疏矩陣
from ..utils.validation import check_is_fitted:
看模型與所提供的參數是否一致。
from ..exceptions import NotFittedError
from ..preprocessing.data import normalize as f_normalize
正則化變換(只正則不中心)，默認axis = 1是針對feature的。

# Intercept decay factor that make_datasets returns for sparse input
# (dense input gets a decay of 1 instead).
SPARSE_INTERCEPT_DECAY = 0.01
對於稀疏數據的相應常數列使用此參數進行調整，避免震盪。（。。。）

def make_datasets(X, y, sample_weight, random_state=None):
    """Wrap ``X`` and ``y`` in a dataset object and choose the intercept decay.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Training data. Sparse input is read through its ``data``,
        ``indptr`` and ``indices`` attributes.
    y : array
        Target values, forwarded unchanged to the dataset constructor.
    sample_weight : array
        Per-sample weights, forwarded unchanged to the dataset constructor.
    random_state : optional
        Passed to ``check_random_state`` to obtain the generator from which
        the dataset seed is drawn.

    Returns
    -------
    dataset : ArrayDataset or CSRDataset
        ``CSRDataset`` when ``X`` is sparse, ``ArrayDataset`` otherwise.
    intercept_decay : number
        ``SPARSE_INTERCEPT_DECAY`` for sparse ``X``, ``1`` for dense ``X``.
    """
    # Draw one integer seed in [1, int32 max) for the dataset object.
    seed = check_random_state(random_state).randint(1, np.iinfo(np.int32).max)

    if not sp.issparse(X):
        return ArrayDataset(X, y, sample_weight, seed=seed), 1

    csr_dataset = CSRDataset(X.data, X.indptr, X.indices, y, sample_weight,
                             seed=seed)
    return csr_dataset, SPARSE_INTERCEPT_DECAY

make_dataset:
check_random_state:
  返回RandomState實例。（分None RandomState int三種參數返回實例）
np.iinfo():
  返回指定整數類型在本機上的取值範圍（如最小值、最大值）。
np.random.randint(low, high=None, size=None):
  返回[low, high)範圍的整數隨機數。
CSRDataset（。。。）
ArrayDataset（。。。）

此函數的目的爲對於稀疏或非稀疏數組，轉化成相應的datasets及返回相應的常數調整因子。

python and 具有javascript類似的特性，在計算bool值時當and第一個條件不被通過時
不會計算第二個條件，故在設定第二個條件時可以假定第一個條件成立。
甚至函數的形參也可以改變。

刪去一些即將被刪去的或即將被替換的方法。
（如sparse_center_data center_data...）

def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                     sample_weight=None, return_mean=False):
    """Center and optionally scale ``X`` and ``y`` before a linear fit.

    Parameters
    ----------
    X : array or sparse matrix (csr / csc)
        Training data; validated with ``check_array`` (copied if ``copy``).
    y : array
        Target values.
    fit_intercept : bool
        When false, nothing is centered or scaled; zero offsets and a unit
        scale are returned.
    normalize : bool, default False
        Only used when ``fit_intercept`` is true. Scales the columns of
        ``X`` (see Notes).
    copy : bool, default True
        Forwarded to ``check_array``.
    sample_weight : array, number or None, default None
        Weights for the averages of dense ``X`` and of ``y``. A scalar
        weight is treated as ``None``.
    return_mean : bool, default False
        For sparse ``X`` only: return the column means as ``X_offset``
        instead of zeros. Sparse ``X`` itself is never centered.

    Returns
    -------
    X, y, X_offset, y_offset, X_scale

    Notes
    -----
    With ``fit_intercept`` true:

    * sparse ``X``: columns are scaled in place by ``1 / sqrt(n * var)``
      when ``normalize`` is set; columns with zero scale are left as is.
    * dense ``X``: the (weighted) column mean is subtracted, then
      ``f_normalize`` along axis 0 is applied when ``normalize`` is set.
    * ``y`` is centered by its (weighted) mean.
    """
    # A scalar weight is the same for every sample, so it does not change
    # the averages; drop it.
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None

    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)

    if fit_intercept:
        if sp.issparse(X):
            X_offset, X_var = mean_variance_axis(X, axis=0)
            if not return_mean:
                X_offset = np.zeros(X.shape[1])

            if normalize:
                X_var *= X.shape[0]
                # Square root computed in place: X_scale shares X_var's buffer.
                X_scale = np.sqrt(X_var, X_var)
                del X_var
                # Avoid dividing by zero for constant columns.
                X_scale[X_scale == 0] = 1
                inplace_column_scale(X, 1. / X_scale)
            else:
                X_scale = np.ones(X.shape[1])
        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)
            X -= X_offset
            if normalize:
                X, X_scale = f_normalize(X, axis=0, copy=False,
                                         return_norm=True)
            else:
                X_scale = np.ones(X.shape[1])
        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1])
        X_scale = np.ones(X.shape[1])
        # Fixed: the original referenced the undefined name ``x`` here,
        # raising NameError for multi-output ``y``.
        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale

_preprocess_data:
正則化過程：當數據陣爲稀疏時不會對X中心化（但normalize=True時仍然會按列縮放）；
當fit_intercept=False時既不中心化也不縮放，直接返回零偏移與單位縮放。
稠密情形下X的均值以及y的均值可以使用樣本加權係數進行加權。
sample_weight（。。。）
numbers.Number爲python中數的抽象基類abc.ABCMeta

check_array: 實現將數據形式轉化爲ndarray的過程。

np.sqrt可以輸入第二個參數，但要求元素具有相同類型，作爲輸出到的數組。

inplace_column_scale 可以看到稀疏矩陣的正則化方法是將每一列乘以
$1 / \sqrt{n \cdot \mathrm{var}}$，即除以標準差的 $\sqrt{n}$ 倍（該列中心化後的L2範數）。

正則化僅僅對於X進行，y僅中心化。

def _rescale_data(X, y, sample_weight):
    """Scale every row of ``X`` and ``y`` by the square root of its weight.

    Parameters
    ----------
    X : array or sparse matrix
        Data whose first dimension is the number of samples.
    y : array
        Targets with the same number of rows as ``X``.
    sample_weight : array or number
        Per-sample weights; multiplying by a vector of ones expands a
        scalar to one weight per sample.

    Returns
    -------
    X, y
        Both left-multiplied by ``diag(sqrt(sample_weight))``.
    """
    row_count = X.shape[0]
    root_weights = np.sqrt(sample_weight * np.ones(row_count))
    # Sparse diagonal matrix holding the root weights on the main diagonal.
    weight_diag = sparse.dia_matrix((root_weights, 0),
                                    shape=(row_count, row_count))
    scaled_X = safe_sparse_dot(weight_diag, X)
    scaled_y = safe_sparse_dot(weight_diag, y)
    return scaled_X, scaled_y

_rescale_data:
利用矩陣乘法對數據進行放縮。
sparse.dia_matrix: 生成對角稀疏矩陣

class LinearModel(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Abstract base class for linear models.

    Subclasses must implement ``fit`` and are expected to set ``coef_`` and
    ``intercept_``; ``_set_intercept`` additionally reads
    ``self.fit_intercept``.
    """

    @abstractmethod
    def fit(self, X, y):
        """Fit model."""

    def _decision_function(self, X):
        """Return ``X . coef_.T + intercept_`` as a dense result.

        Raises whatever ``check_is_fitted`` raises when ``coef_`` is missing.
        """
        check_is_fitted(self, "coef_")
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_

    def predict(self, X):
        """Predict using the linear model; same value as the decision function."""
        return self._decision_function(X)

    # Expose the module-level helper on the class without binding ``self``.
    # Fixed: this line must live inside the class body; at column 0 it ended
    # the class and left ``_set_intercept`` as an indentation error.
    _preprocess_data = staticmethod(_preprocess_data)

    def _set_intercept(self, X_offset, y_offset, X_scale):
        """Undo the preprocessing scale on ``coef_`` and set ``intercept_``.

        Parameters
        ----------
        X_offset, y_offset, X_scale
            The offsets and scale returned by ``_preprocess_data``.
        """
        if self.fit_intercept:
            # Coefficients were fitted on scaled columns; map them back.
            self.coef_ = self.coef_ / X_scale
            self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
        else:
            self.intercept_ = 0.

LinearModel:
six.with_metaclass(ABCMeta, BaseEstimator):
使用ABCMeta產生BaseEstimator的抽象基類。
這裏使用abstractmethod是一種標註，ABCMeta生成的虛基類在
所有abstractmethod被重載前不能實例化。

_decision_function:
predict:
返回預測值向量.

staticmethod爲靜態方法轉換函數，可以將類內或類外的函數轉換爲靜態方法。

這裏靜態是指不需要將類實例化就可以調用的方法（與一般的類內成員函數不同，靜態方法沒有self參數）。

_set_intercept:
設定參數的函數定義說明，當擬合常數項時其將求解與
變換相分離，對變換後的正則化數據進行求解，與原解相同。

class LinearClassifierMixin(ClassifierMixin):
    """Mixin providing prediction methods for linear classifiers.

    The host class is expected to provide ``coef_`` (2-D, one row per
    score column), ``intercept_`` and ``classes_``.
    """

    def decision_function(self, X):
        """Return confidence scores ``X . coef_.T + intercept_``.

        Returns
        -------
        scores : array
            Flattened to 1-D when there is a single score column, otherwise
            2-D with one column per row of ``coef_``.

        Raises
        ------
        NotFittedError
            If ``coef_`` is missing or ``None``.
        ValueError
            If ``X`` does not have ``coef_.shape[1]`` columns.
        """
        # Fixed: ``in None`` -> ``is None``; ``type(self), __name__`` ->
        # ``type(self).__name__``.
        if not hasattr(self, 'coef_') or self.coef_ is None:
            raise NotFittedError("This %(name)s instance is not fitted "
                                 "yet" % {"name": type(self).__name__})
        X = check_array(X, accept_sparse='csr')

        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
        # Fixed: the scores were computed but never returned. ``predict``
        # branches on 1-D scores, so the single-column case is flattened.
        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict(self, X):
        """Return the entry of ``classes_`` chosen for each sample in ``X``.

        1-D scores are thresholded at 0; 2-D scores pick the arg-max column.
        """
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            # ``int`` replaces the removed ``np.int`` alias (same dtype).
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Turn decision scores into probabilities with the logistic function.

        Computes ``1 / (1 + exp(-score))`` in place. 1-D scores yield two
        columns ``[1 - p, p]``; 2-D scores are normalized so each row sums
        to one.
        """
        prob = self.decision_function(X)
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            return prob

LinearClassifierMixin:
由於這是一個Mixin類，故包含將要派生的類的混合多種方法。
這裏y應當是以0爲中心判別超平面的符號距離度量。
從返回值來看，其具有y特徵非一維的可能。

decision_function:
  與線性模型決定函數相同。

predict:
  當y爲正常意義的n_samples向量時，由閾值0來進行分類判斷，
  否則爲多類問題，按分類向量最大值來判斷。（幾何意義上的）
  返回分類。

_predict_proba_lr:
  np.exp, np.reciprocal(倒數變換),這裏多次使用out作爲第二參數簡化。
  返回兩類別或多類別概率。(歸一化)

class SparseCoefMixin(object):
    """Mixin for switching ``coef_`` between dense and sparse storage."""

    def densify(self):
        """Store ``coef_`` as a dense array if it is currently sparse.

        Returns ``self``.
        """
        check_is_fitted(
            self, "coef_",
            msg="Estimator, %(name)s, must be fitted before densifying.")
        if not sp.issparse(self.coef_):
            return self
        self.coef_ = self.coef_.toarray()
        return self

    def sparsify(self):
        """Store ``coef_`` as a scipy CSR matrix.

        Returns ``self``.
        """
        check_is_fitted(
            self, "coef_",
            msg="Estimator, %(name)s, must be fitted before sparsifying.")
        csr_coef = sp.csr_matrix(self.coef_)
        self.coef_ = csr_coef
        return self

SparseCoefMixin:
係數密集化或稀疏化（僅僅是形式轉換）

densify:
密集化
sparsify:
稀疏化

class LinearRegression(LinearModel, RegressorMixin):
    """Ordinary least squares linear regression.

    Parameters
    ----------
    fit_intercept : bool, default True
        Forwarded to ``_preprocess_data`` and read by ``_set_intercept``.
    normalize : bool, default False
        Forwarded to ``_preprocess_data``.
    copy_X : bool, default True
        Forwarded to ``_preprocess_data`` as ``copy``.
    n_jobs : int, default 1
        Number of jobs for the per-target sparse solves (multi-output ``y``
        with sparse ``X`` only).
    """

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=1):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs

    def fit(self, X, y, sample_weight=None):
        """Fit the linear model.

        Parameters
        ----------
        X : array or sparse matrix (csr / csc / coo)
            Training data.
        y : array, 1-D or 2-D
            Target values.
        sample_weight : array, number or None, default None
            Per-sample weights; must be scalar or 1-D.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``sample_weight`` has more than one dimension.
        """
        n_jobs_ = self.n_jobs
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True, multi_output=True)

        # Fixed: ``np.atleast_id`` does not exist; the function is atleast_1d.
        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        # Fixed: _preprocess_data returns five values; ``y_offset`` was
        # missing from the unpacking although it is used below.
        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=sample_weight)

        if sample_weight is not None:
            X, y = _rescale_data(X, y, sample_weight)

        if sp.issparse(X):
            if y.ndim < 2:
                out = sparse_lsqr(X, y)
                self.coef_ = out[0]
                # Same attribute name as the other branches (was ``_residues``).
                self._residuals = out[3]
            else:
                # sparse_lsqr solves for one target vector at a time, so each
                # column of y is solved independently, possibly in parallel.
                # Fixed: the generator clause belongs outside the
                # ``delayed(sparse_lsqr)(...)`` argument list.
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(sparse_lsqr)(X, y[:, j].ravel())
                    for j in range(y.shape[1]))
                # np.vstack needs a sequence, not a generator.
                self.coef_ = np.vstack([out[0] for out in outs])
                self._residuals = np.vstack([out[3] for out in outs])
        else:
            self.coef_, self._residuals, self.rank_, self.singular_ = \
                linalg.lstsq(X, y)
            self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self

LinearRegression:
fit:
  check_X_y: 僅僅是進行形式檢查。

  這裏值得注意的是sparse_lsqr不能用於求解y爲二維的問題，
  轉化爲對每一個列求解，且由於列間求解獨立，採用了
  Parallel(delayed)這種並行求解方式加快求解速度。

  普通最小二乘求解使用linalg.lstsq

  其他細節在上面的討論中都提到了。

估計模型返回self對象。

斯溫jack

發佈了28 篇原創文章 · 獲贊 23 · 訪問量 18萬+

私信關注

sklearn 源碼解析基本線性模型 base.py

Cython 初探及關於性能提升的初步討論

Theano 初探（一）

Scipy Lecture Notes（一）

sklearn 源碼解析基本線性模型 base.py

重拾C++ 順序容器

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

sklearn 源碼解析 基本線性模型 base.py

sklearn 源碼解析基本線性模型 base.py