sklearn source code walkthrough: basic linear models, ridge regression, ridge.py (1)

Classes and details already covered in earlier posts are not repeated here. A working knowledge of sparse matrices is assumed.
from abc import ABCMeta, abstractmethod
import warnings

import numpy as np
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg

from .base import LinearClassifierMixin, LinearModel, _rescale_data
from .sag import sag_solver  # stochastic average gradient (SAG) solver for ridge and
                             # logistic regression; a gradient-descent variant that
                             # converges quickly
from ..base import RegressorMixin
from ..utils.extmath import safe_sparse_dot
from ..utils.extmath import row_norms  # row-wise Euclidean norms (also handles CSR input)
from ..utils import check_X_y
from ..utils import check_array  # validates input and converts it to an ndarray
from ..utils import check_consistent_length  # checks that every array in a list has the
                                             # same first dimension (i.e. same length)
from ..utils import column_or_1d  # special ravel helper: accepts a feature-like ndarray
                                  # whose second dimension has size 1 and flattens it
                                  # column-wise into a 1-D array
from ..preprocessing import LabelBinarizer  # binarizes labels for one-vs-rest problems
from ..model_selection import GridSearchCV
from ..externals import six
from ..metrics.scorer import check_scoring  # for models that can produce score
                                            # estimates, returns the function used to
                                            # compute the score


Before the concrete classes, here is a rundown of the helper interfaces used below (in no particular order); a short sketch of how they combine follows the list.
sp_linalg.aslinearoperator: wraps an object (ndarray, sparse matrix, matrix, and so on) as a LinearOperator.
np.empty(shape): returns an uninitialized ndarray of the given shape.
sp_linalg.cg: solves a linear system with the conjugate gradient (CG) method.
sp_linalg.lsqr: iterative least-squares solver (the LSQR algorithm); its damp parameter adds a ridge-style penalty.
ndarray.flat: a flat, 1-D iterator view over the array (used below to address the diagonal).
np.atleast_1d: promotes scalars to 1-D arrays; higher-dimensional inputs pass through unchanged.
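
A minimal standalone sketch (variable names chosen here for illustration) of how aslinearoperator, LinearOperator, and cg combine to solve a regularized normal-equations system without ever forming X^T X densely:

import numpy as np
from scipy.sparse import linalg as sp_linalg

rng = np.random.RandomState(0)
X = rng.randn(5, 3)
X_op = sp_linalg.aslinearoperator(X)   # only matvec/rmatvec, no dense copy

alpha = 1.0
# operator computing (X^T X + alpha * I) v, symmetric positive definite
C = sp_linalg.LinearOperator(
    (3, 3),
    matvec=lambda v: X_op.rmatvec(X_op.matvec(v)) + alpha * v,
    dtype=X.dtype)

b = X.T.dot(rng.randn(5))
w, info = sp_linalg.cg(C, b, tol=1e-10)   # info == 0 means convergence
print(w, info)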

def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0):
    n_samples, n_features = X.shape
    X1 = sp_linalg.aslinearoperator(X)
    coefs = np.empty((y.shape[1], n_features))

    if n_features > n_samples:
        def create_mv(curr_alpha):
            def _mv(x):
                # operator for the dual system: X X^T + alpha * I
                return X1.matvec(X1.rmatvec(x)) + curr_alpha * x
            return _mv
    else:
        def create_mv(curr_alpha):
            def _mv(x):
                # operator for the primal (normal-equations) system: X^T X + alpha * I
                return X1.rmatvec(X1.matvec(x)) + curr_alpha * x
            return _mv

    for i in range(y.shape[1]):
        y_column = y[:, i]

        mv = create_mv(alpha[i])
        if n_features > n_samples:
            # kernel ridge: solve (X X^T + alpha*I) c = y, then map back
            # to feature space with X^T
            C = sp_linalg.LinearOperator(
                (n_samples, n_samples), matvec=mv, dtype=X.dtype)
            coef, info = sp_linalg.cg(C, y_column, tol=tol)
            coefs[i] = X1.rmatvec(coef)
        else:
            # linear ridge: solve (X^T X + alpha*I) w = X^T y
            y_column = X1.rmatvec(y_column)
            C = sp_linalg.LinearOperator(
                (n_features, n_features), matvec=mv, dtype=X.dtype)
            coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter,
                                          tol=tol)

        if info < 0:
            raise ValueError("Failed with error code %d" % info)

        if max_iter is None and info > 0 and verbose:
            warnings.warn("sparse_cg did not converge after %d iterations."
                          % info)
    return coefs



_solve_sparse_cg:
 Here the matrix inversion that appears in the closed-form ridge solution is
 recast as solving a linear system.
 The linear operators involved only define transforms of the matrix: matvec is
 the matrix-vector product, rmatvec the conjugate-transpose (adjoint) product.
 The function branches on the relative size of n_features and n_samples; the two
 formulations can be shown to give the same result once the inverses are taken
 (see Kernel Ridge Regression.pdf).
 Returns the coefficients.
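
The equivalence mentioned above is the identity (X^T X + alpha*I)^{-1} X^T y = X^T (X X^T + alpha*I)^{-1} y, i.e. the primal and dual (kernel) ridge solutions coincide. A quick numerical check (illustrative, not part of ridge.py):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(4, 7)   # n_features > n_samples
y = rng.randn(4)
alpha = 0.5

# primal: a 7x7 solve in feature space
primal = np.linalg.solve(X.T.dot(X) + alpha * np.eye(7), X.T.dot(y))
# dual: a 4x4 solve in sample space, mapped back via X^T
dual = X.T.dot(np.linalg.solve(X.dot(X.T) + alpha * np.eye(4), y))

assert np.allclose(primal, dual)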

def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3):
    n_samples, n_features = X.shape
    coefs = np.empty((y.shape[1], n_features))
    n_iter = np.empty(y.shape[1], dtype=np.int32)

    # lsqr penalizes damp^2 * ||w||^2, so damp = sqrt(alpha) gives ridge
    sqrt_alpha = np.sqrt(alpha)

    for i in range(y.shape[1]):
        y_column = y[:, i]
        info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i],
                              atol=tol, btol=tol, iter_lim=max_iter)
        coefs[i] = info[0]    # solution vector
        n_iter[i] = info[2]   # iteration count

    return coefs, n_iter



_solve_lsqr:
 Returns the damped least-squares solution of the linear system for each target,
 together with the iteration counts.
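
Because lsqr with damping d minimizes ||X w - y||^2 + d^2 ||w||^2, passing damp = sqrt(alpha) solves exactly the ridge objective. A quick check (illustrative, not part of ridge.py):

import numpy as np
from scipy.sparse import linalg as sp_linalg

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
y = rng.randn(20)
alpha = 2.0

w_lsqr = sp_linalg.lsqr(X, y, damp=np.sqrt(alpha))[0]
w_exact = np.linalg.solve(X.T.dot(X) + alpha * np.eye(5), X.T.dot(y))
assert np.allclose(w_lsqr, w_exact, atol=1e-6)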

def _solve_cholesky(X, y, alpha):
    n_samples, n_features = X.shape
    n_targets = y.shape[1]

    A = safe_sparse_dot(X.T, X, dense_output=True)
    Xy = safe_sparse_dot(X.T, y, dense_output=True)

    one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]])

    if one_alpha:
        # same penalty for every target: add alpha to the diagonal once and
        # solve all targets in a single call
        A.flat[::n_features + 1] += alpha[0]
        return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
    else:
        # one penalty per target: solve each system separately, restoring
        # the diagonal of A between iterations
        coefs = np.empty([n_targets, n_features])
        for coef, target, curr_alpha in zip(coefs, Xy.T, alpha):
            A.flat[::n_features + 1] += curr_alpha
            coef[:] = linalg.solve(A, target, sym_pos=True,
                                   overwrite_a=False).ravel()
            A.flat[::n_features + 1] -= curr_alpha
        return coefs



_solve_cholesky:
 Since sym_pos=True is passed to linalg.solve, the matrix is declared symmetric
 positive definite, and the system is solved via a Cholesky factorization
 (decomposition into a triangular matrix times its transpose).
 A.flat[::n_features + 1] is shorthand for the diagonal elements.
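
Why A.flat[::n_features + 1] addresses the diagonal: .flat views the row-major matrix as one long vector, and consecutive diagonal entries are exactly n_features + 1 positions apart. A tiny illustration:

import numpy as np

A = np.zeros((3, 3))
A.flat[::3 + 1] += 5.0   # step of n_features + 1 lands on the diagonal
print(A)
# [[5. 0. 0.]
#  [0. 5. 0.]
#  [0. 0. 5.]]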

def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):
    n_samples = K.shape[0]
    n_targets = y.shape[1]

    if copy:
        K = K.copy()

    alpha = np.atleast_1d(alpha)
    one_alpha = (alpha == alpha[0]).all()
    has_sw = isinstance(sample_weight, np.ndarray) \
        or sample_weight not in [1.0, None]

    if has_sw:
        # rescale the problem by sqrt(sample_weight) on both sides
        sw = np.sqrt(np.atleast_1d(sample_weight))
        y = y * sw[:, np.newaxis]
        K *= np.outer(sw, sw)

    if one_alpha:
        # same penalty for every target: one factorization for all targets
        K.flat[::n_samples + 1] += alpha[0]
        try:
            dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
        except np.linalg.LinAlgError:
            warnings.warn("Singular matrix in solving dual problem. Using "
                          "least-squares solution instead.")
            dual_coef = linalg.lstsq(K, y)[0]

        # K was modified in place; restore its diagonal
        K.flat[::n_samples + 1] -= alpha[0]

        if has_sw:
            dual_coef *= sw[:, np.newaxis]

        return dual_coef
    else:
        # one penalty per target: solve each system separately
        dual_coefs = np.empty([n_targets, n_samples])

        for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha):
            K.flat[::n_samples + 1] += current_alpha
            dual_coef[:] = linalg.solve(K, target, sym_pos=True,
                                        overwrite_a=False).ravel()
            K.flat[::n_samples + 1] -= current_alpha

        if has_sw:
            dual_coefs *= sw[np.newaxis, :]

        return dual_coefs.T



_solve_cholesky_kernel:
 This function additionally supports per-sample weights; the only other
 difference from _solve_cholesky is that the kernel (Gram) matrix K must be
 supplied by the caller.
 It also falls back to the least-squares solution linalg.lstsq when the exact
 solve linalg.solve fails on a singular matrix.
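
A sketch of how a caller uses the dual solution (illustrative names; linear kernel only): with K = X X^T, the primal coefficients are recovered as w = X^T dual_coef, matching the direct primal solve.

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(5, 8)        # n_features > n_samples
y = rng.randn(5, 1)
alpha = 0.3

K = X.dot(X.T)             # linear kernel (Gram) matrix
K.flat[::5 + 1] += alpha   # K + alpha*I via the diagonal trick
dual_coef = linalg.solve(K, y, sym_pos=True)

w = X.T.dot(dual_coef)     # map back to feature space
w_primal = np.linalg.solve(X.T.dot(X) + alpha * np.eye(8), X.T.dot(y))
assert np.allclose(w, w_primal)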

def _solve_svd(X, y, alpha):
    U, s, Vt = linalg.svd(X, full_matrices=False)
    idx = s > 1e-15                 # same cutoff as scipy.linalg.pinv
    s_nnz = s[idx][:, np.newaxis]
    UTy = np.dot(U.T, y)
    d = np.zeros((s.size, alpha.size))
    d[idx] = s_nnz / (s_nnz ** 2 + alpha)   # filter factors s_i / (s_i^2 + alpha)
    d_UT_y = d * UTy
    return np.dot(Vt.T, d_UT_y).T



_solve_svd:
 This solves the same ridge problem as above, but first decomposes X by SVD and
 evaluates the closed-form solution expressed through the singular values.
 Singular values below the threshold 1e-15 are discarded (the same cutoff
 scipy.linalg.pinv uses when computing the pseudo-inverse).
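
A consistency check (illustrative) that the SVD expression w = V diag(s_i / (s_i^2 + alpha)) U^T y equals the normal-equations solution:

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(10, 4)
y = rng.randn(10)
alpha = 1.5

U, s, Vt = linalg.svd(X, full_matrices=False)
d = s / (s ** 2 + alpha)     # the ridge "filter factors"
w_svd = Vt.T.dot(d * U.T.dot(y))

w_direct = np.linalg.solve(X.T.dot(X) + alpha * np.eye(4), X.T.dot(y))
assert np.allclose(w_svd, w_direct)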

def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                     max_iter=None, tol=1e-3, verbose=0, random_state=None,
                     return_n_iter=False, return_intercept=False):
    if return_intercept and sparse.issparse(X) and solver != 'sag':
        if solver != 'auto':
            warnings.warn("In Ridge, only 'sag' solver can currently fit the "
                          "intercept when X is sparse. Solver has been "
                          "automatically changed into 'sag'.")
        solver = 'sag'

    # SAG needs X in C order and y in Fortran order
    if solver == 'sag':
        X = check_array(X, accept_sparse=['csr'],
                        dtype=np.float64, order='C')
        y = check_array(y, dtype=np.float64, ensure_2d=False, order='F')
    else:
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64)
        y = check_array(y, dtype='numeric', ensure_2d=False)

    check_consistent_length(X, y)
    n_samples, n_features = X.shape

    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape
    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None

    if solver == 'auto':
        # cholesky if possible, otherwise conjugate gradient
        if not sparse.issparse(X) or has_sw:
            solver = 'cholesky'
        else:
            solver = 'sparse_cg'

    elif solver == 'lsqr' and not hasattr(sp_linalg, 'lsqr'):
        warnings.warn("lsqr not available on this machine, falling back "
                      "to sparse_cg")
        solver = 'sparse_cg'

    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if solver != 'sag':
            # sample weights are handled by rescaling X and y;
            # SAG supports them natively
            X, y = _rescale_data(X, y, sample_weight)

    # one penalty per target
    alpha = np.asarray(alpha).ravel()
    if alpha.size not in [1, n_targets]:
        raise ValueError("Number of targets and number of penalties "
                         "do not correspond: %d != %d"
                         % (alpha.size, n_targets))

    if alpha.size == 1 and n_targets > 1:
        alpha = np.repeat(alpha, n_targets)

    if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag'):
        raise ValueError('Solver %s not understood' % solver)

    n_iter = None
    if solver == 'sparse_cg':
        coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose)

    elif solver == 'lsqr':
        coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol)

    elif solver == 'cholesky':
        if n_features > n_samples:
            # solve the dual problem with the linear kernel K = X X^T
            K = safe_sparse_dot(X, X.T, dense_output=True)
            try:
                dual_coef = _solve_cholesky_kernel(K, y, alpha)
                coef = safe_sparse_dot(X.T, dual_coef,
                                       dense_output=True).T
            except linalg.LinAlgError:
                # fall back to the SVD solver if the matrix is singular
                solver = 'svd'
        else:
            try:
                coef = _solve_cholesky(X, y, alpha)
            except linalg.LinAlgError:
                # fall back to the SVD solver if the matrix is singular
                solver = 'svd'

    elif solver == 'sag':
        max_squared_sum = row_norms(X, squared=True).max()

        coef = np.empty([y.shape[1], n_features])
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        intercept = np.zeros((y.shape[1],))
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
            init = {'coef': np.zeros((n_features + int(return_intercept), 1))}
            coef_, n_iter_, _ = sag_solver(
                X, target.ravel(), sample_weight, 'squared', alpha_i,
                max_iter, tol, verbose, random_state, False,
                max_squared_sum, init)
            if return_intercept:
                coef[i] = coef_[:-1]
                intercept[i] = coef_[-1]
            else:
                coef[i] = coef_
            n_iter[i] = n_iter_

        if intercept.shape[0] == 1:
            intercept = intercept[0]
        coef = np.asarray(coef)

    if solver == 'svd':
        if sparse.issparse(X):
            raise TypeError('SVD solver does not support sparse inputs '
                            'currently')
        coef = _solve_svd(X, y, alpha)

    if ravel:
        # y was passed as a vector, so flatten the coefficients back
        coef = coef.ravel()

    if return_n_iter and return_intercept:
        return coef, n_iter, intercept
    elif return_intercept:
        return coef, intercept
    elif return_n_iter:
        return coef, n_iter
    else:
        return coef



ridge_regression:
 This is simply the unified entry point for the concrete solvers described above.

 When X is sparse and the intercept must be fitted and returned, only the
 stochastic average gradient solver 'sag' can currently be used.

 With solver='auto', a dense X or the presence of sample weights selects the
 Cholesky solver; otherwise sparse_cg is used.

 Under the Cholesky solver, n_features > n_samples switches to the kernel (dual)
 formulation, with the fixed linear kernel only; and as the code shows, when the
 matrix is singular the computation falls back to SVD.

 The SVD solver does not support sparse input. A usage sketch follows.
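
A minimal usage sketch of the unified entry point (synthetic data; ridge_regression is importable from sklearn.linear_model):

import numpy as np
from sklearn.linear_model import ridge_regression

rng = np.random.RandomState(0)
X = rng.randn(50, 10)
y = X.dot(rng.randn(10)) + 0.1 * rng.randn(50)

for solver in ('cholesky', 'lsqr', 'sparse_cg', 'svd'):
    coef = ridge_regression(X, y, alpha=1.0, solver=solver)
    print(solver, coef[:3])   # all solvers should agree closely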
 