sklearn source code walkthrough: basic linear models, ridge regression, ridge.py (1)

Classes and details already covered in earlier posts are not repeated here. A working knowledge of sparse matrices is assumed.
from abc import ABCMeta, abstractmethod
import warnings

import numpy as np
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg

from .base import LinearClassifierMixin, LinearModel, _rescale_data
from .sag import sag_solver  # stochastic average gradient (SAG) solver for ridge and
                             # logistic regression; a gradient-descent variant that
                             # converges quickly
from ..base import RegressorMixin
from ..utils.extmath import safe_sparse_dot
from ..utils.extmath import row_norms  # row-wise Euclidean norms (also handles CSR input)
from ..utils import check_X_y
from ..utils import check_array  # validates input and converts it to an ndarray
from ..utils import check_consistent_length  # checks that every array in a list has the
                                             # same first dimension (i.e. same length)
from ..utils import column_or_1d  # special ravel helper: accepts a feature-like ndarray
                                  # whose second dimension has size 1 and flattens it
                                  # column-wise into a 1-D array
from ..preprocessing import LabelBinarizer  # binarizes labels for one-vs-rest problems
from ..model_selection import GridSearchCV
from ..externals import six
from ..metrics.scorer import check_scoring  # for models that can produce score
                                            # estimates, returns the function used to
                                            # compute the score


Before the concrete classes, here is a rundown of the helper interfaces used below (in no particular order); a short sketch of how they combine follows the list.
sp_linalg.aslinearoperator: wraps an object (ndarray, sparse matrix, matrix, and so on) as a LinearOperator.
np.empty(shape): returns an uninitialized ndarray of the given shape.
sp_linalg.cg: solves a linear system with the conjugate gradient (CG) method.
sp_linalg.lsqr: iterative least-squares solver (the LSQR algorithm); its damp parameter adds a ridge-style penalty.
ndarray.flat: a flat, 1-D iterator view over the array (used below to address the diagonal).
np.atleast_1d: promotes scalars to 1-D arrays; higher-dimensional inputs pass through unchanged.
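
A minimal standalone sketch (variable names chosen here for illustration) of how aslinearoperator, LinearOperator, and cg combine to solve a regularized normal-equations system without ever forming X^T X densely:

import numpy as np
from scipy.sparse import linalg as sp_linalg

rng = np.random.RandomState(0)
X = rng.randn(5, 3)
X_op = sp_linalg.aslinearoperator(X)   # only matvec/rmatvec, no dense copy

alpha = 1.0
# operator computing (X^T X + alpha * I) v, symmetric positive definite
C = sp_linalg.LinearOperator(
    (3, 3),
    matvec=lambda v: X_op.rmatvec(X_op.matvec(v)) + alpha * v,
    dtype=X.dtype)

b = X.T.dot(rng.randn(5))
w, info = sp_linalg.cg(C, b, tol=1e-10)   # info == 0 means convergence
print(w, info)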

def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0):
    n_samples, n_features = X.shape
    X1 = sp_linalg.aslinearoperator(X)
    coefs = np.empty((y.shape[1], n_features))

    if n_features > n_samples:
        def create_mv(curr_alpha):
            def _mv(x):
                # operator for the dual system: X X^T + alpha * I
                return X1.matvec(X1.rmatvec(x)) + curr_alpha * x
            return _mv
    else:
        def create_mv(curr_alpha):
            def _mv(x):
                # operator for the primal (normal-equations) system: X^T X + alpha * I
                return X1.rmatvec(X1.matvec(x)) + curr_alpha * x
            return _mv

    for i in range(y.shape[1]):
        y_column = y[:, i]

        mv = create_mv(alpha[i])
        if n_features > n_samples:
            # kernel ridge: solve (X X^T + alpha*I) c = y, then map back
            # to feature space with X^T
            C = sp_linalg.LinearOperator(
                (n_samples, n_samples), matvec=mv, dtype=X.dtype)
            coef, info = sp_linalg.cg(C, y_column, tol=tol)
            coefs[i] = X1.rmatvec(coef)
        else:
            # linear ridge: solve (X^T X + alpha*I) w = X^T y
            y_column = X1.rmatvec(y_column)
            C = sp_linalg.LinearOperator(
                (n_features, n_features), matvec=mv, dtype=X.dtype)
            coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter,
                                          tol=tol)

        if info < 0:
            raise ValueError("Failed with error code %d" % info)

        if max_iter is None and info > 0 and verbose:
            warnings.warn("sparse_cg did not converge after %d iterations."
                          % info)
    return coefs



_solve_sparse_cg:
 Here the matrix inversion that appears in the closed-form ridge solution is
 recast as solving a linear system.
 The linear operators involved only define transforms of the matrix: matvec is
 the matrix-vector product, rmatvec the conjugate-transpose (adjoint) product.
 The function branches on the relative size of n_features and n_samples; the two
 formulations can be shown to give the same result once the inverses are taken
 (see Kernel Ridge Regression.pdf).
 Returns the coefficients.
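
The equivalence mentioned above is the identity (X^T X + alpha*I)^{-1} X^T y = X^T (X X^T + alpha*I)^{-1} y, i.e. the primal and dual (kernel) ridge solutions coincide. A quick numerical check (illustrative, not part of ridge.py):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(4, 7)   # n_features > n_samples
y = rng.randn(4)
alpha = 0.5

# primal: a 7x7 solve in feature space
primal = np.linalg.solve(X.T.dot(X) + alpha * np.eye(7), X.T.dot(y))
# dual: a 4x4 solve in sample space, mapped back via X^T
dual = X.T.dot(np.linalg.solve(X.dot(X.T) + alpha * np.eye(4), y))

assert np.allclose(primal, dual)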

def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3):
    n_samples, n_features = X.shape
    coefs = np.empty((y.shape[1], n_features))
    n_iter = np.empty(y.shape[1], dtype=np.int32)

    # lsqr penalizes damp^2 * ||w||^2, so damp = sqrt(alpha) gives ridge
    sqrt_alpha = np.sqrt(alpha)

    for i in range(y.shape[1]):
        y_column = y[:, i]
        info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i],
                              atol=tol, btol=tol, iter_lim=max_iter)
        coefs[i] = info[0]    # solution vector
        n_iter[i] = info[2]   # iteration count

    return coefs, n_iter



_solve_lsqr:
 Returns the damped least-squares solution of the linear system for each target,
 together with the iteration counts.
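
Because lsqr with damping d minimizes ||X w - y||^2 + d^2 ||w||^2, passing damp = sqrt(alpha) solves exactly the ridge objective. A quick check (illustrative, not part of ridge.py):

import numpy as np
from scipy.sparse import linalg as sp_linalg

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
y = rng.randn(20)
alpha = 2.0

w_lsqr = sp_linalg.lsqr(X, y, damp=np.sqrt(alpha))[0]
w_exact = np.linalg.solve(X.T.dot(X) + alpha * np.eye(5), X.T.dot(y))
assert np.allclose(w_lsqr, w_exact, atol=1e-6)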

def _solve_cholesky(X, y, alpha):
    n_samples, n_features = X.shape
    n_targets = y.shape[1]

    A = safe_sparse_dot(X.T, X, dense_output=True)
    Xy = safe_sparse_dot(X.T, y, dense_output=True)

    one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]])

    if one_alpha:
        # same penalty for every target: add alpha to the diagonal once and
        # solve all targets in a single call
        A.flat[::n_features + 1] += alpha[0]
        return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
    else:
        # one penalty per target: solve each system separately, restoring
        # the diagonal of A between iterations
        coefs = np.empty([n_targets, n_features])
        for coef, target, curr_alpha in zip(coefs, Xy.T, alpha):
            A.flat[::n_features + 1] += curr_alpha
            coef[:] = linalg.solve(A, target, sym_pos=True,
                                   overwrite_a=False).ravel()
            A.flat[::n_features + 1] -= curr_alpha
        return coefs



_solve_cholesky:
 Since sym_pos=True is passed to linalg.solve, the matrix is declared symmetric
 positive definite, and the system is solved via a Cholesky factorization
 (decomposition into a triangular matrix times its transpose).
 A.flat[::n_features + 1] is shorthand for the diagonal elements.
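
Why A.flat[::n_features + 1] addresses the diagonal: .flat views the row-major matrix as one long vector, and consecutive diagonal entries are exactly n_features + 1 positions apart. A tiny illustration:

import numpy as np

A = np.zeros((3, 3))
A.flat[::3 + 1] += 5.0   # step of n_features + 1 lands on the diagonal
print(A)
# [[5. 0. 0.]
#  [0. 5. 0.]
#  [0. 0. 5.]]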

def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):
    n_samples = K.shape[0]
    n_targets = y.shape[1]

    if copy:
        K = K.copy()

    alpha = np.atleast_1d(alpha)
    one_alpha = (alpha == alpha[0]).all()
    has_sw = isinstance(sample_weight, np.ndarray) \
        or sample_weight not in [1.0, None]

    if has_sw:
        # rescale the problem by sqrt(sample_weight) on both sides
        sw = np.sqrt(np.atleast_1d(sample_weight))
        y = y * sw[:, np.newaxis]
        K *= np.outer(sw, sw)

    if one_alpha:
        # same penalty for every target: one factorization for all targets
        K.flat[::n_samples + 1] += alpha[0]
        try:
            dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
        except np.linalg.LinAlgError:
            warnings.warn("Singular matrix in solving dual problem. Using "
                          "least-squares solution instead.")
            dual_coef = linalg.lstsq(K, y)[0]

        # K was modified in place; restore its diagonal
        K.flat[::n_samples + 1] -= alpha[0]

        if has_sw:
            dual_coef *= sw[:, np.newaxis]

        return dual_coef
    else:
        # one penalty per target: solve each system separately
        dual_coefs = np.empty([n_targets, n_samples])

        for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha):
            K.flat[::n_samples + 1] += current_alpha
            dual_coef[:] = linalg.solve(K, target, sym_pos=True,
                                        overwrite_a=False).ravel()
            K.flat[::n_samples + 1] -= current_alpha

        if has_sw:
            dual_coefs *= sw[np.newaxis, :]

        return dual_coefs.T



_solve_cholesky_kernel:
 This function additionally supports per-sample weights; the only other
 difference from _solve_cholesky is that the kernel (Gram) matrix K must be
 supplied by the caller.
 It also falls back to the least-squares solution linalg.lstsq when the exact
 solve linalg.solve fails on a singular matrix.
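
A sketch of how a caller uses the dual solution (illustrative names; linear kernel only): with K = X X^T, the primal coefficients are recovered as w = X^T dual_coef, matching the direct primal solve.

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(5, 8)        # n_features > n_samples
y = rng.randn(5, 1)
alpha = 0.3

K = X.dot(X.T)             # linear kernel (Gram) matrix
K.flat[::5 + 1] += alpha   # K + alpha*I via the diagonal trick
dual_coef = linalg.solve(K, y, sym_pos=True)

w = X.T.dot(dual_coef)     # map back to feature space
w_primal = np.linalg.solve(X.T.dot(X) + alpha * np.eye(8), X.T.dot(y))
assert np.allclose(w, w_primal)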

def _solve_svd(X, y, alpha):
    U, s, Vt = linalg.svd(X, full_matrices=False)
    idx = s > 1e-15                 # same cutoff as scipy.linalg.pinv
    s_nnz = s[idx][:, np.newaxis]
    UTy = np.dot(U.T, y)
    d = np.zeros((s.size, alpha.size))
    d[idx] = s_nnz / (s_nnz ** 2 + alpha)   # filter factors s_i / (s_i^2 + alpha)
    d_UT_y = d * UTy
    return np.dot(Vt.T, d_UT_y).T



_solve_svd:
 This solves the same ridge problem as above, but first decomposes X by SVD and
 evaluates the closed-form solution expressed through the singular values.
 Singular values below the threshold 1e-15 are discarded (the same cutoff
 scipy.linalg.pinv uses when computing the pseudo-inverse).
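
A consistency check (illustrative) that the SVD expression w = V diag(s_i / (s_i^2 + alpha)) U^T y equals the normal-equations solution:

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(10, 4)
y = rng.randn(10)
alpha = 1.5

U, s, Vt = linalg.svd(X, full_matrices=False)
d = s / (s ** 2 + alpha)     # the ridge "filter factors"
w_svd = Vt.T.dot(d * U.T.dot(y))

w_direct = np.linalg.solve(X.T.dot(X) + alpha * np.eye(4), X.T.dot(y))
assert np.allclose(w_svd, w_direct)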

def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                     max_iter=None, tol=1e-3, verbose=0, random_state=None,
                     return_n_iter=False, return_intercept=False):
    if return_intercept and sparse.issparse(X) and solver != 'sag':
        if solver != 'auto':
            warnings.warn("In Ridge, only 'sag' solver can currently fit the "
                          "intercept when X is sparse. Solver has been "
                          "automatically changed into 'sag'.")
        solver = 'sag'

    # SAG needs X in C order and y in Fortran order
    if solver == 'sag':
        X = check_array(X, accept_sparse=['csr'],
                        dtype=np.float64, order='C')
        y = check_array(y, dtype=np.float64, ensure_2d=False, order='F')
    else:
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64)
        y = check_array(y, dtype='numeric', ensure_2d=False)

    check_consistent_length(X, y)
    n_samples, n_features = X.shape

    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape
    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None

    if solver == 'auto':
        # cholesky if possible, otherwise conjugate gradient
        if not sparse.issparse(X) or has_sw:
            solver = 'cholesky'
        else:
            solver = 'sparse_cg'

    elif solver == 'lsqr' and not hasattr(sp_linalg, 'lsqr'):
        warnings.warn("lsqr not available on this machine, falling back "
                      "to sparse_cg")
        solver = 'sparse_cg'

    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if solver != 'sag':
            # sample weights are handled by rescaling X and y;
            # SAG supports them natively
            X, y = _rescale_data(X, y, sample_weight)

    # one penalty per target
    alpha = np.asarray(alpha).ravel()
    if alpha.size not in [1, n_targets]:
        raise ValueError("Number of targets and number of penalties "
                         "do not correspond: %d != %d"
                         % (alpha.size, n_targets))

    if alpha.size == 1 and n_targets > 1:
        alpha = np.repeat(alpha, n_targets)

    if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag'):
        raise ValueError('Solver %s not understood' % solver)

    n_iter = None
    if solver == 'sparse_cg':
        coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose)

    elif solver == 'lsqr':
        coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol)

    elif solver == 'cholesky':
        if n_features > n_samples:
            # solve the dual problem with the linear kernel K = X X^T
            K = safe_sparse_dot(X, X.T, dense_output=True)
            try:
                dual_coef = _solve_cholesky_kernel(K, y, alpha)
                coef = safe_sparse_dot(X.T, dual_coef,
                                       dense_output=True).T
            except linalg.LinAlgError:
                # fall back to the SVD solver if the matrix is singular
                solver = 'svd'
        else:
            try:
                coef = _solve_cholesky(X, y, alpha)
            except linalg.LinAlgError:
                # fall back to the SVD solver if the matrix is singular
                solver = 'svd'

    elif solver == 'sag':
        max_squared_sum = row_norms(X, squared=True).max()

        coef = np.empty([y.shape[1], n_features])
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        intercept = np.zeros((y.shape[1],))
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
            init = {'coef': np.zeros((n_features + int(return_intercept), 1))}
            coef_, n_iter_, _ = sag_solver(
                X, target.ravel(), sample_weight, 'squared', alpha_i,
                max_iter, tol, verbose, random_state, False,
                max_squared_sum, init)
            if return_intercept:
                coef[i] = coef_[:-1]
                intercept[i] = coef_[-1]
            else:
                coef[i] = coef_
            n_iter[i] = n_iter_

        if intercept.shape[0] == 1:
            intercept = intercept[0]
        coef = np.asarray(coef)

    if solver == 'svd':
        if sparse.issparse(X):
            raise TypeError('SVD solver does not support sparse inputs '
                            'currently')
        coef = _solve_svd(X, y, alpha)

    if ravel:
        # y was passed as a vector, so flatten the coefficients back
        coef = coef.ravel()

    if return_n_iter and return_intercept:
        return coef, n_iter, intercept
    elif return_intercept:
        return coef, intercept
    elif return_n_iter:
        return coef, n_iter
    else:
        return coef



ridge_regression:
 This is simply the unified entry point for the concrete solvers described above.

 When X is sparse and the intercept must be fitted and returned, only the
 stochastic average gradient solver 'sag' can currently be used.

 With solver='auto', a dense X or the presence of sample weights selects the
 Cholesky solver; otherwise sparse_cg is used.

 Under the Cholesky solver, n_features > n_samples switches to the kernel (dual)
 formulation, with the fixed linear kernel only; and as the code shows, when the
 matrix is singular the computation falls back to SVD.

 The SVD solver does not support sparse input. A usage sketch follows.
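
A minimal usage sketch of the unified entry point (synthetic data; ridge_regression is importable from sklearn.linear_model):

import numpy as np
from sklearn.linear_model import ridge_regression

rng = np.random.RandomState(0)
X = rng.randn(50, 10)
y = X.dot(rng.randn(10)) + 0.1 * rng.randn(50)

for solver in ('cholesky', 'lsqr', 'sparse_cg', 'svd'):
    coef = ridge_regression(X, y, alpha=1.0, solver=solver)
    print(solver, coef[:3])   # all solvers should agree closely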
 