from abc import ABCMeta, abstractmethod: 有關實現抽象類的方法。
import numbers: 抽象基類層次結構,這些類不可被實例化。
import warnings
import scipy.sparse as sp
from scipy import linalg
from scipy import sparse
from ..externals.joblib import Parallel, delayed:前者操縱多線程,後者用於捕獲
參數:
EX:[delayed(sqrt)(i ** 3)for i in range(10)]
要應用前者必須用後者生成捕獲參數的形式。
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin:
這裏Mixin類是爲了解決多重繼承的問題提出的,目的是爲派生類提供
可用的接口,而又不需要繁複的繼承,即Mixin類無實例化意義但有方法意義。
from..utils import check_array, check_X_y, deprecated, as_float_array
deprecated(反對) decorator: 用於在裝飾對象被調用時提供警告提示。
from ..utils.validation import FLOAT_DTYPES:
(numpy.float64, numpy.float32, numpy.float16)
from ..utils import check_random_state:
對設定的隨機數狀態進行更新。
from ..utils.extmath import safe_sparse_dot:
正確處理稀疏矩陣的點乘函數,可見矩陣乘法的快速計算並非通過定義完成。
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
CSR CSC 爲不同的稀疏矩陣存儲格式,
mean_variance_axis:指定計算稀疏矩陣沿某一軸方向的mean variance.
inplace_column_scale:對數據陣的列實現scale指定的列乘積變換。
from ..utils.fixes import sparse_lsqr
對於大型 稀疏 線性系統 或等式 找到最小二乘解。
from ..utils.seq_datasets import ArrayDataset, CSRDataset
數據集類型 分別指定了表示形式爲二維數組及稀疏矩陣
from ..utils.validation import check_is_fitted:
檢查估計器是否已經擬合(即是否已具有指定的屬性,如coef_),未擬合時拋出NotFittedError。
from ..exceptions import NotFittedError
from ..preprocessing.data import normalize as f_normalize
歸一化變換(只縮放不中心化),normalize默認axis = 1是針對每個樣本(行)的;本文件中以axis = 0調用,即針對每個feature(列)。
對於稀疏數據的相應常數列使用此參數進行調整,避免震盪。(。。。)
def make_datasets(X, y, sample_weight, random_state = None):
    """Wrap the training data in a dataset object and pick the intercept decay.

    A seed is drawn from the random state obtained via ``check_random_state``
    and handed to the dataset constructor.

    Returns
    -------
    dataset : CSRDataset or ArrayDataset
        ``CSRDataset`` built from the CSR buffers of ``X`` when ``X`` is
        sparse, otherwise ``ArrayDataset`` built from ``X`` directly.
    intercept_decay : number
        ``SPARSE_INTERCEPT_DECAY`` for sparse input, ``1`` for dense input.
    """
    # Draw the seed from [1, int32 max) so it fits a 32-bit integer.
    upper_bound = np.iinfo(np.int32).max
    seed = check_random_state(random_state).randint(1, upper_bound)

    if not sp.issparse(X):
        return ArrayDataset(X, y, sample_weight, seed=seed), 1

    csr_dataset = CSRDataset(X.data, X.indptr, X.indices, y, sample_weight,
                             seed=seed)
    return csr_dataset, SPARSE_INTERCEPT_DECAY
check_random_state:
返回RandomState實例。(分None RandomState int三種參數返回實例)
np.iinfo():
返回機器對int類型的範圍。
np.random.randint(low, high=None, size=None):
返回[low, high)範圍的整數隨機數。
CSRDataset(。。。)
ArrayDataset(。。。)
python and 具有javascript類似的特性,在計算bool值時當and第一個條件不被通過時
不會計算第二個條件,故在設定第二個條件時可以假定第一個條件成立。
甚至函數的形參也可以改變。
(如sparse_center_data center_data...)
def _preprocess_data(X, y, fit_intercept, normalize = False, copy = True,
                     sample_weight = None, return_mean = False):
    """Center and optionally scale ``X`` and ``y`` before fitting.

    Parameters
    ----------
    X : array or sparse matrix (CSR/CSC accepted)
        Validated through ``check_array`` with a float dtype; copied when
        ``copy`` is true.
    y : array
        Targets, 1-D or 2-D.
    fit_intercept : bool
        When false, nothing is centered or scaled and neutral offsets/scales
        are returned.
    normalize : bool
        When true (and ``fit_intercept`` is true) the columns of ``X`` are
        scaled; the applied scale is returned as ``X_scale``.
    copy : bool
        Forwarded to ``check_array``.
    sample_weight : None, number or array
        Weights for the averages of dense ``X`` and of ``y``.  A scalar is
        treated as "no weights".
    return_mean : bool
        Sparse input only: when false, the returned ``X_offset`` is zeros
        instead of the column means (sparse ``X`` itself is never centered).

    Returns
    -------
    X, y, X_offset, y_offset, X_scale
    """
    if isinstance(sample_weight, numbers.Number):
        # A single number weights every sample equally, so it is dropped
        # before computing the (weighted) averages below.
        sample_weight = None

    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)

    if fit_intercept:
        if sp.issparse(X):
            # Sparse X is not centered; only the offset is reported.
            X_offset, X_var = mean_variance_axis(X, axis=0)
            if not return_mean:
                X_offset = np.zeros(X.shape[1])

            if normalize:
                # scale = sqrt(n_samples * variance), computed in place in
                # the X_var buffer (second argument of np.sqrt is `out`).
                X_var *= X.shape[0]
                X_scale = np.sqrt(X_var, X_var)
                del X_var
                # Avoid dividing constant columns by zero.
                X_scale[X_scale == 0] = 1
                inplace_column_scale(X, 1. / X_scale)
            else:
                X_scale = np.ones(X.shape[1])
        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)
            X -= X_offset
            if normalize:
                X, X_scale = f_normalize(X, axis=0, copy=False,
                                         return_norm=True)
            else:
                X_scale = np.ones(X.shape[1])
        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1])
        X_scale = np.ones(X.shape[1])
        # Fixed: the original referenced the undefined name `x` here.
        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
預處理(中心化與歸一化)過程:當數據陣爲稀疏(這時特徵均值接近0)時不會對X中心化,但normalize=True時仍然會按列縮放;
當fit_intercept=False時既不中心化也不縮放(X_offset取0,X_scale取1)。可以使用樣本加權係數對mean進行加權。
sample_weight(。。。)
numbers.Number爲python中數的抽象基類abc.ABCMeta
1 / sqrt(n * var)
def _rescale_data(X, y, sample_weight):
    """Scale the rows of ``X`` and ``y`` by the square root of their weights.

    ``sample_weight`` is broadcast to one value per row of ``X`` (so a scalar
    is accepted), its square root is placed on the diagonal of a sparse
    matrix, and both ``X`` and ``y`` are left-multiplied by that matrix.

    Returns
    -------
    X, y : the rescaled data and targets.
    """
    n_rows = X.shape[0]
    root_weights = np.sqrt(sample_weight * np.ones(n_rows))
    # Diagonal (offset 0) sparse matrix holding sqrt(weight) per sample.
    row_scaler = sparse.dia_matrix((root_weights, 0), shape=(n_rows, n_rows))
    return safe_sparse_dot(row_scaler, X), safe_sparse_dot(row_scaler, y)
利用矩陣乘法對數據進行放縮。
sparse.dia_matrix: 生成對角稀疏矩陣
class LinearModel(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Abstract base for linear models; subclasses must implement ``fit``."""

    @abstractmethod
    def fit(self, X, y):
        """Fit model."""

    def _decision_function(self, X):
        """Return ``X . coef_.T + intercept_`` for a fitted model.

        ``check_is_fitted`` is called on ``coef_`` first; ``X`` may be dense
        or sparse (CSR, CSC or COO).
        """
        check_is_fitted(self, "coef_")
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        linear_term = safe_sparse_dot(X, self.coef_.T, dense_output=True)
        return linear_term + self.intercept_

    def predict(self, X):
        """Predict using the linear model (delegates to ``_decision_function``)."""
        return self._decision_function(X)

    # Expose the module-level helper on the class without binding ``self``.
    _preprocess_data = staticmethod(_preprocess_data)

    def _set_intercept(self, X_offset, y_offset, X_scale):
        """Set ``intercept_`` and undo the column scaling on ``coef_``.

        Without an intercept, ``intercept_`` is 0 and ``coef_`` is untouched.
        """
        if not self.fit_intercept:
            self.intercept_ = 0.
            return
        # Map the coefficients back to the unscaled feature space, then
        # recover the intercept from the offsets.
        self.coef_ = self.coef_ / X_scale
        self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
six.with_metaclass(ABCMeta, BaseEstimator):
使用ABCMeta產生BaseEstimator的抽象基類。
這裏使用abstractmethod是一種標註,ABCMeta生成的虛基類在
所有abstractmethod被重載前不能實例化。
_decision_function:
predict:
返回預測值向量.
_set_intercept:
設定參數的函數定義說明,當擬合常數項時其將求解與
變換相分離,對變換後的正則化數據進行求解,與原解相同。
class LinearClassifierMixin(ClassifierMixin):
    """Mixin supplying prediction methods for linear classifiers.

    Relies on the host class providing ``coef_``, ``intercept_`` and
    ``classes_`` after fitting.
    """

    def decision_function(self, X):
        """Return the signed scores ``X . coef_.T + intercept_``.

        Returns
        -------
        scores : array
            1-D when ``coef_`` has a single row, otherwise 2-D with one
            column per row of ``coef_``.

        Raises
        ------
        NotFittedError
            If ``coef_`` is missing or ``None``.
        ValueError
            If ``X`` does not have ``coef_.shape[1]`` columns.
        """
        # Fixed: `in None` -> `is None`, and `type(self), __name__` ->
        # `type(self).__name__` (the original did not parse).
        if not hasattr(self, 'coef_') or self.coef_ is None:
            raise NotFittedError("This %(name)s instance is not fitted "
                                 "yet" % {"name": type(self).__name__})

        X = check_array(X, accept_sparse='csr')

        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        scores = safe_sparse_dot(X, self.coef_.T,
                                 dense_output=True) + self.intercept_
        # Fixed: the scores were computed but never returned.  A single score
        # column is flattened so that `predict` / `_predict_proba_lr` take
        # their 1-D (two-class) branch.
        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict(self, X):
        """Return the entry of ``classes_`` selected by the decision scores.

        With 1-D scores the index is ``scores > 0`` (0 or 1); otherwise it is
        the column with the largest score.
        """
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            # Builtin `int` replaces the removed `np.int` alias.
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Turn decision scores into probabilities with the logistic function.

        Computes ``1 / (1 + exp(-score))`` in place.  For 1-D scores the
        result is stacked as ``[1 - p, p]`` columns; otherwise each row is
        divided by its sum.
        """
        prob = self.decision_function(X)
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            prob /= prob.sum(axis=1).reshape([prob.shape[0], -1])
            return prob
LinearClassifierMixin:
由於這是一個Mixin類,故包含將要派生的類的混合多種方法。
這裏y應當是以0爲中心判別超平面的符號距離度量。
從返回值來看,其具有y特徵非一維的可能。
decision_function:
與線性模型決定函數相同。
predict:
當y爲正常意義的n_samples向量時,由閾值0來進行分類判斷,
否則爲多類問題,按分類向量最大值來判斷。(幾何意義上的)
返回分類。
_predict_proba_lr:
np.exp, np.reciprocal(倒數變換),這裏多次使用out作爲第二參數簡化。
返回兩類別或多類別概率。(歸一化)
class SparseCoefMixin(object):
    """Mixin for switching ``coef_`` between dense and sparse storage."""

    def densify(self):
        """Convert ``coef_`` to a dense array if it is sparse; return ``self``."""
        check_is_fitted(
            self, "coef_",
            msg="Estimator, %(name)s, must be fitted before densifying.")
        coef = self.coef_
        if sp.issparse(coef):
            self.coef_ = coef.toarray()
        return self

    def sparsify(self):
        """Convert ``coef_`` to a ``scipy.sparse`` CSR matrix; return ``self``."""
        check_is_fitted(
            self, "coef_",
            msg="Estimator, %(name)s, must be fitted before sparsifying.")
        self.coef_ = sp.csr_matrix(self.coef_)
        return self
係數密集化或稀疏化(僅僅是形式轉換)
densify:
密集化
sparsify:
稀疏化
class LinearRegression(LinearModel, RegressorMixin):
    """Ordinary least squares linear regression.

    Parameters
    ----------
    fit_intercept : bool
        Whether to center the data and estimate an intercept.
    normalize : bool
        Forwarded to ``_preprocess_data``.
    copy_X : bool
        Forwarded to ``_preprocess_data`` as ``copy``.
    n_jobs : int
        Number of jobs handed to ``Parallel`` when solving a sparse problem
        with several target columns.
    """

    def __init__(self, fit_intercept = True, normalize = False, copy_X = True,
                 n_jobs = 1):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs

    def fit(self, X, y, sample_weight = None):
        """Fit the model and return ``self``.

        Sets ``coef_``, ``intercept_`` and ``_residuals``; for dense input
        also ``rank_`` and ``singular_``.

        Raises
        ------
        ValueError
            If ``sample_weight`` has more than one dimension.
        """
        n_jobs_ = self.n_jobs
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True, multi_output=True)

        # Fixed: `np.atleast_id` does not exist; `np.atleast_1d` is intended.
        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        # Fixed: `_preprocess_data` returns five values; `y_offset` was
        # missing from the unpacking although it is used below.
        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
            copy=self.copy_X, sample_weight=sample_weight)

        if sample_weight is not None:
            X, y = _rescale_data(X, y, sample_weight)

        if sp.issparse(X):
            if y.ndim < 2:
                out = sparse_lsqr(X, y)
                self.coef_ = out[0]
                # Fixed: was `_residues`, inconsistent with the other branches.
                self._residuals = out[3]
            else:
                # The sparse solver is applied to one target column at a
                # time; the columns are independent, so they run in parallel.
                # Fixed: the generator belongs to the Parallel call, not to
                # the argument list of `sparse_lsqr`.
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(sparse_lsqr)(X, y[:, j].ravel())
                    for j in range(y.shape[1]))
                # np.vstack needs a sequence, not a generator.
                self.coef_ = np.vstack([out[0] for out in outs])
                self._residuals = np.vstack([out[3] for out in outs])
        else:
            self.coef_, self._residuals, self.rank_, self.singular_ = \
                linalg.lstsq(X, y)
            self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
fit:
check_X_y: 僅僅是進行形式檢查。
這裏值得注意的是sparse_lsqr不能用於求解y爲二維的問題,
轉化爲對每一個列求解,且由於列間求解獨立,採用了
Parallel(delayed)這種並行求解方式加快求解速度。
普通最小二乘求解使用linalg.lstsq
其他細節在上面的討論中都提到了。