Machine learning with sklearn (the LinearRegression linear model, the Ridge regression model, and polynomial regression built on LinearRegression)

For comparison with polynomial fitting done directly from the polynomial definition, see the earlier notes and code: polynomial fitting and connecting extreme points.


Linear regression

sklearn.linear_model.LinearRegression()
    --> return: linear regressor
                regressor.fit(input samples, output labels)      # train on the data
                regressor.predict(input samples)                 # predict
                    --> return: predicted output labels
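
A minimal usage sketch on made-up toy data; after fit, the learned k and b can be read back from the regressor's coef_ and intercept_ attributes:

import numpy as np
import sklearn.linear_model as lm

x = np.array([[1.0], [2.0], [3.0], [4.0]])    # input samples, shape (n_samples, 1)
y = np.array([3.1, 4.9, 7.2, 8.8])            # output labels, roughly y = 2x + 1

model = lm.LinearRegression()
model.fit(x, y)                        # train: k and b are stored inside the model
print(model.coef_, model.intercept_)   # learned slope k (array) and intercept b (scalar)
print(model.predict([[5.0]]))          # predicted label for a new sample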

Ridge regression

# Ridge regression (dampens the influence of outliers on the fit: the larger the regularization strength, the stronger the damping and the less the model depends on anomalous data)
loss = J(k, b) + regularization function(model weights) * regularization strength (penalty coefficient)    # the regularization term helps prevent overfitting; for Ridge it is the squared L2 norm of the coefficients
sklearn.linear_model.Ridge(regularization strength,
                           fit_intercept=whether to fit an intercept,
                           max_iter=maximum number of iterations)
    --> return: ridge regressor
                ridge regressor.fit()                  # train on the data
                ridge regressor.predict()              # predict
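
A minimal sketch on made-up toy data (the last label plays the outlier), showing how a larger regularization strength shrinks the learned slope and with it the outlier's pull on the fit:

import numpy as np
import sklearn.linear_model as lm

x = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([3.0, 5.0, 7.0, 9.0, 30.0])          # the last label is an outlier

for alpha in (0, 10, 100):                        # regularization strength
    model = lm.Ridge(alpha, fit_intercept=True, max_iter=10000)
    model.fit(x, y)
    print(alpha, model.coef_, model.intercept_)   # the slope shrinks as alpha grows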

Underfitting: the model's predictions show large errors on both the training data and the test data.
Overfitting: the model is highly accurate on the training data but performs very poorly on the test data; it is too specialized and does not generalize well.
underfitting <--- model complexity ---> overfitting
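
A rough illustration of both failure modes, sketched with made-up noisy quadratic data and the polynomial pipeline introduced in the next section: degree 1 typically underfits (poor R^2 on both splits), while degree 15 typically scores near 1 on the training points but much worse on held-out ones:

import numpy as np
import sklearn.pipeline as spl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm

rng = np.random.RandomState(0)
x = rng.uniform(-3, 3, 30).reshape(-1, 1)
y = x.ravel() ** 2 + rng.normal(0, 1, 30)         # noisy quadratic
train, test = x[:20], x[20:]                      # simple hold-out split
y_train, y_test = y[:20], y[20:]

for degree in (1, 2, 15):
    model = spl.make_pipeline(sp.PolynomialFeatures(degree), lm.LinearRegression())
    model.fit(train, y_train)
    print(degree,
          sm.r2_score(y_train, model.predict(train)),   # training R^2
          sm.r2_score(y_test, model.predict(test)))     # test R^2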

Polynomial regression

sklearn.preprocessing.PolynomialFeatures(degree)
    --> return: polynomial feature expander

sklearn.pipeline.make_pipeline(polynomial feature expander, linear regressor)    # pipeline helper from the pipeline module
    --> return: a Pipeline object

# Alternatively:
sklearn.pipeline.Pipeline([('custom expander name', polynomial feature expander),
                           ('custom regressor name', linear regressor)])

x --> polynomial feature expander -- x x^2 x^3 ... --> linear regressor ---> k1, k2, k3 ...
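
The expansion step can be checked in isolation; with the default include_bias=True, a constant 1 column is prepended:

import sklearn.preprocessing as sp

expander = sp.PolynomialFeatures(3)
print(expander.fit_transform([[2.0]]))   # [[1. 2. 4. 8.]] --> 1, x, x^2, x^3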
# Note: pipeline.make_pipeline is a shorthand for the Pipeline constructor; the difference is that make_pipeline does not take custom step names (they are generated automatically).
1. The pipeline helper make_pipeline(.., ..)
pl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())
output:
Pipeline(memory=None,
         steps=[('polynomialfeatures', PolynomialFeatures(degree=7, include_bias=True, interaction_only=False)),
                ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))
                ]
         )

# help(...)
def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline from the given estimators.
    This is a shorthand for the Pipeline constructor; it does not require, and
    does not permit, naming the estimators. Instead, their names will be set
    to the lowercase of their types automatically.

    Parameters
    ----------
    *steps : list of estimators,   # list

    memory : ...
2. The Pipeline constructor Pipeline([()], ..)
pl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),    # each step is a (custom name, estimator) tuple
              ('RegressionModel', lm.LinearRegression())])  # Pipeline takes a list of tuples; the [] must not be omitted
              # the output of sp.PolynomialFeatures(7) becomes the input of lm.LinearRegression()
# equivalent to:
pl.make_pipeline(sp.PolynomialFeatures(7),lm.LinearRegression())
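
A quick sketch confirming the naming difference between the two constructors:

import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm

auto = pl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())
print([name for name, step in auto.steps])      # ['polynomialfeatures', 'linearregression']

named = pl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),
                     ('RegressionModel', lm.LinearRegression())])
print([name for name, step in named.steps])     # ['PolyFeatures', 'RegressionModel']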

# help(...Pipeline)
    Parameters
    ----------
    steps : list
        List of (name, transform) tuples (implementing fit/transform) that are
        chained, in the order in which they are chained, with the last object
        an estimator.

Evaluating regression model performance

r2_score

# Evaluating the fit with R^2 (the coefficient of determination)
est_error = sklearn.metrics.r2_score(train_y, pred_train_y)
# R^2:
#    the closer to 1, the better the model's variables explain y and the better the fit
#    the closer to 0, the worse the fit
#    rule of thumb: > 0.4 indicates a reasonably good fit
# Caveat: R^2 never decreases as more explanatory variables are added, so R^2 values from models with different feature sets (or fit on different datasets) are not directly comparable
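
A minimal sanity check of the two reference points (a perfect predictor scores 1.0; a constant predictor at the mean scores 0.0):

import numpy as np
import sklearn.metrics as sm

y_true = np.array([1.0, 2.0, 3.0, 4.0])
print(sm.r2_score(y_true, y_true))                      # perfect fit: 1.0
print(sm.r2_score(y_true, np.full(4, y_true.mean())))   # constant mean prediction: 0.0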

code

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 16:56:50 2018

@author: Administrator
"""

import pickle                              # serialize objects to disk
import numpy as np
import sklearn.linear_model as lm          # linear models
import sklearn.metrics as sm               # model evaluation metrics
import matplotlib.pyplot as plt
import sklearn.pipeline as spl             # pipeline helpers
import sklearn.preprocessing as sp         # preprocessing / feature expansion

# training data
train_x, train_y = [], []
# read the training samples from file
with open('single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        train_x.append(data[:-1])
        train_y.append(data[-1])
train_x = np.array(train_x)                 # training data must be an array or array_like
train_y = np.array(train_y)                 # training data must be an array or array_like
#print(train_x.shape, train_y.shape)

'''
Model construction
sklearn.linear_model.LinearRegression()
    --> return: linear regressor
                regressor.fit(input samples, output labels)      # train on the data
                regressor.predict(input samples)                 # predict
                    --> return: predicted output labels
'''
model_ln = lm.LinearRegression()               # build the linear regressor
model_ln.fit(train_x, train_y)                 # train; k and b are not returned but stored inside the model
pred_y_ln = model_ln.predict(train_x)          # predict on the training inputs

'''
Ridge regression (dampens the influence of outliers on the fit: the larger the regularization strength, the stronger the damping and the less the model depends on anomalous data)
loss = J(k, b) + regularization function(model weights) * regularization strength (penalty coefficient)    # the regularization term helps prevent overfitting
sklearn.linear_model.Ridge(regularization strength,
                           fit_intercept=whether to fit an intercept,
                           max_iter=maximum number of iterations)
    --> return: ridge regressor
                ridge regressor.fit()                  # train on the data
                ridge regressor.predict()              # predict
'''
model_rd = lm.Ridge(150, fit_intercept=True, max_iter=10000)     # build the ridge regressor
model_rd.fit(train_x, train_y)                                   # train; k and b are not returned but stored inside the model
pred_y_rd = model_rd.predict(train_x)                            # predict on the training inputs

'''
Polynomial regression
sklearn.preprocessing.PolynomialFeatures(degree)
    --> return: polynomial feature expander
sklearn.pipeline.make_pipeline(polynomial feature expander, linear regressor)    # pipeline helper from the pipeline module (to revisit later ???)
    --> return: a Pipeline object (the coefficients k1, k2, k3 ... are stored in the final regressor after fitting)
x --> polynomial feature expander -- x x^2 x^3 ... --> linear regressor ---> k1, k2, k3 ...
'''
# Build the model and train it on the training data
model_poly = spl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())  # degree-7 polynomial features feeding a linear regressor
#model_poly = spl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),
#                          ('RegressionModel', lm.LinearRegression())])
model_poly.fit(train_x, train_y)                                                # fit on the training set
pred_train_y = model_poly.predict(train_x)                                      # predictions on the training set
# Evaluating the fit with R^2 (the coefficient of determination)
est_error = sm.r2_score(train_y, pred_train_y)                                  # R-square: coefficient of determination R^2
# R^2:
#    the closer to 1, the better the model's variables explain y and the better the fit
#    the closer to 0, the worse the fit
#    rule of thumb: > 0.4 indicates a reasonably good fit
# Caveat: R^2 never decreases as more explanatory variables are added, so R^2 values from models with different feature sets (or fit on different datasets) are not directly comparable
print(est_error)

# Use the model to predict on test data: generate test inputs for the polynomial model
#test_x = np.linspace(train_x.min(), train_x.max(), 1001)[:, np.newaxis]  # np.newaxis adds a column axis --> 2-D
test_x = np.linspace(train_x.min(), train_x.max(), 1001)        # .shape == (1001,)
# 1-D array --> 2-D array (simply add a column axis)
test_x = test_x.reshape((test_x.shape[0], -1))                  # (1001, 1): test_x.shape[0] rows, the column count is inferred
pred_test_y = model_poly.predict(test_x)
'''
By default, the input is converted to an at least 2D numpy array
'''

# Evaluate the model
#print(sm.mean_absolute_error(train_y, pred_y_ln))        # mean absolute error
#print(sm.mean_squared_error(train_y, pred_y_ln))         # mean squared error
#print(sm.median_absolute_error(train_y, pred_y_ln))      # median absolute error
#print(sm.r2_score(train_y, pred_y_ln))                   # recommended for LR models: coefficient of determination
#
# Write the models to disk in pkl format so pickle can read them back later
with open('linear.pkl', 'wb') as f:                 # pickle.dump() vs pickle.dumps(): see the note below
    pickle.dump(model_ln, f)
with open('ridge.pkl', 'wb') as f:
    pickle.dump(model_rd, f)
with open('polynomial.pkl', 'wb') as f:
    pickle.dump(model_poly, f)
'''
# The difference between pickle.dump(..) and pickle.dumps(..):
dump(obj, file, protocol=None, *, fix_imports=True)
    Write a pickled representation of obj to the open file object file.
dumps(obj, protocol=None, *, fix_imports=True)
    Return the pickled representation of the object as a bytes object.
'''
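
# To read a stored model back later, pickle.load is the counterpart of pickle.dump;
# a minimal sketch reusing the polynomial.pkl file written above:
with open('polynomial.pkl', 'rb') as f:
    restored = pickle.load(f)
print(restored.predict(test_x[:3]))                 # same predictions as model_poly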


'''
Visualization
'''
plt.figure('Regressions', facecolor='lightgray')
plt.title('Regressions', fontsize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.tick_params(labelsize=10)
plt.grid(linestyle=':')

# scatter plot of the input samples
plt.scatter(train_x, train_y, label='Sample', color='black',linewidth=1,alpha=0.8)
# Plot the linear and ridge fits (the training x values are not in order, so sort the points by x before drawing the lines)
sorted_indices = train_x.T[0].argsort()       # train_x is unordered; get the indices that sort its first column
'''
argsort(a, axis=-1, kind='quicksort', order=None)
    Returns the indices that would sort an array.
'''
plt.plot(train_x[sorted_indices], pred_y_ln[sorted_indices], 'o-', label='LinearRegression', color='g',linewidth=1,alpha=1)
plt.plot(train_x[sorted_indices], pred_y_rd[sorted_indices], 'o-', label='RidgeRegression', color='b',linewidth=1,alpha=1)
# plot the polynomial regression fit
plt.plot(test_x, pred_test_y, label='PolynomialRegression', color='r',linewidth=2,alpha=1)

plt.legend(fontsize=8, loc='upper left')
plt.show()




'''
def r2_score(y_true, y_pred, sample_weight=None,
             multioutput="uniform_average"):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0 and it can be negative (because the
    model can be arbitrarily worse). A constant model that always
    predicts the expected value of y, disregarding the input features,
    would get a R^2 score of 0.0.
'''

'''
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
    """Scale input vectors individually to unit norm (vector length).
    Read more in the :ref:`User Guide <preprocessing_normalization>`.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

def transform(self, X):
        """Transform data to polynomial features
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to transform, row by row.

def fit(self, X, y=None):
        """
        Compute number of output features.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data.

dir(sklearn.pipeline):         
    ['Bunch', 'FeatureUnion', 'Memory', 'Parallel', 'Pipeline', 'TransformerMixin', 
    '_BaseComposition', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', 
    '__loader__', '__name__', '__package__', '__spec__', '_fit_one_transformer', '_fit_transform_one', 
    '_name_estimators', '_transform_one', 'check_memory', 'clone', 'defaultdict', 'delayed', 
    'if_delegate_has_method', 'make_pipeline', 'make_union', 'np', 'six', 'sparse']
'''