Compare with polynomial fitting via an explicit polynomial definition: see the notes/code on polynomial fitting and connecting extreme points
Linear regression
sklearn.linear_model.LinearRegression()
--> return: linear regressor
linear regressor.fit(input samples, output labels) # train on the data
linear regressor.predict(input samples) # predict
--> return: predicted output labels
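A minimal runnable sketch of this API (the toy data and numbers below are my own):
import numpy as np
import sklearn.linear_model as lm

# toy 1-D data that roughly follows y = 2x + 1
x = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([1.1, 2.9, 5.2, 6.8])

model = lm.LinearRegression()
model.fit(x, y)                        # k and b are not returned; they are stored in the model
print(model.coef_, model.intercept_)   # learned slope k and intercept b
print(model.predict([[4.0]]))          # predicted label for a new sample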
Ridge regression
# Ridge regression (weakens the influence of outliers on the fit: the larger the regularization strength, the stronger the suppression, reducing dependence on abnormal data)
loss = J(k, b) + regularization_function(model weights) * regularization_strength (a.k.a. penalty coefficient) # the regularization term helps prevent overfitting
sklearn.linear_model.Ridge(regularization strength,
                           fit_intercept=whether to fit the intercept,
                           max_iter=maximum number of iterations)
--> return: ridge regressor
ridge regressor.fit() # train on the data
ridge regressor.predict() # predict
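A sketch of the effect of the regularization strength (toy data with one outlier; strictly speaking the penalty shrinks the model weights, and that shrinkage is what damps the outlier's pull on the slope):
import numpy as np
import sklearn.linear_model as lm

x = np.array([[0.], [1.], [2.], [3.], [4.]])
y = np.array([0., 1., 2., 3., 30.])    # the last point is an outlier

for alpha in (0.1, 10, 1000):
    model = lm.Ridge(alpha, fit_intercept=True, max_iter=10000)
    model.fit(x, y)
    print(alpha, model.coef_, model.intercept_)
# the larger alpha is, the more the slope shrinks toward 0,
# so the fit depends less on the outlier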
Underfitting: the model's predictions show large errors on both the training data and the test data.
Overfitting: the model reaches high accuracy on the training data but performs very poorly on the test data. The model is too specialized and does not generalize (not general enough, i.e. weak universality).
underfitting <--- model complexity ---> overfitting
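A sketch of that spectrum, using the polynomial degree as the complexity dial (this uses the polynomial pipeline introduced just below; the synthetic sine data, degrees, and noise level are my choices):
import numpy as np
import sklearn.pipeline as spl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm

rng = np.random.RandomState(0)
x = np.sort(rng.uniform(0, 1, 30))[:, np.newaxis]
y = np.sin(2 * np.pi * x).ravel() + rng.normal(0, 0.2, 30)
train_x, test_x = x[::2], x[1::2]          # interleaved train/test split
train_y, test_y = y[::2], y[1::2]

for degree in (1, 4, 15):                  # underfit / reasonable / overfit
    model = spl.make_pipeline(sp.PolynomialFeatures(degree), lm.LinearRegression())
    model.fit(train_x, train_y)
    print(degree,
          sm.r2_score(train_y, model.predict(train_x)),   # train R^2
          sm.r2_score(test_y, model.predict(test_x)))     # test R^2
# degree 1: low R^2 on both sets (underfitting)
# degree 15: train R^2 near 1, test R^2 collapses (overfitting)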
Polynomial regression
sklearn.preprocessing.PolynomialFeatures(highest degree)
sklearn.pipeline.make_pipeline(polynomial feature expander, linear regressor) # pipeline helper (pipeline module)
--> return: a Pipeline
# or:
sklearn.pipeline.Pipeline([('custom expander name', polynomial feature expander),
                           ('custom regressor name', linear regressor)])
x --> polynomial feature expander -- x x^2 x^3 ... --> linear regressor ---> k1, k2, k3 ...
# Note: pipeline.make_pipeline is a shorthand for the Pipeline constructor; the difference is that make_pipeline does not take custom step names (see the sketch after the help excerpts below)
1. Pipeline helper make_pipeline(.., ..)
pl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())
output:
Pipeline(memory=None,
         steps=[('polynomialfeatures', PolynomialFeatures(degree=7, include_bias=True, interaction_only=False)),
                ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))
               ]
        )
# help(...)
def make_pipeline(*steps, **kwargs):
"""Construct a Pipeline from the given estimators.
This is a shorthand for the Pipeline constructor; it does not require, and
does not permit, naming the estimators. Instead, their names will be set
to the lowercase of their types automatically.
Parameters
----------
*steps : list of estimators, # list
memory :
2. Pipeline constructor Pipeline([(), ...], ..)
pl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),      # (custom name, estimator)
             ('RegressionModel', lm.LinearRegression())])     # Pipeline takes a list of tuples; the [] must not be omitted
# the output of sp.PolynomialFeatures(7) is fed as the input of lm.LinearRegression()
# equivalent to:
pl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())
# help(...Pipeline)
Parameters
----------
steps : list
List of (name, transform) tuples (implementing fit/transform) that are
chained, in the order in which they are chained, with the last object
an estimator.
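A small sketch contrasting the two constructions; per the make_pipeline docstring above, the auto-generated names are just the lowercased class names:
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm

auto = pl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())
print(list(auto.named_steps))    # ['polynomialfeatures', 'linearregression']

named = pl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),
                     ('RegressionModel', lm.LinearRegression())])
print(list(named.named_steps))   # ['PolyFeatures', 'RegressionModel']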
Regression model performance evaluation
r2_score
# evaluating model fit with R^2
est_error = sklearn.metrics.r2_score(train_y, pred_train_y)
# R^2:
# the closer to 1, the better the features explain y and the better the model fits the data
# the closer to 0, the worse the fit
# rule of thumb: > 0.4 indicates a reasonably good fit
# caveat: R^2 never decreases as more features are added to the model, so comparing R^2 across different datasets or model sizes can be misleading (adjusted R^2 addresses this)
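To make the definition concrete, R^2 = 1 - SS_res/SS_tot can be checked by hand against sklearn (toy numbers from the r2_score docstring):
import numpy as np
import sklearn.metrics as sm

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])

ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot)                       # 0.948...
print(sm.r2_score(y_true, y_pred))               # same value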
Code
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 16:56:50 2018
@author: Administrator
"""
import pickle # object serialization (save/load models on disk)
import numpy as np
import sklearn.linear_model as lm # linear models
import sklearn.metrics as sm # model evaluation metrics
import matplotlib.pyplot as plt
import sklearn.pipeline as spl # pipeline helpers
import sklearn.preprocessing as sp
# training data
train_x, train_y = [], []
# read the data file
with open('single.txt', 'r') as f:
for line in f.readlines():
data = [float(substr) for substr in line.split(',')]
train_x.append(data[:-1])
train_y.append(data[-1])
train_x = np.array(train_x) # the training set must be an array or array_like
train_y = np.array(train_y) # the training set must be an array or array_like
#print(x.shape, y.shape)
'''
Model construction
sklearn.linear_model.LinearRegression()
--> return: linear regressor
linear regressor.fit(input samples, output labels) # train on the data
linear regressor.predict(input samples) # predict
--> return: predicted output labels
'''
model_ln = lm.LinearRegression() # build the linear regressor
model_ln.fit(train_x, train_y) # train; fit() does not return k and b, they are stored inside the model
pred_y_ln = model_ln.predict(train_x)
'''
Ridge regression (weakens the influence of outliers on the fit: the larger the regularization strength, the stronger the suppression, reducing dependence on abnormal data)
loss = J(k, b) + regularization_function(model weights) * regularization_strength (a.k.a. penalty coefficient) # the regularization term helps prevent overfitting
sklearn.linear_model.Ridge(regularization strength,
                           fit_intercept=whether to fit the intercept,
                           max_iter=maximum number of iterations)
--> return: ridge regressor
ridge regressor.fit() # train on the data
ridge regressor.predict() # predict
'''
model_rd = lm.Ridge(150, fit_intercept=True, max_iter=10000) # build the ridge regressor (alpha=150)
model_rd.fit(train_x, train_y) # train; coefficients are stored inside the model
pred_y_rd = model_rd.predict(train_x)
'''
Polynomial regression
sklearn.preprocessing.PolynomialFeatures(highest degree)
--> return: polynomial feature expander
sklearn.pipeline.make_pipeline(polynomial feature expander, linear regressor) # pipeline helper (pipeline module); revisit later ???
--> return: a Pipeline; after fitting, the linear regressor holds the coefficients k1, k2, k3 ...
x --> polynomial feature expander -- x x^2 x^3 ... --> linear regressor ---> k1, k2, k3 ...
'''
# build the model: train on the training data
model_poly = spl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression()) # build the polynomial-regression pipeline
#model_poly = spl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),
# ('RegressionModel',lm.LinearRegression())])
model_poly.fit(train_x, train_y) # fit on the training set
pred_train_y = model_poly.predict(train_x) # predict on the training set
# evaluating model fit with R^2
est_error = sm.r2_score(train_y, pred_train_y) # R-squared: coefficient of determination
# R^2:
# the closer to 1, the better the features explain y and the better the model fits the data
# the closer to 0, the worse the fit
# rule of thumb: > 0.4 indicates a reasonably good fit
# caveat: R^2 never decreases as more features are added, so comparisons across different datasets or model sizes can be misleading
print(est_error)
# use the model on test data: run the polynomial model on a dense test grid
#test_x = np.linspace(train_x.min(), train_x.max(), 1001)[:,np.newaxis] # np.newaxis adds a column axis --> 2-D
test_x = np.linspace(train_x.min(), train_x.max(), 1001) # .shape == (1001,)
# 1-D array --> 2-D array (simply add a column axis)
test_x = test_x.reshape((test_x.shape[0],-1)) # (1001, 1); rows: test_x.shape[0]; -1 lets numpy infer the number of columns
pred_test_y = model_poly.predict(test_x)
'''
By default, the input is converted to an at least 2D numpy array
'''
# evaluate the model
#print(sm.mean_absolute_error(train_y, pred_y_ln)) # mean absolute error
#print(sm.mean_squared_error(train_y, pred_y_ln)) # mean squared error
#print(sm.median_absolute_error(train_y, pred_y_ln)) # median absolute error
#print(sm.r2_score(train_y, pred_y_ln)) # r2_score (coefficient of determination) is the recommended metric for LR models
# write the models to disk in pkl format so pickle can read them back
with open('linear.pkl', 'wb') as f: # pickle.dump() vs pickle.dumps(): see the note below
pickle.dump(model_ln, f)
with open('ridge.pkl', 'wb') as f:
pickle.dump(model_rd, f)
with open('polynomial.pkl', 'wb') as f:
pickle.dump(model_poly, f)
'''
# difference between pickle.dump(..) and pickle.dumps(..):
dump(obj, file, protocol=None, *, fix_imports=True)
Write a pickled representation of obj to the open file object file.
dumps(obj, protocol=None, *, fix_imports=True)
Return the pickled representation of the object as a bytes object.
'''
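# A sketch of the reverse direction: dump()'s counterpart is pickle.load(),
# which reconstructs the model written above without retraining
with open('polynomial.pkl', 'rb') as f:
    model_loaded = pickle.load(f)
#print(np.allclose(model_loaded.predict(test_x), pred_test_y)) # True: identical model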
'''
Visualization
'''
plt.figure('Regressions', facecolor='lightgray')
plt.title('Regressions', fontsize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.tick_params(labelsize=10)
plt.grid(linestyle=':')
# scatter plot of the input samples
plt.scatter(train_x, train_y, label='Sample', color='black',linewidth=1,alpha=0.8)
# plot the linear and ridge fits (the training x values are not in order, so sort by x before using them as the plot's x axis)
sorted_indices = train_x.T[0].argsort() # train_x is not sorted; indices that sort its first column
'''
argsort(a, axis=-1, kind='quicksort', order=None)
Returns the indices that would sort an array.
'''
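# e.g. np.array([3., 1., 2.]).argsort() --> array([1, 2, 0]); indexing with it yields the sorted array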
plt.plot(train_x[sorted_indices], pred_y_ln[sorted_indices], 'o-', label='LinearRegression', color='g',linewidth=1,alpha=1)
plt.plot(train_x[sorted_indices], pred_y_rd[sorted_indices], 'o-', label='RidgeRegression', color='b',linewidth=1,alpha=1)
# plot the polynomial regression fit
plt.plot(test_x, pred_test_y, label='PolynomialRegression', color='r',linewidth=2,alpha=1)
plt.legend(fontsize=8, loc='upper left')
plt.show()
'''
def r2_score(y_true, y_pred, sample_weight=None,
multioutput="uniform_average"):
"""R^2 (coefficient of determination) regression score function.
Best possible score is 1.0 and it can be negative (because the
model can be arbitrarily worse). A constant model that always
predicts the expected value of y, disregarding the input features,
would get a R^2 score of 0.0.
'''
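A quick check of those boundary cases (toy numbers of my own):
import numpy as np
import sklearn.metrics as sm

y_true = np.array([1.0, 2.0, 3.0])
print(sm.r2_score(y_true, np.full(3, y_true.mean())))  # 0.0: constant mean predictor
print(sm.r2_score(y_true, np.array([3.0, 3.0, 0.0])))  # -6.0: worse than the mean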
'''
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
"""Scale input vectors individually to unit norm (vector length).
Read more in the :ref:`User Guide <preprocessing_normalization>`.
Parameters
----------
X : {array-like, sparse matrix}, shape [n_samples, n_features]
The data to normalize, element by element.
scipy.sparse matrices should be in CSR format to avoid an
un-necessary copy.
def transform(self, X):
"""Transform data to polynomial features
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to transform, row by row.
def fit(self, X, y=None):
"""
Compute number of output features.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data.
dir(sklearn.pipeline):
['Bunch', 'FeatureUnion', 'Memory', 'Parallel', 'Pipeline', 'TransformerMixin',
'_BaseComposition', '__all__', '__builtins__', '__cached__', '__doc__', '__file__',
'__loader__', '__name__', '__package__', '__spec__', '_fit_one_transformer', '_fit_transform_one',
'_name_estimators', '_transform_one', 'check_memory', 'clone', 'defaultdict', 'delayed',
'if_delegate_has_method', 'make_pipeline', 'make_union', 'np', 'six', 'sparse']
'''