機器學習sklearn管線Pipeline, 樣本生成器,特徵選擇器

# -*- coding: utf-8 -*-
"""
Created on Fri Aug  3 10:04:14 2018

@author: Administrator
"""

import numpy as np
import sklearn.datasets as sd
import sklearn.feature_selection as fs
import sklearn.ensemble as se
import sklearn.pipeline as spp
import sklearn.model_selection as ms
import matplotlib.pyplot as plt

# Synthetic sample generator (sklearn.datasets.make_classification).
#   n_samples             : number of samples
#   n_features            : total number of features; the features left over
#                           after n_informative + n_redundant + n_repeated
#                           are filled with random noise
#   n_informative         : number of informative features
#   n_redundant           : redundant features, random linear combinations
#                           of the informative features
#   n_repeated            : duplicated features, drawn at random from the
#                           informative and redundant features
#   n_classes             : number of target classes
#   n_clusters_per_class  : number of clusters each class is made of
#   random_state          : seed, makes the experiment repeatable
# Constraint: n_classes * n_clusters_per_class <= 2 ** n_informative
#
# The function is called through its public location, sklearn.datasets:
# the private sklearn.datasets.samples_generator module path was deprecated
# and is no longer importable in scikit-learn >= 0.24.
x, y = sd.make_classification(n_samples=200,          # total number of samples
                              n_informative=4,        # informative features
                              n_features=20,          # 20 features in total
                              n_redundant=0,          # no redundant features
                              random_state=5
                              )


# Feature selector: keep the k highest-scoring features as the main features.
#
# SelectKBest parameters (from its help text):
#   score_func : callable
#       Takes two arrays X and y and returns either a pair of arrays
#       (scores, pvalues) or a single array of scores. The default,
#       f_classif, only works with classification tasks.
#   k : int or "all", optional, default=10
#       Number of top features to select. "all" bypasses selection, which is
#       useful inside a parameter search.
#
# NOTE(review): f_regression is a regression scoring function while the
# target generated above is a class label - confirm this choice is
# intentional (f_classif is the documented default for classification).
skb = fs.SelectKBest(score_func=fs.f_regression, k=5)   # pick the 5 main features


# Random forest classifier: 25 trees, each at most 4 levels deep, with a
# fixed seed.
rfc = se.RandomForestClassifier(
    max_depth=4,
    n_estimators=25,
    random_state=7,
)

# Build the pipeline. Unlike make_pipeline, Pipeline lets every step carry a
# user-chosen name. Steps run in order: the input goes to the selector, and
# the selector's output becomes the classifier's input.
model = spp.Pipeline(steps=[
    ('selector', skb),
    ('classifier', rfc),
])

# Evaluate the pipeline: mean weighted F1 over 10-fold cross-validation.
baseline_scores = ms.cross_val_score(model, x, y, scoring='f1_weighted', cv=10)
print('f1_score_0:', baseline_scores.mean())   # original author's recorded output: 0.7631188256188256

# Change parameters of the pipeline steps. A nested parameter is addressed as
# <step name given to Pipeline> + two underscores + <parameter name>.
model.set_params(**{
    'selector__k': 2,
    'classifier__n_estimators': 10,
})

# Re-evaluate the pipeline after the parameter change.
retuned_scores = ms.cross_val_score(model, x, y, scoring='f1_weighted', cv=10)
print('f1_score_r:', retuned_scores.mean())   # original author's recorded output: 0.6943056943056943

# Train the re-parameterised pipeline on the full data set.
model.fit(x, y)

# Boolean mask over the input features: True marks a feature the fitted
# selector keeps, i.e. one the classifier actually uses.
selected_mask = model.named_steps['selector'].get_support()             # shape (20,)
print('selected_mask:', selected_mask)                                  # [...False True False False...]

# Turn the mask into the column indices of the kept features.
# The mask is already boolean, so it is used directly instead of being
# compared with True, and np.nonzero replaces the single-argument form of
# np.where (which is only a shorthand for it). The result is a tuple holding
# one index array per dimension.
selected_indices_tuple = np.nonzero(selected_mask)                      # e.g. (array([ 9, 15]),)
selected_indices = selected_indices_tuple[0]                            # e.g. array([ 9, 15])
print('selected_indices:', selected_indices)                            # original author's recorded output: selected_indices: [ 9 15]



# Rebuild x so it holds only the selected feature columns
# (selected_indices.shape[0] features per sample). The scatter plot further
# down needs exactly two features; more cannot be drawn in 2-D.
x = np.take(x, selected_indices, axis=1)
# Retrain the pipeline on the reduced data set.
model.fit(x, y)




# Plot window: the data range of each of the two features padded by 1 on
# both sides, sampled every 0.005 units.
x_step = 0.005
y_step = 0.005
x_lo = x[:, 0].min() - 1
x_hi = x[:, 0].max() + 1
y_lo = x[:, 1].min() - 1
y_hi = x[:, 1].max() + 1

# Grid of points covering the window.
grid_x = np.meshgrid(np.arange(x_lo, x_hi, x_step),
                     np.arange(y_lo, y_hi, y_step))
# Flatten both coordinate grids into one (n_points, 2) sample matrix.
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
# Predicted class for every grid point (used to colour the mesh).
flat_y = model.predict(flat_x)
# Fold the predictions back into the shape of the coordinate grids.
grid_y = flat_y.reshape(grid_x[0].shape)

# Figure setup: the same text is used for the figure name and the axes title.
window_title = 'Selector_Classfier Pipeline'
plt.figure(window_title, facecolor='lightgray')
plt.title(window_title, fontsize=14)
plt.xlabel('x', fontsize=14)
plt.ylabel('y', fontsize=14)
plt.tick_params(labelsize=10)

# Background: the predicted class at every grid point.
mesh_x, mesh_y = grid_x
plt.pcolormesh(mesh_x, mesh_y, grid_y, cmap='Dark2')
# Foreground: the samples, coloured by their label.
plt.scatter(x[:, 0], x[:, 1], c=y, s=30, cmap='cool')

plt.show()

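'''
Get the indices of the True entries of a boolean array: `np.where(mask)[0]`
Return the elements of an array at the given indices: `np.take()`
Note: `np.where(condition)`, called with only a condition, returns a tuple of
index arrays, one per dimension of the input; for a 1-D mask that is `(indices,)`.
'''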

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章