# -*- coding: utf-8 -*-
"""
Created on Fri Aug 3 10:04:14 2018
@author: Administrator
"""
import numpy as np
import sklearn.datasets as sd
import sklearn.feature_selection as fs
import sklearn.ensemble as se
import sklearn.pipeline as spp
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
# -------------------------------------------------------------------
# Sample generator: build a synthetic classification dataset.
#   n_samples  : number of samples
#   n_features = n_informative + n_redundant + n_repeated (+ noise)
#   n_informative : number of informative features
#   n_redundant   : random linear combinations of informative features
#   n_repeated    : features duplicated from informative/redundant ones
#   n_classes     : number of target classes
#   n_clusters_per_class : clusters composing one class
#   random_state  : seed, makes the experiment reproducible
#   constraint: n_classes * n_clusters_per_class <= 2 ** n_informative
# -------------------------------------------------------------------
# FIX: sklearn.datasets.samples_generator was deprecated in 0.22 and
# removed in 0.24 -- call make_classification on sklearn.datasets itself.
x, y = sd.make_classification(n_samples=200,    # total number of samples
                              n_informative=4,  # informative features
                              n_features=20,    # 20 features overall
                              n_redundant=0,    # no redundant features
                              random_state=5    # reproducible seed
                              )
# -------------------------------------------------------------------
# Feature selector: keep the k highest-scoring features.
#   score_func : callable taking arrays (X, y) and returning a pair
#                (scores, pvalues) or a single score array; the default
#                f_classif is intended for classification tasks.
#   k          : int or "all", default 10 -- number of top features to
#                keep ("all" bypasses selection, for parameter search).
# -------------------------------------------------------------------
# NOTE(review): f_regression is a regression score applied here to a
# classification target -- presumably intentional; confirm vs f_classif.
skb = fs.SelectKBest(fs.f_regression, k=5)

# Random-forest classifier.
rfc = se.RandomForestClassifier(n_estimators=25,  # number of trees
                                max_depth=4,      # cap tree depth at 4
                                random_state=7)   # seed source

# Build the pipeline. Unlike make_pipeline, Pipeline lets every stage
# carry a custom name; stages run in order, so skb's output feeds rfc.
model = spp.Pipeline([('selector', skb),
                      ('classifier', rfc)])
# Baseline performance: mean weighted-F1 over 10-fold cross-validation.
baseline = ms.cross_val_score(model, x, y, cv=10,
                              scoring='f1_weighted').mean()
print('f1_score_0:', baseline)  # 0.7631188256188256
# Re-tune in place: '<stage name>__<param>' (double underscore)
# addresses a parameter of a named pipeline stage.
model.set_params(**{'selector__k': 2,
                    'classifier__n_estimators': 10})
# Performance after the parameter change.
retuned = ms.cross_val_score(model, x, y, cv=10,
                             scoring='f1_weighted').mean()
print('f1_score_r:', retuned)  # 0.6943056943056943
# Fit the re-tuned model on the full dataset.
model.fit(x, y)
# Features the fitted selector actually kept: boolean mask over the
# original feature axis (shape (20,), True where a feature survived).
selected_mask = model.named_steps['selector'].get_support()
print('selected_mask:', selected_mask)  # [...False True False False...]
# FIX: np.where(selected_mask == True) is an anti-idiom -- '== True' is
# redundant and np.where returns a 1-tuple that had to be unpacked.
# np.flatnonzero gives the index array directly.
selected_indices = np.flatnonzero(selected_mask)
print('selected_indices:', selected_indices)  # e.g. [ 9 15]
# Rebuild the dataset with only the selected columns; with k=2 the data
# stays two-dimensional, which the scatter/decision plot below needs.
x = x[:, selected_indices]
# Retrain the pipeline on the reduced dataset.
model.fit(x, y)
# Decision-surface grid over the two selected features (step 0.005,
# padded one unit beyond the data range on every side).
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
# Flatten the grid into an (n_points, 2) array, classify every cell,
# then reshape predictions back to the grid for pcolormesh coloring.
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
# FIX: typo in the user-visible figure name/title: Classfier -> Classifier.
plt.figure('Selector_Classifier Pipeline', facecolor='lightgray')
plt.title('Selector_Classifier Pipeline', fontsize=14)
plt.xlabel('x', fontsize=14)
plt.ylabel('y', fontsize=14)
plt.tick_params(labelsize=10)
plt.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='Dark2')
plt.scatter(x[:, 0], x[:, 1], c=y, cmap='cool', s=30)
plt.show()
'''
Getting the indices of True entries of an array: np.where(mask)[0]
(note: np.where(mask) returns a tuple (indices_array,)).
Selecting elements by an index array: np.take(arr, indices).
'''
# Machine learning: sklearn Pipeline, sample generator, feature selector.
# (Scraped blog-page footer removed -- comment-section boilerplate such as
#  "post a comment" / "no comments yet" was not valid Python.)