sklearn 0.19 的 onehot 編碼有 bug。
以下的 CategoricalEncoder 完成 onehot 編碼轉換,並在過程中將轉換後的特徵維度記錄下來,便於配合 xgb 特徵選擇獲取不同特徵的權重。
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
import numpy as np
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical columns as one-hot or ordinal integer features.

    Besides encoding, ``fit`` records the name of every output dimension
    in ``cat_attribs_`` (``"<attribute>_<category>"``) so that the expanded
    columns can later be matched against per-feature importances.

    Parameters
    ----------
    cat_attribs : sequence of str
        Name of each input column, in column order. Used only to build
        ``cat_attribs_``.
    encoding : {'onehot', 'onehot-dense', 'ordinal'}, default 'onehot'
        'onehot' returns a sparse CSR matrix, 'onehot-dense' a dense array,
        'ordinal' an array holding the integer code of each category.
    categories : 'auto' or list of array-like, default 'auto'
        'auto' learns the categories of each column from the data passed
        to ``fit``; otherwise ``categories[i]`` lists the categories
        expected in column ``i``.
    dtype : numpy dtype, default np.float64
        dtype of the transformed output.
    handle_unknown : {'error', 'ignore'}, default 'error'
        Whether to raise on categories that were not declared/seen. With
        'ignore', an unknown value yields no active one-hot column for
        that feature.

    Attributes
    ----------
    categories_ : list of ndarray
        Categories of each column, as stored on the label encoders.
    cat_attribs_ : list of str
        One name per encoded category, ordered by column and then by
        category.
    """

    def __init__(self, cat_attribs, encoding='onehot', categories='auto',
                 dtype=np.float64, handle_unknown='error'):
        self.cat_attribs = cat_attribs
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` has an unsupported value,
            if ``handle_unknown='ignore'`` is combined with
            ``encoding='ordinal'``, or if explicit ``categories`` were given,
            ``handle_unknown='error'`` and X contains a value outside them.
        """
        if self.encoding not in ('onehot', 'onehot-dense', 'ordinal'):
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending *encoding* value (the original formatted
            # handle_unknown here by mistake).
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ('error', 'ignore'):
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # The builtin ``object`` replaces the ``np.object`` alias, which was
        # removed from NumPy.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_features = X.shape[1]

        # Guard with isinstance so that an ndarray passed as ``categories``
        # is not compared element-wise against the string.
        auto_categories = (isinstance(self.categories, str)
                           and self.categories == 'auto')

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
        for i, le in enumerate(self._label_encoders_):
            Xi = X[:, i]
            if auto_categories:
                le.fit(Xi)
                continue

            valid_mask = np.isin(Xi, self.categories[i])
            if not np.all(valid_mask) and self.handle_unknown == 'error':
                diff = np.unique(Xi[~valid_mask])
                msg = ("Found unknown categories {0} in column {1}"
                       " during fit".format(diff, i))
                raise ValueError(msg)
            # Explicit categories: install them (sorted) as the encoder's
            # classes instead of learning them from the data.
            le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]
        # One output name per (column, category) pair, in output order.
        self.cat_attribs_ = [
            "{}_{}".format(self.cat_attribs[i], category)
            for i, column_categories in enumerate(self.categories_)
            for category in column_categories
        ]
        return self

    def transform(self, X):
        """Transform X using the encoding selected at construction.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input: a CSR matrix for 'onehot', a dense array for
            'onehot-dense', or the integer codes cast to ``dtype`` for
            'ordinal'.

        Raises
        ------
        ValueError
            If ``handle_unknown='error'`` and X contains a category that is
            not in ``categories_``.
        """
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        # Builtin int/bool replace the removed np.int / np.bool aliases.
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.isin(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                # Set the problematic entries to an acceptable value so the
                # label encoder can process the column. They are marked in
                # `X_mask` and dropped when the sparse matrix is assembled.
                X_mask[:, i] = valid_mask
                X[~valid_mask, i] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        # Offset of each feature's first one-hot column in the output.
        n_values = [cats.shape[0] for cats in self.categories_]
        indices = np.cumsum(np.array([0] + n_values))

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        return out
特徵選擇:使用時需對模型、特徵維度文件、輸出記錄路徑進行修改,也可以再對特徵權重的表現方式進行更改,默認爲 weight。
最後導入模型的特徵維度以你的管道流的順序爲準。
例如第一部分爲數值型 pipeline,第二部分爲類別型 pipeline,
那最後輸出的特徵維度先是數值型的特徵,再加上類別型的特徵。
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
import pandas as pd
import operator
from matplotlib import pylab as plt
from pylab import *
import os
import time
class FeatureSelect(BaseEstimator, TransformerMixin):
    """Inspect and log feature importances of a saved XGBoost model.

    All file locations are resolved relative to ``parent_path``, the parent
    of the directory that contains this source file.

    Attributes
    ----------
    d : str
        Directory containing this file.
    parent_path : str
        Parent directory of ``d``.
    importance : list of (str, number)
        Feature/score pairs sorted by descending score. Only set after
        ``xgb_feature_select`` has been called.
    """

    def __init__(self):
        # Directory of the current file. abspath() guards against a relative
        # __file__ (e.g. "script.py"), for which dirname() would return ''
        # and every derived path would start at the filesystem root.
        # __file__ is undefined in some interactive/IDE sessions; there,
        # os.path.dirname('.') can be substituted.
        self.d = os.path.dirname(os.path.abspath(__file__))
        # Parent directory of ``d``.
        self.parent_path = os.path.dirname(self.d)

    def create_feature_map(self, features, to_file_name):
        """Write feature names to an ``.fmap`` file usable by xgboost.

        The file lets importance output be reported by feature name. Each
        line is ``<index>\\t<name>\\tq`` with indices starting at 0.

        Parameters
        ----------
        features : iterable
            Feature column names (e.g. ``df.columns``), in model order.
        to_file_name : str
            Base name of the output file; it is written to
            ``<parent_path>/static_file/<to_file_name>.fmap``.
        """
        fmap_path = '{}/static_file/{}.fmap'.format(self.parent_path,
                                                    to_file_name)
        # ``with`` closes the file even if a write fails part-way.
        with open(fmap_path, 'w') as outfile:
            for index, name in enumerate(features):
                outfile.write('{0}\t{1}\tq\n'.format(index, name))

    def xgb_feature_select(self, model_file='/output/model/xgb.model', fmap_file='/static_file/xgb.fmap', feature_num=10, importance_type='weight'):
        """Load a saved booster, then print and store its feature scores.

        The full sorted list is stored on ``self.importance``; only the
        first ``feature_num`` entries are printed as a ranking.

        Parameters
        ----------
        model_file : str
            Model path, appended to ``parent_path``.
        fmap_file : str
            Feature-map path, appended to ``parent_path``.
        feature_num : int
            How many top features to print.
        importance_type : str
            Passed through to ``Booster.get_score``.
        """
        # NOTE: the original called ``mpl.font_manager.get_cachedir()`` here
        # and discarded the result. It has no bearing on the scores and
        # ``get_cachedir`` is not a documented attribute of font_manager,
        # so the call was dropped.
        model_dir = self.parent_path
        model = xgb.Booster(model_file=model_dir + model_file)
        importance = model.get_score(fmap=model_dir + fmap_file,
                                     importance_type=importance_type)

        # Kept for reference: builds a dict mapping feature name -> column
        # index from the fmap file.
        # f = open(model_dir + fmap_file)
        # fmap_dict = {}
        # for line in f:
        #     line = line.strip().split()
        #     fmap_dict[line[1]] = line[0]

        print("特徵: ", importance)
        importance = sorted(importance.items(), key=operator.itemgetter(1),
                            reverse=True)

        # Keep the full ranking so it can be written out later.
        self.importance = importance

        print('特徵排名', '特徵名稱', '特徵權重值')
        for rank, entry in enumerate(importance[:feature_num], start=1):
            print(rank, str(entry[0]), entry[1])

    def xgb_feature_importance_output(self, remark='無備註'):
        """Append the stored importances as one row to the CSV log.

        The row holds the importance list, the current local timestamp,
        the number of features and ``remark``; it is appended without a
        header to ``<parent_path>/output/csv/xgb.csv``.

        Parameters
        ----------
        remark : str
            Free-text note stored with the row.

        Raises
        ------
        AttributeError
            If ``xgb_feature_select`` has not been called yet, so there is
            no stored importance list.
        """
        importance = getattr(self, 'importance', None)
        if importance is None:
            # Same exception type as the bare attribute access would raise,
            # but with an actionable message.
            raise AttributeError(
                "'importance' is not set; call xgb_feature_select() before "
                "xgb_feature_importance_output()")

        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        output_list = [[importance, timestamp, len(importance), remark]]
        output_df = pd.DataFrame(output_list, columns=['特徵名稱與權重值', '特徵構造時間', '特徵維度數量','完成操作備註'])
        output_df.to_csv(self.parent_path+'/output/csv/xgb.csv',mode='a', header=False, index=False)
# Script entry point: intentionally does nothing when the file is run directly.
if __name__ == "__main__":
    pass