"""Machine-learning classification algorithms.

Preprocessing + data splitting + feature selection + feature reconstruction
+ hyperparameter search + model invocation.
"""
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing #預處理
from sklearn.model_selection import train_test_split #劃分數據
from sklearn.model_selection import GridSearchCV #網格搜索
# Feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.svm import SVC #SVM
from sklearn.tree import DecisionTreeClassifier #DT
from sklearn.ensemble import AdaBoostClassifier #Ada
from sklearn.ensemble import RandomForestClassifier #RF
from sklearn.ensemble import GradientBoostingClassifier #GBDT
from lightgbm import LGBMClassifier #LGBM
from xgboost.sklearn import XGBClassifier #XGB
import warnings
warnings.filterwarnings('ignore')
def obtainFormatData():
    """Load the breast-cancer dataset and return (scaled features, labels).

    Features are min-max normalised to [0, 1]; the sample/feature counts
    are printed for reference.
    """
    bundle = datasets.load_breast_cancer()
    features, labels = bundle.data, bundle.target
    separator = '======================================================'
    print(separator)
    print('nums = {} features = {}'.
          format(features.shape[0], features.shape[1]))
    print(separator)
    # Min-max scaling: X = (X - min(X)) / (max(X) - min(X))
    scaled = preprocessing.MinMaxScaler().fit_transform(features)
    return scaled, labels
# Feature selection
def treeModelFS(x, y):
    """Rank features by ExtraTrees impurity importance and plot a bar chart.

    Parameters
    ----------
    x, y : training features and labels.

    Returns
    -------
    pandas.Series of feature importances sorted descending (previously the
    scores were computed but discarded; returning them lets callers inspect
    the ranking programmatically — backward-compatible addition).
    """
    clf = ExtraTreesClassifier(n_estimators=10, criterion='gini',
                               max_depth=None, random_state=1)
    clf.fit(x, y)
    scores = pd.Series(clf.feature_importances_).sort_values(ascending=False)
    # NOTE: pandas plotting requires matplotlib; the figure is created but
    # not shown or saved here — the caller is expected to handle display.
    scores.plot.bar(rot=0, figsize=(8, 3), title='importance of features based on tree')
    return scores
def LRModelFS(x, y):
    """L1-based feature selection via logistic regression.

    For each inverse regularisation strength C, prints the indices of
    features whose coefficient survives the L1 penalty (non-zero weight).
    """
    print('FS based on L1:')
    for c in [0.1, 1.0, 2.5]:
        # BUG FIX: the default 'lbfgs' solver (sklearn >= 0.22) does not
        # support penalty='l1' and raises ValueError; 'liblinear' does.
        clf = LogisticRegression(penalty='l1', C=c, solver='liblinear',
                                 random_state=1)
        clf.fit(x, y)
        index = list(np.where(clf.coef_[0] != 0)[0])
        print('C = {} FSI = {}'.format(c, index))
# Feature reconstruction
def PCAdescendingDimension(x, y):
    """Reduce features with PCA at several dimensionalities and report
    LogisticRegression train/test accuracy for each."""
    for n_components in [3, 8, 15]:
        reducer = PCA(n_components=n_components)
        # Projected (reduced) feature matrix
        projected = reducer.fit_transform(x)
        x_train, x_test, y_train, y_test = train_test_split(
            projected, y, test_size=0.2, random_state=1)
        model = LogisticRegression()
        model.fit(x_train, y_train)
        train_acc = round(model.score(x_train, y_train), 5)
        test_acc = round(model.score(x_test, y_test), 5)
        # explained_variance_ratio_ holds the variance share per component;
        # its sum is the total variance retained by the projection.
        retained = round(np.sum(reducer.explained_variance_ratio_), 3)
        print('components = {} var = {}: train = {} test = {}'.
              format(n_components, retained, train_acc, test_acc))
def LDAdescendingDimension(x, y):
    """Reduce features with LDA and report LogisticRegression accuracy.

    BUG FIX: LDA supports at most min(n_classes - 1, n_features) components.
    The breast-cancer target is binary, so that ceiling is 1 and requesting
    3 or 8 components raises ValueError in modern scikit-learn. Requested
    values are clamped to the valid maximum instead of crashing.
    """
    max_components = min(len(np.unique(y)) - 1, x.shape[1])
    for n in [1, 3, 8]:
        n_eff = min(n, max_components)  # clamp to LDA's hard limit
        clf = LDA(n_components=n_eff)
        clf.fit(x, y)
        # fx is the projected (reduced) feature matrix
        fx = clf.transform(x)
        x_train, x_test, y_train, y_test = train_test_split(
            fx, y, test_size=0.2, random_state=1)
        m = LogisticRegression()
        m.fit(x_train, y_train)
        train_acc = round(m.score(x_train, y_train), 5)
        test_acc = round(m.score(x_test, y_test), 5)
        # explained_variance_ratio_ gives the per-component variance share
        print('components = {} var = {}: train = {} test = {}'.
              format(n_eff, round(np.sum(clf.explained_variance_ratio_), 3),
                     train_acc, test_acc))
# Hyperparameter search
def KNNClassifierALG(x_train, x_test, y_train, y_test):
    """Grid-search the neighbour count for KNN, then report the best
    model's train/test accuracy.

    Parameters are the pre-split feature/label arrays.
    """
    params = {'n_neighbors': range(3, 50)}
    clf = GridSearchCV(estimator=KNeighborsClassifier(n_neighbors=5),
                       param_grid=params)
    clf.fit(x_train, y_train)
    best = clf.best_params_
    print('KNN:', best)
    # GridSearchCV (refit=True by default) has already retrained the winning
    # model on the full training set — reuse it rather than fitting again.
    m = clf.best_estimator_
    train_acc = round(m.score(x_train, y_train), 5)
    test_acc = round(m.score(x_test, y_test), 5)
    print('acc: Train = {} Test = {}'.format(train_acc, test_acc))
def LRClassifierALG(x_train, x_test, y_train, y_test):
    """Grid-search C and penalty for LogisticRegression, then report the
    best model's train/test accuracy."""
    params = {'C': [0.1, 1.0, 10], 'penalty': ['l1', 'l2']}
    # BUG FIX: the default 'lbfgs' solver rejects penalty='l1' (sklearn >= 0.22),
    # so every l1 grid cell failed (silently, since warnings are suppressed).
    # 'liblinear' supports both penalties being searched.
    clf = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                       param_grid=params)
    clf.fit(x_train, y_train)
    best = clf.best_params_
    print('LR:', best)
    m = LogisticRegression(C=best['C'], penalty=best['penalty'],
                           solver='liblinear')
    m.fit(x_train, y_train)
    train_acc = round(m.score(x_train, y_train), 5)
    test_acc = round(m.score(x_test, y_test), 5)
    print('acc: Train = {} Test = {}'.format(train_acc, test_acc))
def SVMClassifierALG(x_train, x_test, y_train, y_test):
    """Grid-search C and kernel for SVC, then report train/test accuracy
    of a model refit with the winning parameters."""
    grid = {'C': [0.1, 1.0, 10], 'kernel': ['rbf', 'sigmoid', 'linear']}
    searcher = GridSearchCV(estimator=SVC(), param_grid=grid)
    searcher.fit(x_train, y_train)
    winner = searcher.best_params_
    print('SVM:', winner)
    model = SVC(C=winner['C'], kernel=winner['kernel'])
    model.fit(x_train, y_train)
    train_acc = round(model.score(x_train, y_train), 5)
    test_acc = round(model.score(x_test, y_test), 5)
    print('acc: Train = {} Test = {}'.format(train_acc, test_acc))
# Tree-based models
def TreeModelClassifierALG(x_train, x_test, y_train, y_test):
    """Fit a suite of tree-based classifiers (DT, AdaBoost, RandomForest,
    GBDT, LightGBM, XGBoost) and print each one's train/test accuracy."""
    models = {
        'DT': DecisionTreeClassifier(criterion='gini', splitter='best',
                                     max_depth=None, min_samples_split=2,
                                     max_features=None, max_leaf_nodes=None,
                                     random_state=1),
        'Ada': AdaBoostClassifier(DecisionTreeClassifier(random_state=1),
                                  n_estimators=50, learning_rate=0.1),
        # BUG FIX: max_features='auto' was deprecated in sklearn 1.1 and
        # removed in 1.3; for classifiers 'auto' meant 'sqrt', so this is
        # the exact behavioural equivalent.
        'RF': RandomForestClassifier(n_estimators=50, criterion='gini',
                                     max_depth=None, max_features='sqrt',
                                     min_samples_split=2, min_samples_leaf=1,
                                     random_state=1),
        'GBDT': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                           subsample=0.9, max_features=None,
                                           min_samples_split=2, min_samples_leaf=1,
                                           random_state=1),
        'LGBM': LGBMClassifier(boosting_type='gbdt', num_leaves=30,
                               n_estimators=100, learning_rate=0.1,
                               min_child_samples=10, max_depth=-1),
        'XGB': XGBClassifier(max_depth=3, learning_rate=0.1,
                             n_estimators=100, booster='gbtree',
                             objective='binary:logistic', subsample=1,
                             reg_alpha=0, reg_lambda=1),
    }
    print('======================================================')
    for name, clf in models.items():
        clf.fit(x_train, y_train)
        train_acc = round(clf.score(x_train, y_train), 5)
        test_acc = round(clf.score(x_test, y_test), 5)
        print('model: {}\nacc: Train = {} Test = {}'.format(name, train_acc, test_acc))
        print('======================================================')
def _main():
    """Script entry point: load/scale the data, split it once, then run
    each hyperparameter-searched classifier family on the same split."""
    x, y = obtainFormatData()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1)
    KNNClassifierALG(x_train, x_test, y_train, y_test)
    LRClassifierALG(x_train, x_test, y_train, y_test)
    SVMClassifierALG(x_train, x_test, y_train, y_test)
    TreeModelClassifierALG(x_train, x_test, y_train, y_test)


# Guard so importing this module no longer triggers the full experiment.
if __name__ == '__main__':
    _main()