Machine Learning Classification Algorithms

Preprocessing + data splitting + feature selection + feature reconstruction + hyperparameter search + model invocation

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing                      # preprocessing
from sklearn.model_selection import train_test_split   # data splitting
from sklearn.model_selection import GridSearchCV       # grid search

# feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

# dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.neighbors import KNeighborsClassifier         # KNN
from sklearn.svm import SVC                                # SVM
from sklearn.tree import DecisionTreeClassifier            # DT
from sklearn.ensemble import AdaBoostClassifier            # Ada
from sklearn.ensemble import RandomForestClassifier        # RF
from sklearn.ensemble import GradientBoostingClassifier    # GBDT
from lightgbm import LGBMClassifier                        # LGBM
from xgboost.sklearn import XGBClassifier                  # XGB

import warnings
warnings.filterwarnings('ignore')

def obtainFormatData():
    dataset = datasets.load_breast_cancer()
    featuresMatrix,y = dataset.data,dataset.target
    print('======================================================')
    print('nums = {}        features = {}'.
          format(featuresMatrix.shape[0],featuresMatrix.shape[1]))
    print('======================================================')
    
    #min-max scaling: X = (X - min(X)) / (max(X) - min(X))
    x = preprocessing.MinMaxScaler().fit_transform(featuresMatrix)
    
    return x,y
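
As a quick sanity check of the formula above (a standalone sketch, not part of the original script), MinMaxScaler applies exactly that rescaling column-wise over the feature matrix:

# Sketch: verify that MinMaxScaler matches the column-wise min-max formula.
fm = datasets.load_breast_cancer().data
manual = (fm - fm.min(axis=0)) / (fm.max(axis=0) - fm.min(axis=0))
assert np.allclose(preprocessing.MinMaxScaler().fit_transform(fm), manual)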

# feature selection
def treeModelFS(x,y):
    clf = ExtraTreesClassifier(n_estimators=10,criterion='gini'
                               ,max_depth=None,random_state=1)
    clf.fit(x,y)
    #rank impurity-based importances and plot them in descending order
    scores = pd.Series(clf.feature_importances_).sort_values(ascending=False)
    scores.plot.bar(rot=0,figsize=(8,3),title='importance of features based on tree')

def LRModelFS(x,y):
    print('FS based on L1:')
    for c in [0.1,1.0,2.5]:
        #liblinear is needed here: the default solver does not support the L1 penalty
        clf = LogisticRegression(penalty='l1',C=c,solver='liblinear',random_state=1)
        clf.fit(x,y)
        #indices of features kept by the L1 penalty (non-zero coefficients)
        index = list(np.where(clf.coef_[0]!=0)[0])
        print('C = {} FSI = {}'.format(c,index))
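
Both helpers above only report which features matter; to go from scores to an actual reduced feature matrix, sklearn's SelectFromModel can wrap either selector. A minimal sketch (not part of the original script):

# Sketch: turn tree-based importances into an actual feature subset.
from sklearn.feature_selection import SelectFromModel

def treeModelSelect(x,y):
    base = ExtraTreesClassifier(n_estimators=10,random_state=1)
    selector = SelectFromModel(base).fit(x,y)   #default threshold: mean importance
    print('kept features:',list(np.where(selector.get_support())[0]))
    return selector.transform(x)                #reduced feature matrix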

# feature reconstruction (dimensionality reduction)
def PCAdescendingDimension(x,y):
    for n in [3,8,15]:
        clf = PCA(n_components=n) 
        #fx is the feature set after dimensionality reduction
        fx = clf.fit_transform(x)
        x_train,x_test,y_train,y_test = train_test_split(fx,y,
                                        test_size=0.2,random_state=1)
        m = LogisticRegression()
        m.fit(x_train,y_train)
        train_acc = round(m.score(x_train,y_train),5)
        test_acc = round(m.score(x_test,y_test),5)
        #explained_variance_ratio_ gives the fraction of variance kept per component
        print('components = {}  var = {}: train = {} test = {}'.
              format(n,round(np.sum(clf.explained_variance_ratio_),3),train_acc,test_acc))
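
Instead of sweeping fixed component counts, PCA also accepts a target variance ratio and picks the component count automatically. A small sketch using the same data:

# Sketch: let PCA choose the number of components that retains 95% of the variance.
x95,_ = obtainFormatData()
clf95 = PCA(n_components=0.95)   #a float in (0,1) is treated as a variance target
clf95.fit(x95)
print('components kept for 95% variance:',clf95.n_components_)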
        
def LDAdescendingDimension(x,y):
    #LDA yields at most n_classes - 1 discriminant directions; the breast-cancer
    #target is binary, so only n_components = 1 is valid here (larger values
    #raise a ValueError in current sklearn)
    for n in [1]:
        clf = LDA(n_components=n)
        clf.fit(x,y)
        #fx is the feature set after dimensionality reduction
        fx = clf.transform(x)
        x_train,x_test,y_train,y_test = train_test_split(fx,y,
                                        test_size=0.2,random_state=1)
        m = LogisticRegression()
        m.fit(x_train,y_train)
        train_acc = round(m.score(x_train,y_train),5)
        test_acc = round(m.score(x_test,y_test),5)
        #explained_variance_ratio_ gives the fraction of variance kept per component
        print('components = {}  var = {}: train = {} test = {}'.
              format(n,round(np.sum(clf.explained_variance_ratio_),3),train_acc,test_acc))

# hyperparameter search
def KNNClassifierALG(x_train,x_test,y_train,y_test):
    params = {'n_neighbors':range(3,50)}
    clf = GridSearchCV(estimator=KNeighborsClassifier(),
                       param_grid=params)
    clf.fit(x_train,y_train)
    best = clf.best_params_
    print('KNN:',best)
    
    m = KNeighborsClassifier(n_neighbors=best['n_neighbors'])
    m.fit(x_train,y_train)
    train_acc = round(m.score(x_train,y_train),5)
    test_acc = round(m.score(x_test,y_test),5)
    print('acc: Train = {}  Test = {}'.format(train_acc,test_acc))
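
Note that with the default refit=True, GridSearchCV already retrains the best configuration on the full training set, so the manual refit above can be replaced by a single line:

m = clf.best_estimator_   #already refit on x_train with the best params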

def LRClassifierALG(x_train,x_test,y_train,y_test):
    params = {'C':[0.1,1.0,10],'penalty':['l1','l2']}
    #liblinear supports both the l1 and l2 penalties in the grid
    clf = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),param_grid=params)
    clf.fit(x_train,y_train)
    best = clf.best_params_
    print('LR:',best)
    
    m = LogisticRegression(C=best['C'],penalty=best['penalty'],solver='liblinear')
    m.fit(x_train,y_train)
    train_acc = round(m.score(x_train,y_train),5)
    test_acc = round(m.score(x_test,y_test),5)
    print('acc: Train = {}  Test = {}'.format(train_acc,test_acc))
    
def SVMClassifierALG(x_train,x_test,y_train,y_test):
    params = {'C':[0.1,1.0,10],'kernel':['rbf','sigmoid','linear']}
    clf = GridSearchCV(estimator=SVC(),param_grid=params)
    clf.fit(x_train,y_train)
    best = clf.best_params_
    print('SVM:',best)
    
    m = SVC(C=best['C'],kernel=best['kernel'])
    m.fit(x_train,y_train)
    train_acc = round(m.score(x_train,y_train),5)
    test_acc = round(m.score(x_test,y_test),5)
    print('acc: Train = {}  Test = {}'.format(train_acc,test_acc))

# tree models
def TreeModelClassifierALG(x_train,x_test,y_train,y_test):
    models = {'DT':DecisionTreeClassifier(criterion='gini',splitter='best',
                                          max_depth=None,min_samples_split=2,
                                          max_features=None,max_leaf_nodes=None,
                                          random_state=1),
              'Ada':AdaBoostClassifier(DecisionTreeClassifier(random_state=1),n_estimators=50,
                                       learning_rate=0.1),
              'RF':RandomForestClassifier(n_estimators=50,criterion='gini',
                                          max_depth=None,max_features='sqrt', #'auto' was removed in sklearn 1.3
                                          min_samples_split=2,min_samples_leaf=1,
                                          random_state=1),
              'GBDT':GradientBoostingClassifier(n_estimators=100,learning_rate=0.1,
                                                subsample=0.9,max_features=None,
                                                min_samples_split=2,min_samples_leaf=1,
                                                random_state=1),
              'LGBM':LGBMClassifier(boosting_type='gbdt',num_leaves=30,
                                    n_estimators=100,learning_rate=0.1,
                                    min_child_samples=10,max_depth=-1),
              'XGB':XGBClassifier(max_depth=3,learning_rate=0.1,
                                  n_estimators=100,booster='gbtree',
                                  objective='binary:logistic',subsample=1,
                                  reg_alpha=0,reg_lambda=1)
              }
    print('======================================================')
    for name,clf in models.items():
        clf.fit(x_train,y_train)
        train_acc = round(clf.score(x_train,y_train),5)
        test_acc = round(clf.score(x_test,y_test),5)
        print('model: {}\nacc: Train = {}  Test = {}'.format(name,train_acc,test_acc))
    print('======================================================')
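
A single 80/20 split can be noisy. As a sketch (not in the original script), cross_val_score gives a more stable comparison for any of the classifiers above, e.g. cvScore(RandomForestClassifier(random_state=1),x,y):

# Sketch: 5-fold cross-validated accuracy for a more stable model comparison.
from sklearn.model_selection import cross_val_score

def cvScore(clf,x,y):
    scores = cross_val_score(clf,x,y,cv=5)
    print('cv acc: mean = {}  std = {}'.format(round(scores.mean(),5),round(scores.std(),5)))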

#build the scaled feature matrix and an 80/20 train/test split
x,y = obtainFormatData()
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)

KNNClassifierALG(x_train,x_test,y_train,y_test)
LRClassifierALG(x_train,x_test,y_train,y_test)
SVMClassifierALG(x_train,x_test,y_train,y_test)
TreeModelClassifierALG(x_train,x_test,y_train,y_test)
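
The feature-selection and feature-reconstruction helpers above are defined but never invoked; to exercise that part of the pipeline as well, call them on the same data:

#optional: run the feature-selection and dimensionality-reduction steps too
treeModelFS(x,y)
LRModelFS(x,y)
PCAdescendingDimension(x,y)
LDAdescendingDimension(x,y)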
