Machine Learning Regression Algorithms

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing                     # preprocessing
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.model_selection import GridSearchCV      # grid search

# feature selection
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.svm import SVR

from sklearn.tree import DecisionTreeRegressor          # DT
from sklearn.ensemble import AdaBoostRegressor          # Ada
from sklearn.ensemble import RandomForestRegressor      # RF
from sklearn.ensemble import GradientBoostingRegressor  # GBDT
from lightgbm import LGBMRegressor                      # LGBM
from xgboost.sklearn import XGBRegressor                # XGB

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt  # used by treeModelFS to render the importance bar chart

import warnings
warnings.filterwarnings('ignore')

def obtainFormatData():
    # note: load_boston was removed in scikit-learn 1.2, so this requires scikit-learn < 1.2
    dataset = datasets.load_boston()
    featuresMatrix, y = dataset.data, dataset.target
    print('======================================================')
    print('samples = {}    features = {}'.
          format(featuresMatrix.shape[0], featuresMatrix.shape[1]))
    print('======================================================')

    # min-max scaling: X = (X - min(X)) / (max(X) - min(X))
    x = preprocessing.MinMaxScaler().fit_transform(featuresMatrix)

    return x, y
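
Since load_boston was retired and removed in scikit-learn 1.2, readers on a newer release can swap in the bundled California housing dataset. A minimal drop-in sketch (the function name obtainFormatDataCalifornia is assumed here for illustration, not part of the original post):

def obtainFormatDataCalifornia():
    # fetch_california_housing downloads on first use, then is cached locally
    dataset = datasets.fetch_california_housing()
    featuresMatrix, y = dataset.data, dataset.target
    # same min-max scaling as above
    x = preprocessing.MinMaxScaler().fit_transform(featuresMatrix)
    return x, y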

# feature selection
def lassoFS(x, y):
    print('FS based on L1 (Lasso):')
    for a in [0.1, 0.2, 0.3]:
        clf = Lasso(alpha=a, random_state=1)
        clf.fit(x, y)
        # L1 regularization shrinks some coefficients to exactly zero; keep the rest
        index = list(np.where(clf.coef_ != 0)[0])
        print('alpha = {}  selected feature indices = {}'.format(a, index))

def ridgeFS(x, y):
    print('FS based on L2 (Ridge):')
    for a in [0.1, 0.5, 0.8]:
        clf = Ridge(alpha=a, random_state=1)
        clf.fit(x, y)
        # ridge never zeroes coefficients, so keep those above a threshold;
        # use the absolute value so strongly negative coefficients count too
        index = list(np.where(np.abs(clf.coef_) >= 0.1)[0])
        print('alpha = {}  selected feature indices = {}'.format(a, index))
def treeModelFS(x, y):
    # criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0
    clf = ExtraTreesRegressor(n_estimators=10, criterion='squared_error',
                              max_depth=None, random_state=1)
    clf.fit(x, y)
    scores = pd.Series(clf.feature_importances_).sort_values(ascending=False)
    scores.plot.bar(rot=0, figsize=(8, 3), title='importance of features based on tree')
    plt.show()
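
The three selectors above are defined but never called by the driver code at the bottom of the post; a minimal usage sketch, assuming the scaled data from obtainFormatData, would be:

x, y = obtainFormatData()
lassoFS(x, y)       # L1: drops features with zero coefficients
ridgeFS(x, y)       # L2: keeps features with large coefficients
treeModelFS(x, y)   # plots tree-based feature importances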

# hyper-parameter search
def SVRRegressionALG(x_train, x_test, y_train, y_test):
    params = {'C': [0.01, 0.1, 1.0, 10],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'epsilon': [0.01, 0.05, 0.1]}
    clf = GridSearchCV(estimator=SVR(), param_grid=params)
    clf.fit(x_train, y_train)
    best = clf.best_params_
    print('SVR best params:', best)

    # refit on the full training set with the best parameters
    m = SVR(C=best['C'], kernel=best['kernel'], epsilon=best['epsilon'])
    m.fit(x_train, y_train)
    predict_train = m.predict(x_train)
    predict_test = m.predict(x_test)
    train_r2 = round(r2_score(y_train, predict_train), 4)
    test_r2 = round(r2_score(y_test, predict_test), 4)
    print('R2: Train = {} Test = {}'.format(train_r2, test_r2))
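
Note: GridSearchCV refits the winning configuration on the full training set by default (refit=True), so the manual refit above can be shortened:

m = clf.best_estimator_   # already refitted by GridSearchCV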

# tree-based models
def TreeModelRegressionALG(x_train, x_test, y_train, y_test):
    # 'mse' and 'ls' were renamed to 'squared_error' in scikit-learn 1.0,
    # and 'reg:linear' to 'reg:squarederror' in XGBoost
    models = {'DT': DecisionTreeRegressor(criterion='squared_error', splitter='best',
                                          max_depth=None, min_samples_split=2,
                                          max_features=None, max_leaf_nodes=None,
                                          random_state=1),
              'Ada': AdaBoostRegressor(DecisionTreeRegressor(random_state=1),
                                       n_estimators=50, learning_rate=0.1),
              'RF': RandomForestRegressor(n_estimators=50, criterion='squared_error',
                                          max_depth=None, max_features=1.0,  # 'auto' was removed in scikit-learn 1.3
                                          min_samples_split=2, min_samples_leaf=1,
                                          random_state=1),
              'GBDT': GradientBoostingRegressor(loss='squared_error', n_estimators=100,
                                                learning_rate=0.1, subsample=0.9,
                                                max_features=None, min_samples_split=2,
                                                min_samples_leaf=1, random_state=1),
              'LGBM': LGBMRegressor(boosting_type='gbdt', num_leaves=30,
                                    n_estimators=100, learning_rate=0.1,
                                    objective='regression', min_child_samples=20,
                                    max_depth=-1),
              'XGB': XGBRegressor(max_depth=3, learning_rate=0.1,
                                  n_estimators=100, booster='gbtree',
                                  subsample=1, objective='reg:squarederror',
                                  reg_alpha=0, reg_lambda=1)
              }
    print('======================================================')
    for name, clf in models.items():
        clf.fit(x_train, y_train)
        predict_train = clf.predict(x_train)
        predict_test = clf.predict(x_test)
        train_r2 = round(r2_score(y_train, predict_train), 4)
        test_r2 = round(r2_score(y_test, predict_test), 4)
        print('model: {}\nR2: Train = {} Test = {}'.format(name, train_r2, test_r2))
    print('======================================================')

x,y = obtainFormatData()

x_train,x_test,y_train,y_test = train_test_split(x,y,
                                    test_size=0.2,random_state=1)
SVRRegressionALG(x_train,x_test,y_train,y_test)
TreeModelRegressionALG(x_train,x_test,y_train,y_test)
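
The tree models above run with hand-picked parameters; the same GridSearchCV pattern used for SVR applies to them as well. A minimal sketch for GradientBoostingRegressor (this parameter grid is an assumption for illustration, not from the original post):

params = {'n_estimators': [50, 100, 200],
          'learning_rate': [0.05, 0.1, 0.2],
          'max_depth': [2, 3, 4]}
gs = GridSearchCV(estimator=GradientBoostingRegressor(random_state=1), param_grid=params)
gs.fit(x_train, y_train)
print('GBDT best params:', gs.best_params_)
print('GBDT test R2:', round(r2_score(y_test, gs.best_estimator_.predict(x_test)), 4))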
