The Advertising.csv Dataset: Regression Trees and XGBoost Regression

Use a regression tree and XGBoost regression, respectively, to predict on the Advertising.csv dataset from Lab 3, and compare the results with the traditional linear regression approach (a baseline linear-regression sketch follows the requirements list below).

Requirements:

  1. Standardize the data first.
  2. Split the data into a 70% training set and a 30% test set.
  3. Use mean squared error (MSE) to evaluate prediction quality.
  4. For XGBoost, use cross-validation to find the optimal value of n_estimators, searching the range [100, 1000].
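
The post itself never shows the linear regression baseline it compares against, so here is a minimal sketch of it, assuming the same Advertising.csv columns and the same 70/30 split with random_state=1 used in the scripts below:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

data = pd.read_csv('Advertising.csv')
X = data[['TV', 'Radio', 'Newspaper']]
Y = data['Sales']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

# Standardize the features (requirement 1), fitting on the training set only
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print("Linear regression R^2 score:", lr.score(x_test, y_test))
print("Linear regression MSE:", mean_squared_error(y_test, y_pred))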

 

Regression tree:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

train_path = r'Advertising.csv'

def read_train_file(path):
    data = pd.read_csv(path)
    return data

# Regression tree
def RegressionTree(data):
    X = data[['TV', 'Radio', 'Newspaper']]
    Y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

    dt_reg = DecisionTreeRegressor(max_depth=5)
    dt_reg.fit(x_train, y_train)
    # score() returns the R^2 coefficient of determination, not a percentage
    score = dt_reg.score(x_test, y_test)
    print("Regression tree R^2 score:", score)
    y_pred = dt_reg.predict(x_test)
    print("Regression tree MSE:", mean_squared_error(y_test, y_pred))

if __name__ == '__main__':
    print("read train file.....")
    data = read_train_file(train_path)
    RegressionTree(data)
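
Note that this script skips requirement 1. That is defensible: decision-tree splits depend only on the ordering of feature values, so standardization does not change a tree's predictions. To satisfy the requirement literally anyway, you can bundle the scaler and the tree in a scikit-learn Pipeline; a minimal sketch, assuming the same data layout as above:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

data = pd.read_csv('Advertising.csv')
X = data[['TV', 'Radio', 'Newspaper']]
Y = data['Sales']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

# The scaler is fit inside the pipeline on the training set only, so no
# test-set information leaks into the transform
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('tree', DecisionTreeRegressor(max_depth=5)),
])
pipe.fit(x_train, y_train)
print("Pipeline MSE:", mean_squared_error(y_test, pipe.predict(x_test)))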

XGBoost regression:

First, find the optimal value of n_estimators via cross-validation (parameter tuning):

import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

train_path = r'Advertising.csv'

data = pd.read_csv(train_path)
X = data[['TV', 'Radio', 'Newspaper']]
Y = data['Sales']

def modelfit(alg, dtrain, dlabel, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    def rmse(predictions, targets):
        return np.sqrt(((predictions - targets) ** 2).mean())
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=dlabel)
        # xgb.cv stops adding boosting rounds once the CV RMSE has not improved
        # for early_stopping_rounds rounds; the returned frame is truncated at
        # the best round, so its row count is the tuned n_estimators
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,
                          show_stdv=True, verbose_eval=True)
        print("n_estimators:", cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        # Refit the algorithm on the full data with the tuned round count
        alg.fit(dtrain, dlabel)
        # Optionally report the training-set RMSE:
        #preds = alg.predict(dtrain)
        #print("\nModel Report:", rmse(preds, dlabel))

xgb1 = XGBRegressor(
        learning_rate=0.1,
        n_estimators=1000,  # upper bound of the [100, 1000] search range
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        n_jobs=4,
        scale_pos_weight=1,
        random_state=27)
modelfit(xgb1, X, Y)
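
Thanks to early stopping, this single xgb.cv run effectively sweeps the whole [100, 1000] range: rounds keep being added until the CV RMSE stops improving, and the number of surviving rounds becomes the tuned n_estimators (187 on this dataset, per the comment in the final script). A more literal reading of requirement 4 is an explicit grid search over the range; here is a sketch using scikit-learn's GridSearchCV, where the step size of 100 and the fixed values of the other hyperparameters are my assumptions:

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': list(range(100, 1001, 100))}
search = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.1, max_depth=5, subsample=0.8,
                           colsample_bytree=0.8, objective='reg:squarederror',
                           random_state=27),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # requirement 3: judge models by MSE
    cv=5)
search.fit(X, Y)  # X, Y as defined in the tuning script above
print("best n_estimators:", search.best_params_['n_estimators'])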

 

XGBoost model:

import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

train_path = r'Advertising.csv'

def read_train_file(path):
    data = pd.read_csv(path)
    return data

def xgboost(data):
    X = data[['TV', 'Radio', 'Newspaper']]
    Y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
    # Standardize the features, fitting the scaler on the training set only
    # so that no test-set information leaks into the transform
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    # n_estimators=187 is the optimum found by the cross-validation step above;
    # the objective matches the one used during tuning
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=187,
                             objective='reg:squarederror')
    model.fit(x_train, y_train)
    # Evaluate on the test set; score() returns R^2, not a percentage
    score = model.score(x_test, y_test)
    print("xgboost R^2 score:", score)
    y_pred = model.predict(x_test)
    print("xgboost MSE:", mean_squared_error(y_test, y_pred))


if __name__ == '__main__':
    print("read train file.....")
    data = read_train_file(train_path)
    xgboost(data)
