分別使用迴歸樹與XGBoost迴歸,預測實驗三中給出的Advertising.csv數據集,並與傳統線性迴歸預測方法進行比較。
具體要求:
- 首先進行數據標準化。
- 測試集和訓練集比例分別爲30%和70%。
- 使用均方誤差來評價預測的好壞程度。
- 對於XGBoost請嘗試使用交叉驗證找到n_estimators的最優參數值。n_estimators的取值範圍爲[100-1000]。
迴歸樹:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
# Location of the advertising data set.
train_path = r'Advertising.csv'


def read_train_file(path):
    """Read a CSV file and return it as a pandas DataFrame."""
    return pd.read_csv(path)
#迴歸樹
def RegressionTree(data):
    """Fit a depth-5 decision-tree regressor on the advertising features
    and report test-set R² and mean squared error.

    Uses a 70/30 train/test split (random_state=1). Features are
    standardised as the exercise requires; note that tree splits are
    scale-invariant, so standardisation keeps the pipeline comparable
    with the other models without changing the tree's predictions.

    Parameters:
        data: DataFrame with columns 'TV', 'Radio', 'Newspaper', 'Sales'.
    """
    X = data[['TV', 'Radio', 'Newspaper']]
    Y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
    # Fit the scaler on the training set only so no test-set statistics
    # leak into preprocessing, then apply it to both splits.
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    dt_reg = DecisionTreeRegressor(max_depth=5)
    dt_reg.fit(x_train, y_train)
    # score() returns the R² coefficient of determination, not a
    # percentage — the original printed it with a misleading "%".
    print("迴歸樹R²分數: ", dt_reg.score(x_test, y_test))
    y_pred = dt_reg.predict(x_test)
    print("迴歸樹均方誤差:", mean_squared_error(y_test, y_pred))
def _main():
    """Script entry point: load the data and run the tree regression."""
    print("read train file.....")
    RegressionTree(read_train_file(train_path))


if __name__ == '__main__':
    _main()
XGBoost迴歸:
通過交叉驗證找到n_estimators的最優參數值(調參):
import pandas as pd
from pylab import *
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# Location of the advertising data set used for the tuning run.
train_path = r'Advertising.csv'
data = pd.read_csv(train_path)
# Feature matrix and regression target.
X = data[['TV', 'Radio', 'Newspaper']]
Y = data['Sales']
# NOTE(review): this split is never used below — modelfit() cross-validates
# on the full X/Y. Confirm whether a held-out evaluation was intended here.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
def modelfit(alg, dtrain, dlable, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune n_estimators for an XGBoost regressor via cross-validation,
    then fit it on the full data.

    Parameters:
        alg: an XGBRegressor whose n_estimators acts as the upper bound
            on boosting rounds.
        dtrain: feature matrix.
        dlable: target vector (parameter name kept for caller compatibility).
        useTrainCV: when True, run xgb.cv with early stopping and set
            alg's n_estimators to the best round count found.
        cv_folds: number of CV folds.
        early_stopping_rounds: stop once CV RMSE has not improved for
            this many rounds.
    """
    def rmse(predictions, targets):
        # Root-mean-squared-error helper (kept for ad-hoc model reports).
        return np.sqrt(((predictions - targets) ** 2).mean())

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=dlable)
        # The xgb.callback.print_evaluation / early_stop callbacks used
        # previously were removed from XGBoost; verbose_eval/show_stdv and
        # the early_stopping_rounds argument cover the same functionality.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=True, show_stdv=True)
        # With early stopping, the number of result rows equals the best
        # boosting-round count.
        print("n_estimators:", cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the full data with the tuned round count.
    alg.fit(dtrain, dlable, eval_metric='rmse')
# Base regressor for tuning; n_estimators=1000 is only an upper bound that
# modelfit() trims via cross-validated early stopping.
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # 'gpu:reg:linear' is not a valid objective in current XGBoost;
    # 'reg:squarederror' is the modern squared-error objective (GPU
    # training is selected via tree_method, not the objective string).
    objective='reg:squarederror',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, X, Y)
XGBoost模型:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# Location of the advertising data set.
train_path = r'Advertising.csv'


def read_train_file(path):
    """Return the CSV file at *path* loaded into a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame
def xgboost(data):
    """Train an XGBoost regressor on the advertising data and report
    test-set R² and mean squared error.

    Uses a 70/30 train/test split (random_state=1) with standardised
    features; n_estimators=187 is the optimum found by the
    cross-validation tuning run.

    Parameters:
        data: DataFrame with columns 'TV', 'Radio', 'Newspaper', 'Sales'.
    """
    X = data[['TV', 'Radio', 'Newspaper']]
    Y = data['Sales']
    # Split BEFORE scaling so the scaler is fit on training data only —
    # the original fit it on the full data, leaking test-set statistics
    # into preprocessing.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    # 'reg:squarederror' matches the exercise's MSE evaluation criterion
    # ('reg:gamma' optimises gamma deviance instead); the 'silent' kwarg
    # was removed from the XGBoost API in favour of 'verbosity'.
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=187,
                             verbosity=1, objective='reg:squarederror')
    model.fit(x_train, y_train)
    # score() returns the R² coefficient of determination, not a
    # percentage — the original printed it with a misleading "%".
    print("xgboost R²分數: ", model.score(x_test, y_test))
    y_pred = model.predict(x_test)
    print("xgboost均方誤差:", mean_squared_error(y_test, y_pred))
def _main():
    """Script entry point: load the data and run the XGBoost regression."""
    print("read train file.....")
    frame = read_train_file(train_path)
    xgboost(frame)


if __name__ == '__main__':
    _main()