不多廢話,直接上代碼
# -*- coding: utf-8 -*-
# @Author: Phill
# @Date: 2019-03-14 14:43:10
# @Last Modified by: Phill
# @Last Modified time: 2019-03-14 14:57:02
import numpy as np
import pandas as pd
import xgboost as xgb
def readData(fileName,sheetName):
    """Load one worksheet of an Excel workbook as a NumPy array.

    Args:
        fileName: Path of the Excel workbook to read.
        sheetName: Name of the worksheet to load.

    Returns:
        The worksheet contents converted with ``np.array``. No row is
        treated as a header (``header=None``), so the first row of the
        sheet is returned as data.
    """
    # pandas spells this keyword ``sheet_name``; the previous ``sheetName``
    # and ``index`` keywords are not read_excel parameters.
    data = pd.read_excel(fileName, sheet_name=sheetName, header=None)
    return np.array(data)
def getXY(data):
    """Split a sample table into a feature matrix and a label column.

    The last column of ``data`` is taken as the label; every other column
    is a feature.

    Args:
        data: Array with one sample per row.

    Returns:
        A tuple ``(x, y)``. For a 2-D input of shape ``(n, d)``, ``x`` holds
        the first ``d - 1`` columns and ``y`` is the last column as an
        ``(n, 1)`` column vector.
    """
    columns = data.T
    features = columns[:-1].T
    # Wrap the label row in a list first so the transpose yields a column.
    labels = np.array([columns[-1]]).T
    return features, labels
def trainFunction(x_data,y_data,params,numRounds,modelPath):
    """Train an XGBoost model and save it to disk.

    Args:
        x_data: Training features, passed to ``xgb.DMatrix``.
        y_data: Training labels, passed as the ``label`` of the DMatrix.
        params: XGBoost parameter dict forwarded to ``xgb.train``.
        numRounds: Number of boosting rounds.
        modelPath: Destination path for the saved model.

    Returns:
        None. The trained model is only written to ``modelPath``.
    """
    dtrain = xgb.DMatrix(x_data, label=y_data)
    booster = xgb.train(params, dtrain, num_boost_round=numRounds)
    booster.save_model(modelPath)
def predictFunction(x_data,params,modelPath):
    """Load a saved XGBoost model and predict on ``x_data``.

    Args:
        x_data: Feature data, passed to ``xgb.DMatrix``.
        params: Parameter dict used to construct the ``xgb.Booster``.
        modelPath: Path of the model file to load.

    Returns:
        The predictions wrapped in ``np.array``.
    """
    booster = xgb.Booster(params)
    booster.load_model(modelPath)
    dtest = xgb.DMatrix(x_data)
    predictions = booster.predict(dtest)
    return np.array(predictions)
def main():
    """Train on Sheet1 of test.xlsx, then print predictions for Sheet2.

    The model is saved to, and reloaded from, the path ``.\\model``.
    """
    train = readData("test.xlsx", "Sheet1")
    test = readData("test.xlsx", "Sheet2")
    trainX, trainY = getXY(train)
    # Only the features of the test sheet are used; its labels are ignored.
    testX, _ = getXY(test)
    # NOTE(review): newer XGBoost releases rename "reg:linear" to
    # "reg:squarederror" and replace "silent" with "verbosity" - confirm
    # against the installed version before changing them.
    params = {
        "nthread": 4,  # was misspelled "nthreed", which XGBoost does not recognise
        "max_depth": 5,
        "eta": 0.05,
        "silent": 0,
        "gamma": 0.1,
        "lambda": 1,
        "objective": "reg:linear",
        "booster": "gbtree",
    }
    trainFunction(trainX, trainY, params, 100, ".\\model")
    print(predictFunction(testX, {"nthread": 4}, ".\\model"))


if __name__ == '__main__':
    main()
其中參數配置需要注意,參數的具體說明如下:
1、訓練參數
“y_data”:label數據,Array類型
“numRounds”:迭代次數
“params”:JSON格式,其中nthread表示線程數(注意正確拼寫爲nthread);max_depth表示樹的最大深度;eta類似學習率;silent可以設置爲0或1,0表示輸出運行過程,1表示不輸出;gamma表示決策樹剪枝的力度,一般設爲0.1或0.2;lambda表示L2正則化項的權重(L1正則化由alpha參數控制);objective表示模型方式,二分類設置'binary:logistic',迴歸設置'reg:linear',多分類設置爲'multi:softmax';booster表示模型結構,可以設置爲gbtree;還有很多參數,具體請查看xgboost document。
2、預測模式
“params”:JSON格式,僅需設置nthread線程數即可
明眼警告:腳本名稱不要命名爲xgboost,會出錯誤的,因爲腳本頂端有一個import xgboost,會導致自己import自己