sklearn的模型訓練與預測

sklearn的模型訓練與預測

sklearn是強大的Python機器學習工具,支持豐富的機器學習算法和數據預處理功能,在學術界和企業中應用廣泛。下面是sklearn的代碼編寫流程和各種算法使用示例(以分類爲例)。

分類任務流程三步走

  1. 創建模型對象
  2. 訓練
  3. 預測與性能評價

xgboost算法分類

'''
 * xgboost分類
'''

from classifier import LogRegClassifier
import numpy as np
import json
import math
import time
import os
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics


def main():
    """Train an XGBoost classifier and print its hold-out evaluation.

    Follows the three-step flow of the article: create the model object,
    train it, then predict and report performance. Prints the
    classification report, the confusion matrix and the elapsed wall time.
    """
    time_begin = time.time()
    # Raw data loading is omitted in this example: `d` must be provided by
    # that omitted code and expose `.data` (features) and `.labels` (targets).
    data = d.data
    labels = d.labels
    # Standardize the features.
    # NOTE(review): the scaler is fitted on the full data set before the
    # split, so test-set statistics influence the training features.
    from sklearn.preprocessing import StandardScaler
    data = StandardScaler().fit_transform(data)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

    # 1. Create the model object
    from xgboost import XGBClassifier
    clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,  # upper bound on the number of boosted trees
                        max_depth=6,  # maximum tree depth
                        min_child_weight=1,  # minimum sum of instance weight in a leaf
                        gamma=0.,  # coefficient of the leaf-count penalty term
                        subsample=0.8,  # sample 80% of the rows for each tree
                        colsample_bytree=0.8,  # sample 80% of the features for each tree
                        objective='multi:softmax',  # loss function
                        scale_pos_weight=1,  # class-imbalance weighting
                        random_state=27,  # random seed
                        # Evaluation metric and early stopping are configured
                        # on the estimator: recent XGBoost releases no longer
                        # accept these two arguments in fit().
                        eval_metric="mlogloss",
                        early_stopping_rounds=10,
                        )

    # 2. Train; the hold-out split is monitored for early stopping.
    clf = clf.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=True)

    # 3. Predict and evaluate
    np.set_printoptions(threshold=np.inf)  # print arrays in full, never truncated
    predicted = clf.predict(x_test)
    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    time_end = time.time()
    print("total time is ", time_end-time_begin)


# Program entry point: run main() only when executed as a script.
if __name__ == "__main__":
    main()

隨機森林算法分類

n_estimators是隨機森林的一個重要調優參數,表示樹的個數。

'''
 * 隨機森林分類
'''

from classifier import LogRegClassifier
import numpy as np
import json
import math
import time
import os
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics


def main():
    """Train a random-forest classifier and print its hold-out evaluation.

    Follows the three-step flow of the article: create the model object,
    train it, then predict and report performance. Prints the
    classification report, the confusion matrix and the elapsed wall time.
    """
    time_begin = time.time()
    # Raw data loading is omitted in this example: `d` must be provided by
    # that omitted code and expose `.data` (features) and `.labels` (targets).
    data = d.data
    labels = d.labels
    # Standardize the features.
    # NOTE(review): the scaler is fitted on the full data set before the
    # split, so test-set statistics influence the training features.
    from sklearn.preprocessing import StandardScaler
    data = StandardScaler().fit_transform(data)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

    # 1. Create the model object
    # Import the estimator explicitly: a bare `import sklearn` does not
    # guarantee that the `sklearn.ensemble` submodule is loaded.
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100)  # number of trees in the forest

    # 2. Train
    # RandomForestClassifier.fit() takes only the training data (plus an
    # optional sample_weight); the XGBoost-specific eval_set / eval_metric /
    # early_stopping_rounds / verbose keywords would raise TypeError here.
    clf = clf.fit(x_train, y_train)

    # 3. Predict and evaluate
    np.set_printoptions(threshold=np.inf)  # print arrays in full, never truncated
    predicted = clf.predict(x_test)
    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    time_end = time.time()
    print("total time is ", time_end-time_begin)


# Program entry point: run main() only when executed as a script.
if __name__ == "__main__":
    main()

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章