Electricity Fee Sensitivity Data Mining, Part 3: Building the Low-Sensitivity User Model

Electricity Fee Sensitivity Data Mining, Part 1: Data Processing and Feature Engineering
Electricity Fee Sensitivity Data Mining, Part 2: Text Feature Construction

6. Building the XGBoost Model

6.1 Loading the Features

import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csc_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

# Load the statistical features (Part 1) and the text features (Part 2),
# then join them on the customer ID
df = pickle.load(open(r'..\電費\statistical_features_1.pkl', 'rb'))
text = pickle.load(open(r'..\電費\text_features_1.pkl', 'rb'))
df = df.merge(text, on='CUST_NO', how='left')
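
One caveat with the left join: any customer missing from the text table ends up with NaN in the merged columns, and a NaN in contents would break the TF-IDF fit later. A hedged safeguard (a no-op if Part 2 already covered every customer):

# Assumption: fill missing texts with the empty string so TfidfVectorizer
# does not choke on NaN values in df.contents
df['contents'] = df['contents'].fillna('')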

# label == -1 marks the unlabeled test customers
train = df.loc[df.label != -1]
test = df.loc[df.label == -1]
print('Train set:', train.shape[0])
print('Positive samples:', train.loc[train.label == 1].shape[0])
print('Negative samples:', train.loc[train.label == 0].shape[0])
print('Test set:', test.shape[0])

Train set: 400075
Positive samples: 13139
Negative samples: 386936
Test set: 326167
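
Positives make up only about 3.3% of the training set. The parameters below do not reweight the classes, but XGBoost's scale_pos_weight is the usual knob if one wanted to; a minimal sketch (this parameter is my suggestion, not part of the original setup):

neg, pos = (train.label == 0).sum(), (train.label == 1).sum()
print('Positive rate: {:.2%}'.format(pos / (pos + neg)))  # ~3.28%
# Common heuristic, not used in the original params:
# scale_pos_weight = neg / pos  (~29 here)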

6.2 Building TF-IDF on the Selected Words and Assembling the Model Input

x_data = train.copy()
x_val = test.copy()
# Shuffle the training rows once, reproducibly
x_data = x_data.sample(frac=1, random_state=42).reset_index(drop=True)

delete_columns = ['CUST_NO', 'label', 'contents']
# Dense statistical features -> sparse column format, for the hstack below
X_train_1 = csc_matrix(x_data.drop(delete_columns, axis=1).values)
X_val_1 = csc_matrix(x_val.drop(delete_columns, axis=1).values)

y_train = x_data.label.values
y_val = x_val.label.values  # all -1: the test set is unlabeled
featurenames = list(x_data.drop(delete_columns, axis=1).columns)

select_words = pickle.load(open(r'..\電費\single_select_words.pkl', 'rb'))
# Restrict the vectorizer to the vocabulary selected in Part 2
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, sublinear_tf=True,
                        smooth_idf=False, use_idf=False, vocabulary=select_words)
tfidf.fit(x_data.contents)
word_names = list(tfidf.get_feature_names_out())  # get_feature_names() on scikit-learn < 1.0
X_train_2 = tfidf.transform(x_data.contents)
X_val_2 = tfidf.transform(x_val.contents)
print('Text features: {} dims.'.format(len(word_names)))
statistic_feature = featurenames.copy()
print('Other features: {} dims.'.format(len(statistic_feature)))
featurenames.extend(word_names)

# Stack the statistical and text features side by side
X_train = hstack((X_train_1, X_train_2)).tocsc()
X_val = hstack((X_val_1, X_val_2)).tocsc()
print('Number of features:', X_train.shape[1])

Text features: 341 dims.
Other features: 85 dims.
Number of features: 426
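
Note that with use_idf=False the "TF-IDF" above is really a sublinear term-frequency encoding: each entry is 1 + ln(tf), L2-normalized per row, with no IDF factor. A toy check on a made-up two-document corpus (min_df and the fixed vocabulary are dropped here so the tiny corpus survives):

toy = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True,
                      smooth_idf=False, use_idf=False)
m = toy.fit_transform(['pay pay bill', 'bill late'])
print(m.toarray())  # rows are 1 + ln(tf) weights, L2-normalized, no IDF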

6.3 XGBoost

bagging = []
for i in range(1, 4):
    print('Group:', i)
    dtrain = xgb.DMatrix(X_train, y_train, feature_names=featurenames)
    dval = xgb.DMatrix(X_val, feature_names=featurenames)

    # Only the seed changes between the three runs; everything else is fixed
    params = {'objective': 'binary:logistic', 'eta': 0.1, 'max_depth': 12,
              'booster': 'gbtree', 'eval_metric': 'error', 'subsample': 0.8,
              'min_child_weight': 3, 'gamma': 0.2, 'lambda': 300,
              'colsample_bytree': 1, 'seed': i}  # add 'silent': 1 on xgboost < 1.0
    # Note: with only dtrain in the watchlist, early stopping monitors
    # training error and will rarely trigger
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, 2000, evals=watchlist,
                      early_stopping_rounds=50, verbose_eval=100)

    print('Predicting...')
    # On xgboost >= 2.0, ntree_limit is removed; use instead:
    # y_prob = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    y_prob = model.predict(dval, ntree_limit=model.best_ntree_limit)
    bagging.append(y_prob)
    print('--------------------------------')
print('Done!')
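
Because the watchlist above contains only the training set, early stopping tracks training error and adds little. A minimal sketch of early stopping on a held-out fold instead (the 10% stratified split via train_test_split is my addition, not the original pipeline):

from sklearn.model_selection import train_test_split

# Hypothetical held-out fold for early stopping
X_tr, X_va, y_tr, y_va = train_test_split(X_train, y_train, test_size=0.1,
                                          stratify=y_train, random_state=42)
dtr = xgb.DMatrix(X_tr, y_tr, feature_names=featurenames)
dva = xgb.DMatrix(X_va, y_va, feature_names=featurenames)
# Early stopping now monitors the last entry in evals, i.e. the held-out fold
model = xgb.train(params, dtr, 2000, evals=[(dtr, 'train'), (dva, 'val')],
                  early_stopping_rounds=50, verbose_eval=100)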

def threshold(y, t):
    """Binarize probabilities: 1 if >= t, else 0."""
    z = np.copy(y)
    z[z >= t] = 1
    z[z < t] = 0
    return z

t = 0.5
pres = []
for i in bagging:
    pres.append(threshold(i, t))
    
# Majority vote across the three seeds: one row per customer
pres = np.array(pres).T.astype('int64')
result = []
for line in pres:
    result.append(np.bincount(line).argmax())
    
myout = test[['CUST_NO']].copy()
myout['pre'] = result
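
Majority voting on hard labels discards the probability scores. An alternative worth trying (my suggestion, not the original approach) is soft voting: average the three probability vectors and apply the threshold once.

# Soft voting (hypothetical alternative): average seed-level probabilities
avg_prob = np.mean(bagging, axis=0)
soft_result = threshold(avg_prob, t).astype('int64')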


7. Saving the Final Predictions

import os
if not os.path.isdir(r'..\數據挖掘\result'):
    os.makedirs(r'..\數據挖掘\result')
# Submit only the customers predicted as sensitive; header=False keeps the
# file header-free (older pandas wrote a Series without a header by default)
myout.loc[myout.pre == 1, 'CUST_NO'].to_csv(r'..\數據挖掘\result\A.csv',
                                            index=False, header=False)
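
As a final sanity check (purely illustrative, not part of the original pipeline), re-read the submission and count how many customers were flagged:

submitted = pd.read_csv(r'..\數據挖掘\result\A.csv', header=None)
print('Predicted sensitive customers:', submitted.shape[0])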
