電費敏感數據挖掘一: 數據處理與特徵工程
電費敏感數據挖掘二: 文本特徵構造
六. 構建XGBoost模型
6.1 讀取特徵
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csc_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from scipy.sparse import hstack
# Load the precomputed statistical and text feature tables, join them on the
# customer id, and split into labelled training rows (label != -1) and
# unlabelled test rows (label == -1).
# Use context managers so the pickle file handles are closed deterministically
# (the original `pickle.load(open(...))` leaked them).
# NOTE: pickle.load assumes these are trusted local artifacts, not external input.
with open(r'..\電費\statistical_features_1.pkl', 'rb') as f:
    df = pickle.load(f)
with open(r'..\電費\text_features_1.pkl', 'rb') as f:
    text = pickle.load(f)
df = df.merge(text, on = 'CUST_NO', how = 'left')
train = df.loc[df.label != -1]
test = df.loc[df.label == -1]
print('訓練集:',train.shape[0])
print('正樣本:',train.loc[train.label == 1].shape[0])
print('負樣本:',train.loc[train.label == 0].shape[0])
print('測試集:',test.shape[0])
訓練集: 400075
正樣本: 13139
負樣本: 386936
測試集: 326167
6.2 基於選擇的詞來創建tf-idf,構建模型輸入數據
# Build the model input matrices: sparse statistical features horizontally
# stacked with a TF-IDF bag-of-words restricted to a pre-selected vocabulary.
x_data = train.copy()
x_val = test.copy()
# Shuffle the training rows once, deterministically, so downstream steps see
# a fixed row order.
x_data = x_data.sample(frac = 1, random_state = 42).reset_index(drop = True)
delete_columns = ['CUST_NO', 'label', 'contents']
# Drop the id/label/raw-text columns once and reuse the result
# (the original recomputed this drop three times).
train_stats = x_data.drop(delete_columns, axis = 1)
val_stats = x_val.drop(delete_columns, axis = 1)
X_train_1 = csc_matrix(train_stats.values)
X_val_1 = csc_matrix(val_stats.values)
y_train = x_data.label.values
y_val = x_val.label.values
featurenames = list(train_stats.columns)
with open(r'..\電費\single_select_words.pkl', 'rb') as f:
    select_words = pickle.load(f)
# use_idf=False + sublinear_tf=True => pure sublinear term-frequency weights
# over the fixed `select_words` vocabulary (min_df has no effect with a
# fixed vocabulary but is kept for fidelity).
tfidf = TfidfVectorizer(ngram_range = (1, 2), min_df = 3, sublinear_tf = True,
smooth_idf = False, use_idf = False, vocabulary = select_words)
tfidf.fit(x_data.contents)
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API
# and fall back for older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    word_names = list(tfidf.get_feature_names_out())
else:
    word_names = tfidf.get_feature_names()
X_train_2 = tfidf.transform(x_data.contents)
X_val_2 = tfidf.transform(x_val.contents)
print('文本特徵:{}維.'.format(len(word_names)))
statistic_feature = featurenames.copy()
print('其他特徵:{}維.'.format(len(statistic_feature)))
featurenames.extend(word_names)
X_train = hstack((X_train_1, X_train_2)).tocsc()
X_val = hstack((X_val_1, X_val_2)).tocsc()
print('特徵數量', X_train.shape[1])
文本特徵:341維.
其他特徵:85維.
特徵數量 426
6.3 XGBoost
# Bag three XGBoost models that differ only in their random seed; each one
# predicts probabilities for the test set and the per-model outputs are
# collected for majority voting later.
bagging = []
# The DMatrix objects do not depend on the seed, so build them once instead
# of rebuilding identical matrices on every loop iteration.
dtrain = xgb.DMatrix(X_train, y_train, feature_names = featurenames)
dval = xgb.DMatrix(X_val, feature_names = featurenames)
for i in range(1, 4):
    print('Group:', i)
    params = {'objective': 'binary:logistic', 'eta': 0.1, 'max_depth': 12,
              'booster': 'gbtree', 'eval_metric': 'error', 'subsample': 0.8,
              'min_child_weight': 3, 'gamma': 0.2, 'lambda': 300,
              'colsample_bytree': 1, 'silent': 1, 'seed': i}
    # Early stopping only watches training error (no held-out eval set), so it
    # mainly guards against a stalled fit rather than overfitting.
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, 2000, evals = watchlist, early_stopping_rounds = 50, verbose_eval = 100)
    print('Predicting...')
    y_prob = model.predict(dval, ntree_limit = model.best_ntree_limit)
    bagging.append(y_prob)
    print('--------------------------------')
print('Done!')
def threshold(y, t):
    """Binarize *y* at cutoff *t*: 1 where y >= t, else 0 (dtype preserved).

    The hit mask is computed before any mutation. The original two-step
    in-place version (`z[z >= t] = 1` then `z[z < t] = 0`) clobbered the
    freshly written 1s back to 0 whenever t > 1, because 1 < t holds after
    the first write.
    """
    z = np.copy(y)
    hit = z >= t
    z[hit] = 1
    z[~hit] = 0
    return z
# Majority vote across the bagged models: binarize each model's probability
# vector at 0.5, transpose so rows correspond to test samples, and keep the
# most frequent class per row.
t = 0.5
pres = np.array([threshold(prob, t) for prob in bagging]).T.astype('int64')
result = [np.bincount(votes).argmax() for votes in pres]
myout = test[['CUST_NO']].copy()
myout['pre'] = result
七. 保存最終預測
# Persist the final prediction: the CUST_NOs predicted positive (pre == 1).
import os
# exist_ok=True avoids the check-then-create race of the original
# `if not os.path.isdir(...)` guard and is idempotent across reruns.
os.makedirs(r'..\數據挖掘\result', exist_ok = True)
# NOTE(review): on modern pandas, Series.to_csv writes a header row by
# default — confirm whether the submission format expects one.
myout.loc[myout.pre == 1, 'CUST_NO'].to_csv(r'..\數據挖掘\result\A.csv', index = False)