一、數據預處理
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from tqdm import tqdm_notebook
import joblib
import warnings
# Silence library warnings to keep notebook output readable
warnings.filterwarnings("ignore")
%%time
# Load the training and test datasets (transaction + identity CSVs)
train_transaction = pd.read_csv("train_transaction.csv")
print("train_transaction shape : ", train_transaction.shape)
train_identity = pd.read_csv("train_identity.csv")
print("train_identity shape : ", train_identity.shape)
test_transaction = pd.read_csv("test_transaction.csv")
print("test_transaction shape : ", test_transaction.shape)
test_identity = pd.read_csv("test_identity.csv")
print("test_identity shape : ", test_identity.shape)
train_transaction shape : (590540, 394)
train_identity shape : (144233, 41)
test_transaction shape : (506691, 393)
test_identity shape : (141907, 41)
Wall time: 21.5 s
# Preview the first 5 rows (the .head() default)
train_transaction.head()
TransactionID | isFraud | TransactionDT | TransactionAmt | ProductCD | card1 | card2 | card3 | card4 | card5 | ... | V330 | V331 | V332 | V333 | V334 | V335 | V336 | V337 | V338 | V339 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2987000 | 0 | 86400 | 68.5 | W | 13926 | NaN | 150.0 | discover | 142.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2987001 | 0 | 86401 | 29.0 | W | 2755 | 404.0 | 150.0 | mastercard | 102.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2987002 | 0 | 86469 | 59.0 | W | 4663 | 490.0 | 150.0 | visa | 166.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2987003 | 0 | 86499 | 50.0 | W | 18132 | 567.0 | 150.0 | mastercard | 117.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2987004 | 0 | 86506 | 50.0 | H | 4497 | 514.0 | 150.0 | mastercard | 102.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 394 columns
# Inspect column dtypes and memory usage
train_transaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB
# Count NaN values per column
train_transaction.isnull().sum()
TransactionID 0
isFraud 0
TransactionDT 0
TransactionAmt 0
ProductCD 0
...
V335 508189
V336 508189
V337 508189
V338 508189
V339 508189
Length: 394, dtype: int64
同樣的方式查看train_identity、test_transaction、test_identity的數據類型、數據信息和空值情況
根據 TransactionID 合併 train_transaction 和 train_identity,test_transaction 和 test_identity
# From the shapes above, not every transaction has an associated identity
# record — measure how many training transactions join to an identity row.
matched = train_transaction['TransactionID'].isin(train_identity['TransactionID'])
train_count = matched.sum()
print("train_transaction與train_identity基於TransactionID相關聯的數據量 : ", train_count)
train_ratio = train_count / len(train_transaction)
print("相關聯數據量佔整個train_transaction數據量的比例是 : {:.2f}%".format(train_ratio * 100))
train_transaction與train_identity基於TransactionID相關聯的數據量 : 144233
相關聯數據量佔整個train_transaction數據量的比例是 : 24.42%
# Same join-coverage measurement for the test split.
matched_test = test_transaction['TransactionID'].isin(test_identity['TransactionID'])
test_count = matched_test.sum()
print("test_transaction與test_identity基於TransactionID相關聯的數據量 : ", test_count)
test_ratio = test_count / len(test_transaction)
print("相關聯數據量佔整個test_transaction數據量的比例是 : {:.2f}%".format(test_ratio * 100))
test_transaction與test_identity基於TransactionID相關聯的數據量 :  141907
相關聯數據量佔整個test_transaction數據量的比例是 : 28.01%
# Left-join identity onto transactions: transactions without an identity
# record keep their row and get NaNs in the identity columns.
train = train_transaction.merge(train_identity, on='TransactionID', how='left')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')
train.shape:(590540, 434)
test.shape:(506691, 433)
訓練數據比測試數據多一個標籤特徵
二、數據探索分析
在train_transaction中,正負樣本比例
# Class balance of the target (fraud is a small minority).
# seaborn >= 0.12 no longer accepts the column as a positional argument,
# so pass it explicitly as x=.
sns.countplot(x='isFraud', data=train)
plt.title("Normal VS Fraud")
plt.show()
交易金額分佈
# Histogram of log(TransactionAmt); the log compresses the heavy right tail
train['TransactionAmt'].apply(np.log).plot(kind='hist', bins=100, figsize=(15, 5), title='Distribution of Transaction Amount')
交易金額(正常 vs 欺詐)分佈
爲了展示更爲清晰的對比,下面分別繪製正常交易與欺詐交易的金額分佈(原始值與對數值)
# Compare transaction-amount distributions for fraud vs normal,
# both log-transformed (left column) and raw (right column).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 6)) # canvas size
train.loc[train['isFraud']==1]['TransactionAmt'].apply(np.log).plot(kind='hist',
bins=100,
title='Log Transaction Amount isFraud = 1',
ax=ax1)
train.loc[train['isFraud']==1]['TransactionAmt'].plot(kind='hist',
bins=100,
title='Transaction Amount isFraud=1',
ax=ax2)
train.loc[train['isFraud']==0]['TransactionAmt'].apply(np.log).plot(kind='hist',
bins=100,
title='Log Transaction Amount isFraud = 0',
ax=ax3)
train.loc[train['isFraud']==0]['TransactionAmt'].plot(kind='hist',
bins=100,
title='Transaction Amount isFraud=0',
ax=ax4)
plt.show()
分析 ProductCD
# Transaction count per product code
train.groupby('ProductCD')['TransactionID'].count().plot(kind='barh',
figsize=(15, 6),
title='ProductCD TransctionID')
plt.show()
# Fraud rate per product code
train.groupby('ProductCD')['isFraud'].mean().plot(kind='barh',
figsize=(15, 6),
title='ProductCD isFraud')
plt.show()
分類變量進行轉換 LabelEncoder
LabelEncoder :將離散型的數據轉換成 0 到 n − 1 之間的數,這裏 n 是一個列表的不同取值的個數,可以認爲是某個特徵的所有不同取值的個數。
# Print the names of the columns that still hold raw strings (dtype == object).
for name in train.select_dtypes(include='object').columns:
    print(name)
ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
# Fraud rate by device type. Select the target column *before* aggregating:
# calling .mean() on the whole frame tries to average object columns
# (an error in pandas >= 2.0) and wastes work on 150+ unused columns.
train.groupby("DeviceType")["isFraud"].mean().plot(kind='barh',
figsize=(15, 5),
title = "DeviceType Distribution")
plt.show()
統計每一列的NaN的數量,如果比例超過70%,則刪除該列
def clean_nan(df, threshold=0.7):
    """Drop columns whose NaN ratio exceeds ``threshold``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean; it is not modified in place.
    threshold : float, default 0.7
        Maximum allowed fraction of NaN values per column.

    Returns
    -------
    pd.DataFrame
        New frame without the over-sparse columns.
    """
    # isnull().mean() gives the per-column NaN fraction in one vectorized
    # pass, replacing the per-column Python loop.
    nan_ratio = df.isnull().mean()
    cols_to_drop = nan_ratio[nan_ratio > threshold].index
    return df.drop(columns=cols_to_drop)
print("原始的train shape : ", train.shape)  # shape before NaN-column cleaning
train = clean_nan(train)
print("清洗後的train shape : ", train.shape)  # shape after cleaning
原始的train shape : (590540, 434)
清洗後的train shape : (590540, 226)
訓練數據清洗掉208個特徵
print("原始的test shape : ", test.shape)  # shape before NaN-column cleaning
test = clean_nan(test)
print("清洗後的test shape : ", test.shape)  # shape after cleaning
原始的test shape : (506691, 433)
清洗後的test shape : (506691, 225)
測試數據清洗掉208個特徵
刪除那些一列中某個類別數據量超過90%的列
def clean_top_cols(df, threshold=0.9):
    """Return the columns dominated by a single value.

    A column qualifies when its most frequent value (NaN counted as a
    value, via ``dropna=False``) covers more than ``threshold`` of the
    rows; such near-constant columns carry almost no signal.

    Parameters
    ----------
    df : pd.DataFrame
    threshold : float, default 0.9

    Returns
    -------
    list of str
        Column names to drop (the caller performs the drop).
    """
    return [col for col in df.columns
            if df[col].value_counts(dropna=False, normalize=True).iloc[0] > threshold]
train_cols = clean_top_cols(train)
test_cols = clean_top_cols(test)
cols_to_drop = list(set(train_cols + test_cols)) # set() de-duplicates the union of both lists
'isFraud' in cols_to_drop # check whether the label column slipped in (it is >90% zeros)
cols_to_drop.remove('isFraud') # never drop the label column
# Drop the near-constant columns from train
print("原始的train shape : ", train.shape)
train = train.drop(cols_to_drop, axis=1)
print("清理後的train shape : ", train.shape)
原始的train shape : (590540, 226)
清理後的train shape : (590540, 156)
# Drop the same columns from test so both splits share one feature set
print("原始的test shape : ", test.shape)
test = test.drop(cols_to_drop, axis=1)
print("清理後的test shape : ", test.shape)
原始的test shape : (506691, 225)
清理後的test shape : (506691, 155)
# Label-encode every remaining object column. Each encoder is fit on the
# union of train and test values so both splits share one mapping and no
# unseen-category error can occur at transform time.
# NOTE(review): object columns may still contain NaN; confirm the installed
# sklearn version accepts mixed float/str values in LabelEncoder.fit.
# tqdm_notebook only adds a progress bar (deprecated alias of tqdm.auto.tqdm;
# kept because it is what the file imports).
for col in tqdm_notebook(train.columns):
    if train[col].dtype == "object":
        encoder = LabelEncoder()
        encoder.fit(list(train[col].values) + list(test[col].values))
        train[col] = encoder.transform(list(train[col].values))
        test[col] = encoder.transform(list(test[col].values))
# reset_index(drop=True) renumbers the rows without materializing the old
# index as an 'index' column — equivalent to reset_index() followed by
# `del df['index']`, in one step.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape:(590540, 156)
test.shape:(506691, 155)
# Separate features and label, ordered by transaction time so the later
# KFold split is roughly chronological. Sort once and reuse the result —
# the original sorted the full frame twice.
train_sorted = train.sort_values('TransactionDT')
X = train_sorted.drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train_sorted['isFraud']
X.shape:(590540, 153)
y.shape:(590540,)
# Test set: separate features from identifier columns
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1) # drop the non-feature ID/time columns
test = test[['TransactionDT', 'TransactionID']] # keep only the identifier columns
X_test.shape:(506691, 153)
test.shape:(506691, 2)
三、LightGBM建模
params 超參數設置
# LightGBM hyper-parameters for the binary fraud classifier.
params = {
    'num_leaves': 491,          # leaves per tree
    'min_data_in_leaf': 106,    # minimum samples required in a leaf
    'objective': 'binary',      # binary classification task
    'max_depth': -1,            # -1: depth unlimited
    "boosting_type": "gbdt",    # alternatives: 'dart', 'goss', 'rf'
    "metric": 'auc',            # evaluation metric
    "verbosity": -1,            # suppress LightGBM log output
    'random_state': 66,         # seed for reproducibility
}
創建DataFrame保存特徵重要性
# Frame that will accumulate one importance column per CV fold,
# seeded with the feature names.
feature_importances = pd.DataFrame({'features': X.columns})
feature_importances
features | |
---|---|
0 | TransactionAmt |
1 | ProductCD |
2 | card1 |
3 | card2 |
4 | card3 |
... | ... |
148 | V312 |
149 | V313 |
150 | V314 |
151 | V315 |
152 | V317 |
153 rows × 1 columns
5折交叉驗證
folds = KFold(n_splits=5)
splits = folds.split(X, y) # generator of (train_idx, val_idx) index pairs
# CAUTION: split() returns a generator and iter(generator) is the generator
# itself, so this next() CONSUMES the first fold — the training loop below
# only iterates over the remaining 4 folds.
next(iter(splits)) # peek at one pair of index arrays
(array([118108, 118109, 118110, ..., 590537, 590538, 590539]),
array([ 0, 1, 2, ..., 118105, 118106, 118107]))
best_auc = 0
best_model = None
# NOTE(review): `splits` is a generator and one fold was already consumed by
# the `next(iter(splits))` peek above, so this loop only sees the remaining
# folds (producing columns fold_1..fold_4 for n_splits=5).
for k, (train_indices, val_indices) in enumerate(splits):
    print("第 %d 折\n" % k)
    # Positional indexing keeps X and y aligned (both were sorted together).
    X_train_data, X_val_data = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    train_dataset = lgb.Dataset(X_train_data, label=y_train)
    val_dataset = lgb.Dataset(X_val_data, label=y_val)
    lgb_model = lgb.train(params=params,
                          train_set=train_dataset,
                          num_boost_round=10000,       # upper bound; early stopping ends sooner
                          valid_sets=val_dataset,
                          valid_names='validation',
                          early_stopping_rounds=200,)  # stop after 200 rounds without AUC gain
                                                       # (lightgbm >= 4 moved this to callbacks)
    # Record this fold's feature importances.
    feature_importances[f'fold_{k+1}'] = lgb_model.feature_importance()
    # Score the held-out fold.
    y_val_pred = lgb_model.predict(X_val_data)
    roc_auc = roc_auc_score(y_val, y_val_pred)
    print(f" Fold {k + 1} | AUC_ROC : { roc_auc * 100}%")
    # Keep the model from the best-scoring fold.
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = lgb_model
# Average over the fold columns actually recorded. The original used
# range(folds.n_splits-1), which only matched because one fold had been
# consumed before the loop; deriving the columns is robust either way.
fold_cols = [c for c in feature_importances.columns if c.startswith('fold_')]
feature_importances['average'] = feature_importances[fold_cols].mean(axis=1)
feature_importances.head()
features | fold_1 | fold_2 | fold_3 | fold_4 | average | |
---|---|---|---|---|---|---|
0 | TransactionAmt | 10758 | 2337 | 2571 | 3088 | 4688.5 |
1 | ProductCD | 478 | 148 | 189 | 179 | 248.5 |
2 | card1 | 10241 | 2791 | 3168 | 3450 | 4912.5 |
3 | card2 | 8222 | 2517 | 2606 | 3039 | 4096.0 |
4 | card3 | 559 | 214 | 263 | 310 | 336.5 |
# Visualize the 50 most important features (by cross-fold average).
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50),
x='average',
y='features')
plt.title("50 top features importance over {} folds average.".format(folds.n_splits-1))
# Report the tracked best AUC. The original printed `roc_auc`, which holds
# only the LAST fold's score, not the best one stored in `best_auc`.
print("The best roc_auc : ", best_auc)
The best roc_auc : 0.9187823659441293
joblib.dump(best_model, "best_model.pkl") # persist the best fold's model to disk
對 test 進行預測
y_test_pred = best_model.predict(X_test)  # predicted fraud probabilities for the test set
labels = np.round(y_test_pred) # np.round(): threshold at 0.5 into hard 0/1 labels
from collections import Counter
Counter(labels) # count of predicted normal (0.0) vs fraud (1.0) transactions
Counter({0.0: 498254, 1.0: 8437})