Transaction Fraud Detection with LightGBM

1. Data Preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings("ignore")
%%time
# Load the training and test datasets

train_transaction = pd.read_csv("train_transaction.csv")
print("train_transaction shape : ", train_transaction.shape)

train_identity = pd.read_csv("train_identity.csv")
print("train_identity shape : ", train_identity.shape)

test_transaction = pd.read_csv("test_transaction.csv")
print("test_transaction shape : ", test_transaction.shape)

test_identity = pd.read_csv("test_identity.csv")
print("test_identity shape : ", test_identity.shape)

train_transaction shape : (590540, 394)
train_identity shape : (144233, 41)
test_transaction shape : (506691, 393)
test_identity shape : (141907, 41)
Wall time: 21.5 s

# Display the first 5 rows (the default)
train_transaction.head()
TransactionID isFraud TransactionDT TransactionAmt ProductCD card1 card2 card3 card4 card5 ... V330 V331 V332 V333 V334 V335 V336 V337 V338 V339
0 2987000 0 86400 68.5 W 13926 NaN 150.0 discover 142.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2987001 0 86401 29.0 W 2755 404.0 150.0 mastercard 102.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2987002 0 86469 59.0 W 4663 490.0 150.0 visa 166.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 2987003 0 86499 50.0 W 18132 567.0 150.0 mastercard 117.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 2987004 0 86506 50.0 H 4497 514.0 150.0 mastercard 102.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 394 columns

# Inspect the DataFrame's structure
train_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB
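The training transactions alone occupy roughly 1.7 GB. A dtype-downcasting pass over the numeric columns is a common optional space saver on this dataset; a minimal sketch, not part of the original workflow:

# Downcast 64-bit numeric columns to the smallest dtype that fits the values
# (float32 precision is usually sufficient for these features)
def reduce_mem(df):
    for col in df.select_dtypes(include=["float64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")
    for col in df.select_dtypes(include=["int64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    return df

train_transaction = reduce_mem(train_transaction)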

# Count NaN values per column
train_transaction.isnull().sum()

TransactionID 0
isFraud 0
TransactionDT 0
TransactionAmt 0
ProductCD 0
...
V335 508189
V336 508189
V337 508189
V338 508189
V339 508189
Length: 394, dtype: int64

Inspect train_identity, test_transaction, and test_identity in the same way: data types, general info, and NaN counts, as sketched below.
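A compact loop for those checks might look like this (a sketch; output omitted):

# Run the same inspection on the remaining three DataFrames
for name, df in [("train_identity", train_identity),
                 ("test_transaction", test_transaction),
                 ("test_identity", test_identity)]:
    print(name)
    df.info()                  # dtypes and memory usage
    print(df.isnull().sum())   # NaN count per column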

Merge train_transaction with train_identity, and test_transaction with test_identity, on TransactionID.

# From the shapes above, not every transaction has an associated identity record
# Compute the share that does
train_count = np.sum(train_transaction['TransactionID'].isin(train_identity['TransactionID']))
print("train_transaction與train_identity基於TransactionID相關聯的數據量 : ", train_count)

train_ratio = train_count / len(train_transaction)
print("相關聯數據量佔整個train_transaction數據量的比例是 : {:.2f}%".format(train_ratio * 100))

Number of train_transaction rows with a matching train_identity TransactionID :  144233
Share of matched rows in train_transaction : 24.42%

test_count = np.sum(test_transaction['TransactionID'].isin(test_identity['TransactionID']))
print("test_transaction與test_identity基於TransactionID相關聯的數據量 : ", test_count)

test_ratio = test_count / len(test_transaction)
print("相關聯數據量佔整個test_transaction數據量的比例是 : {:.2f}%".format(test_ratio * 100))

Number of test_transaction rows with a matching test_identity TransactionID :  141907
Share of matched rows in test_transaction : 28.01%

# Merge (left joins keep every transaction; identity features are NaN where no identity record exists)
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

train.shape:(590540, 434)

test.shape:(506691, 433)

The training data has exactly one more column than the test data: the isFraud label.
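A quick sanity check (assuming, as the merged shapes suggest, that the remaining column names line up between train and test):

# The only column present in train but missing from test should be the label
print(set(train.columns) - set(test.columns))  # expected: {'isFraud'}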

2. Exploratory Data Analysis

Class balance (normal vs. fraud) in the training data:

sns.countplot(x='isFraud', data=train)
plt.title("Normal VS Fraud")
plt.show()

[Figure: bar chart of class counts, "Normal VS Fraud"]
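Since the plot only hints at how skewed the classes are, it is worth printing the exact proportions as well (a quick check, not shown in the original post):

# Share of fraudulent vs. normal transactions
print(train['isFraud'].value_counts(normalize=True))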

Distribution of transaction amounts:

train['TransactionAmt'].apply(np.log).plot(kind='hist', bins=100, figsize=(15, 5), title='Distribution of Transaction Amount')

[Figure: histogram of log-transformed transaction amounts, "Distribution of Transaction Amount"]

Transaction amounts: normal vs. fraud

To make the comparison clearer, plot the raw and log-transformed amounts separately for each class:

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 6)) # 2x2 grid of axes

train.loc[train['isFraud']==1]['TransactionAmt'].apply(np.log).plot(kind='hist',
                                                                    bins=100,
                                                                    title='Log Transaction Amount isFraud = 1',
                                                                    ax=ax1)

train.loc[train['isFraud']==1]['TransactionAmt'].plot(kind='hist',
                                                      bins=100,
                                                      title='Transaction Amount isFraud=1',
                                                      ax=ax2)

train.loc[train['isFraud']==0]['TransactionAmt'].apply(np.log).plot(kind='hist',
                                                                    bins=100,
                                                                    title='Log Transaction Amount isFraud = 0',
                                                                    ax=ax3)

train.loc[train['isFraud']==0]['TransactionAmt'].plot(kind='hist',
                                                      bins=100,
                                                      title='Transaction Amount isFraud=0',
                                                      ax=ax4)

plt.show()

[Figure: 2×2 grid of histograms, raw and log-scaled transaction amounts for isFraud = 1 and isFraud = 0]

Analyzing ProductCD

train.groupby('ProductCD')['TransactionID'].count().plot(kind='barh',
                                                         figsize=(15, 6),
                                                         title='ProductCD TransactionID')
plt.show()

[Figure: transaction counts per ProductCD]

train.groupby('ProductCD')['isFraud'].mean().plot(kind='barh',
                                                  figsize=(15, 6),
                                                  title='ProductCD isFraud')
plt.show()

[Figure: mean fraud rate per ProductCD]

Encoding categorical variables with LabelEncoder

LabelEncoder maps each discrete value to an integer in the range 0 to n − 1, where n is the number of distinct values the feature takes.
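A tiny worked example (hypothetical card-type strings, not data from this competition):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["visa", "mastercard", "discover", "visa"])
print(le.classes_)                          # ['discover' 'mastercard' 'visa']
print(le.transform(["visa", "discover"]))   # [2 0]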

# List the columns of dtype object
for col in train.columns:
    if train[col].dtype == "object":
        print(col)

ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo

# Plot the fraud rate by DeviceType
train.groupby("DeviceType")["isFraud"].mean().plot(kind='barh',
                                                   figsize=(15, 5),
                                                   title="Fraud Rate by DeviceType")
plt.show()

Count the NaNs in each column and drop any column where more than 70% of the values are missing:

def clean_nan(df):
    temp_columns = []
    for col in df.columns:
        # Number of NaNs in this column
        counter = df[col].isnull().sum()
        # Fraction of the column that is NaN
        ratio = counter / len(df[col])
        if ratio > 0.7:
            # Mark the column for removal
            temp_columns.append(col)

    # Drop the marked columns
    new_df = df.drop(temp_columns, axis=1)
    return new_df

print("原始的train shape : ", train.shape)
train = clean_nan(train)
print("清洗後的train shape : ", train.shape)

Original train shape : (590540, 434)
Cleaned train shape : (590540, 226)

208 feature columns were dropped from the training data.

print("原始的test shape : ", test.shape)
test = clean_nan(test)
print("清洗後的test shape : ", test.shape)

Original test shape : (506691, 433)
Cleaned test shape : (506691, 225)

208 feature columns were dropped from the test data as well.

Also drop any column where a single value accounts for more than 90% of the rows:

def clean_top_cols(df):
    # Columns whose most frequent value (NaN included) covers more than 90% of rows
    new_cols = [col for col in df.columns if df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
    return new_cols

train_cols = clean_top_cols(train)
test_cols = clean_top_cols(test)
cols_to_drop = list(set(train_cols + test_cols)) # deduplicate via set

if 'isFraud' in cols_to_drop:  # make sure the label column is never dropped
    cols_to_drop.remove('isFraud')
# Drop these columns from the training data
print("Original train shape : ", train.shape)
train = train.drop(cols_to_drop, axis=1)
print("Cleaned train shape : ", train.shape)

Original train shape : (590540, 226)
Cleaned train shape : (590540, 156)

# Drop the same columns from the test data
print("Original test shape : ", test.shape)
test = test.drop(cols_to_drop, axis=1)
print("Cleaned test shape : ", test.shape)

Original test shape : (506691, 225)
Cleaned test shape : (506691, 155)

# LabelEncoder over all object columns, with a tqdm_notebook progress bar.
# Fitting on the combined train + test values ensures categories that
# appear only in the test set are still encoded.
for col in tqdm_notebook(train.columns):
    if train[col].dtype == "object":
        encoder = LabelEncoder()
        encoder.fit(list(train[col].values) + list(test[col].values))
        train[col] = encoder.transform(list(train[col].values))
        test[col] = encoder.transform(list(test[col].values))

train = train.reset_index(drop=True) # reset the row index after the cleaning steps
test = test.reset_index(drop=True)

train.shape:(590540, 156)

test.shape:(506691, 155)

# Separate the features and the label, ordering rows by transaction time

X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)

y = train.sort_values('TransactionDT')['isFraud']

X.shape:(590540, 153)

y.shape:(590540,)

# test
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1) # drop the non-feature columns

test = test[['TransactionDT', 'TransactionID']] # keep only the ID columns for later use

X_test.shape:(506691, 153)

test.shape:(506691, 2)

3. LightGBM Modeling

Hyperparameter settings (params)

params = {'num_leaves': 491, # number of leaves per tree
          'min_data_in_leaf': 106, # minimum number of samples per leaf
          'objective': 'binary', # task: binary classification
          'max_depth': -1, # -1 : no depth limit
          "boosting_type": "gbdt", # alternatives: 'dart', 'goss', 'rf'
          "metric": 'auc', # evaluation metric
          "verbosity" : -1, # suppress logging
          'random_state': 66, # random seed
         }

Create a DataFrame to hold the per-fold feature importances:

feature_importances = pd.DataFrame(index=None)

feature_importances['features'] = X.columns

feature_importances
features
0 TransactionAmt
1 ProductCD
2 card1
3 card2
4 card3
... ...
148 V312
149 V313
150 V314
151 V315
152 V317

153 rows × 1 columns

5-fold cross-validation

folds = KFold(n_splits=5)
splits = folds.split(X, y) # generator yielding (train indices, validation indices) for each fold
next(iter(splits)) # peek at the first fold's indices; note this consumes it,
                   # so the training loop below only sees the remaining 4 folds

(array([118108, 118109, 118110, ..., 590537, 590538, 590539]),
array([ 0, 1, 2, ..., 118105, 118106, 118107]))

best_auc = 0
best_model = None

for k, (train_indices, val_indices) in enumerate(splits):
    print("Fold %d\n" % (k + 1))
    X_train_data, X_val_data = X.iloc[train_indices], X.iloc[val_indices] # training / validation features
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices] # training / validation labels
    
    train_dataset = lgb.Dataset(X_train_data, label=y_train) # training set
    val_dataset = lgb.Dataset(X_val_data, label=y_val) # validation set
    
    lgb_model = lgb.train(params=params, # hyperparameter settings
                          train_set=train_dataset, # training data
                          num_boost_round=10000, # maximum number of boosting rounds
                          valid_sets=val_dataset, # validation data
                          valid_names='validation', # name for the validation set
                          early_stopping_rounds=200) # stop if no improvement after 200 rounds
                                                     # (LightGBM >= 4 uses callbacks=[lgb.early_stopping(200)] instead)
    # Record this fold's feature importances
    feature_importances[f'fold_{k+1}'] = lgb_model.feature_importance()
    
    # Predict on the validation set
    y_val_pred = lgb_model.predict(X_val_data)
    
    # Compute the validation ROC AUC
    roc_auc = roc_auc_score(y_val, y_val_pred)
    print(f"Fold {k + 1} | ROC AUC: {roc_auc * 100:.2f}%")
    
    # Keep whichever model has the highest validation AUC
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = lgb_model
# Average over the 4 folds that were actually trained
# (the first split was consumed by the next() call above)
feature_importances['average'] = feature_importances[[f'fold_{k+1}' for k in range(folds.n_splits-1)]].mean(axis=1)

feature_importances.head()
features fold_1 fold_2 fold_3 fold_4 average
0 TransactionAmt 10758 2337 2571 3088 4688.5
1 ProductCD 478 148 189 179 248.5
2 card1 10241 2791 3168 3450 4912.5
3 card2 8222 2517 2606 3039 4096.0
4 card3 559 214 263 310 336.5
# Visualize the 50 most important features
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50), 
            x='average', 
            y='features')
plt.title("50 top features importance over {} folds average.".format(folds.n_splits-1))

[Figure: horizontal bar chart of the top 50 features by average importance]

print("The best roc_auc : ", roc_auc)

The best roc_auc : 0.9187823659441293

joblib.dump(best_model, "best_model.pkl") # persist the best model
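The saved model can later be reloaded for inference:

# Restore the persisted booster and reuse it for prediction
loaded_model = joblib.load("best_model.pkl")
y_test_pred = loaded_model.predict(X_test)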

Predicting on the test set

y_test_pred = best_model.predict(X_test)
labels = np.round(y_test_pred) # np.round() rounds to the nearest integer, i.e. thresholds at 0.5
from collections import Counter
Counter(labels)

Counter({0.0: 498254, 1.0: 8437})
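Note that the underlying Kaggle competition is scored on ROC AUC over predicted probabilities, so a submission would typically keep the raw scores rather than the rounded labels. A sketch (the output file name and column layout are assumptions, not shown in the original post):

# Build a submission: TransactionID plus the predicted fraud probability
submission = pd.DataFrame({
    "TransactionID": test["TransactionID"],
    "isFraud": y_test_pred,
})
submission.to_csv("submission.csv", index=False)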
