Transaction Fraud Detection Based on LightGBM

1. Data Preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings("ignore")
%%time
# Load the training and test datasets

train_transaction = pd.read_csv("train_transaction.csv")
print("train_transaction shape : ", train_transaction.shape)

train_identity = pd.read_csv("train_identity.csv")
print("train_identity shape : ", train_identity.shape)

test_transaction = pd.read_csv("test_transaction.csv")
print("test_transaction shape : ", test_transaction.shape)

test_identity = pd.read_csv("test_identity.csv")
print("test_identity shape : ", test_identity.shape)

train_transaction shape : (590540, 394)
train_identity shape : (144233, 41)
test_transaction shape : (506691, 393)
test_identity shape : (141907, 41)
Wall time: 21.5 s

# Show the first 5 rows (head() default)
train_transaction.head()
TransactionID isFraud TransactionDT TransactionAmt ProductCD card1 card2 card3 card4 card5 ... V330 V331 V332 V333 V334 V335 V336 V337 V338 V339
0 2987000 0 86400 68.5 W 13926 NaN 150.0 discover 142.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2987001 0 86401 29.0 W 2755 404.0 150.0 mastercard 102.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2987002 0 86469 59.0 W 4663 490.0 150.0 visa 166.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 2987003 0 86499 50.0 W 18132 567.0 150.0 mastercard 117.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 2987004 0 86506 50.0 H 4497 514.0 150.0 mastercard 102.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 394 columns

# Inspect the DataFrame's basic info
train_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB

# Count the NaNs in each column
train_transaction.isnull().sum()

TransactionID 0
isFraud 0
TransactionDT 0
TransactionAmt 0
ProductCD 0
...
V335 508189
V336 508189
V337 508189
V338 508189
V339 508189
Length: 394, dtype: int64

Inspect train_identity, test_transaction, and test_identity in the same way: dtypes, basic info, and missing values.
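These per-frame checks can be wrapped in a small loop (a sketch, assuming the four DataFrames loaded above are still in memory):

# Inspect shape, dtype counts, and the most-missing columns for each remaining DataFrame
for name, df in [("train_identity", train_identity),
                 ("test_transaction", test_transaction),
                 ("test_identity", test_identity)]:
    print(name, df.shape)
    print(df.dtypes.value_counts())
    print(df.isnull().sum().sort_values(ascending=False).head(), "\n")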

Merge train_transaction with train_identity, and test_transaction with test_identity, on TransactionID.

# From the shapes above, not every transaction has an associated identity record
# Compute the proportion that do
train_count = np.sum(train_transaction['TransactionID'].isin(train_identity['TransactionID']))
print("Rows in train_transaction with a matching TransactionID in train_identity : ", train_count)

train_ratio = train_count / len(train_transaction)
print("Share of train_transaction rows with an identity record : {:.2f}%".format(train_ratio * 100))

Rows in train_transaction with a matching TransactionID in train_identity :  144233
Share of train_transaction rows with an identity record : 24.42%

test_count = np.sum(test_transaction['TransactionID'].isin(test_identity['TransactionID']))
print("Rows in test_transaction with a matching TransactionID in test_identity : ", test_count)

test_ratio = test_count / len(test_transaction)
print("Share of test_transaction rows with an identity record : {:.2f}%".format(test_ratio * 100))

Rows in test_transaction with a matching TransactionID in test_identity :  141907
Share of test_transaction rows with an identity record : 28.01%

# Merge on TransactionID with a left join
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

train.shape:(590540, 434)

test.shape:(506691, 433)

The training data has one more column than the test data: the isFraud label.
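This can be confirmed directly with a set difference over the column names (a quick check, not in the original; given the shapes above it should print only the label):

print(set(train.columns) - set(test.columns))  # expected: {'isFraud'}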

2. Exploratory Data Analysis

Class balance (normal vs. fraud) in the training data:

sns.countplot(x='isFraud', data=train)
plt.title("Normal VS Fraud")
plt.show()

[Figure: count plot of normal vs. fraud transactions]

Transaction amount distribution

train['TransactionAmt'].apply(np.log).plot(kind='hist', bins=100, figsize=(15, 5), title='Distribution of Transaction Amount')

[Figure: histogram of log-transformed transaction amounts]

Transaction amount distribution: normal vs. fraud

To make the comparison clearer, plot both the raw and log-transformed amounts for each class:

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 6))  # 2x2 grid of axes

train.loc[train['isFraud']==1]['TransactionAmt'].apply(np.log).plot(kind='hist',
                                                                    bins=100,
                                                                    title='Log Transaction Amount isFraud = 1',
                                                                    ax=ax1)

train.loc[train['isFraud']==1]['TransactionAmt'].plot(kind='hist',
                                                      bins=100,
                                                      title='Transaction Amount isFraud=1',
                                                      ax=ax2)

train.loc[train['isFraud']==0]['TransactionAmt'].apply(np.log).plot(kind='hist',
                                                                    bins=100,
                                                                    title='Log Transaction Amount isFraud = 0',
                                                                    ax=ax3)

train.loc[train['isFraud']==0]['TransactionAmt'].plot(kind='hist',
                                                      bins=100,
                                                      title='Transaction Amount isFraud=0',
                                                      ax=ax4)

plt.show()

[Figure: 2x2 grid of transaction-amount histograms (raw and log), split by isFraud]

Analyzing ProductCD

train.groupby('ProductCD')['TransactionID'].count().plot(kind='barh',
                                                         figsize=(15, 6),
                                                         title='Transaction count by ProductCD')
plt.show()

[Figure: transaction count by ProductCD]

train.groupby('ProductCD')['isFraud'].mean().plot(kind='barh',
                                                  figsize=(15, 6),
                                                  title='Fraud rate by ProductCD')
plt.show()

[Figure: mean fraud rate by ProductCD]

Converting categorical variables with LabelEncoder

LabelEncoder maps a discrete feature's values to integers from 0 to n − 1, where n is the number of distinct values the feature takes.
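A minimal toy illustration of this mapping (hypothetical card values, for demonstration only):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["visa", "mastercard", "discover", "visa"])  # 3 distinct values -> codes 0..2
print(le.classes_)                                  # ['discover' 'mastercard' 'visa']
print(le.transform(["visa", "discover"]))           # [2 0]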

# Print the object-dtype columns
for col in train.columns:
    if train[col].dtype == "object":
        print(col)

ProductCD
card4
card6
P_emaildomain
R_emaildomain
M1
M2
M3
M4
M5
M6
M7
M8
M9
id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_30
id_31
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo

# Fraud rate by DeviceType (select the column before .mean() to avoid averaging every column)
train.groupby("DeviceType")["isFraud"].mean().plot(kind='barh',
                                                   figsize=(15, 5),
                                                   title="Fraud rate by DeviceType")
plt.show()

Count the NaNs in each column; if more than 70% of a column is missing, drop it.

def clean_nan(df):
    temp_columns = []
    for col in df.columns:
        # number of missing values in this column
        counter = df[col].isnull().sum()
        # fraction of the column that is missing
        ratio = counter / len(df[col])
        if ratio > 0.7:
            # mark the column for removal
            temp_columns.append(col)

    # drop the marked columns
    new_df = df.drop(temp_columns, axis=1)
    return new_df

print("Original train shape : ", train.shape)
train = clean_nan(train)
print("Cleaned train shape : ", train.shape)

Original train shape :  (590540, 434)
Cleaned train shape :  (590540, 226)

208 features were dropped from the training data.

print("原始的test shape : ", test.shape)
test = clean_nan(test)
print("清洗后的test shape : ", test.shape)

原始的test shape : (506691, 433)
清洗后的test shape : (506691, 225)

208 features were dropped from the test data. Note that clean_nan is applied to train and test independently; here both happen to shed the same 208 columns, but in general it is safer to compute the column list on train and apply that same list to both.

Next, drop the columns in which a single value accounts for more than 90% of the rows.

def clean_top_cols(df):
    new_cols = [col for col in df.columns
                if df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
    return new_cols

train_cols = clean_top_cols(train)
test_cols = clean_top_cols(test)
cols_to_drop = list(set(train_cols + test_cols))  # deduplicate via a set

'isFraud' in cols_to_drop  # the label column makes the list, since the vast majority of transactions are non-fraud
cols_to_drop.remove('isFraud')  # keep the label column
# Drop these columns from train
print("Original train shape : ", train.shape)
train = train.drop(cols_to_drop, axis=1)
print("Cleaned train shape : ", train.shape)

Original train shape :  (590540, 226)
Cleaned train shape :  (590540, 156)

# Drop these columns from test
print("Original test shape : ", test.shape)
test = test.drop(cols_to_drop, axis=1)
print("Cleaned test shape : ", test.shape)

Original test shape :  (506691, 225)
Cleaned test shape :  (506691, 155)

# LabelEncoder over the object-dtype columns
# tqdm_notebook wraps the loop with a progress bar
for col in tqdm_notebook(train.columns):
    if train[col].dtype == "object":
        encoder = LabelEncoder()
        # cast to str so that NaN becomes the string 'nan' and can be encoded
        encoder.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = encoder.transform(list(train[col].astype(str).values))
        test[col] = encoder.transform(list(test[col].astype(str).values))

train = train.reset_index(drop=True)  # reset the index without keeping the old one as a column
test = test.reset_index(drop=True)

train.shape:(590540, 156)

test.shape:(506691, 155)

# Separate the features from the label, ordering rows by transaction time

X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)

y = train.sort_values('TransactionDT')['isFraud']

X.shape:(590540, 153)

y.shape:(590540,)

# Prepare the test features
X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1)  # drop the ID/time columns, which are not features

test = test[['TransactionDT', 'TransactionID']]  # keep only these two columns for assembling predictions later

X_test.shape:(506691, 153)

test.shape:(506691, 2)
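LightGBM matches features by position, so a quick optional sanity check (not in the original) confirms that the training and test feature matrices line up:

# Both frames went through identical column processing, so this should hold
assert list(X.columns) == list(X_test.columns)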

3. Modeling with LightGBM

Hyperparameter settings (params)

params = {'num_leaves': 491, # number of leaves per tree
          'min_data_in_leaf': 106, # minimum number of samples per leaf
          'objective': 'binary', # task: binary classification
          'max_depth': -1, # -1: no depth limit
          "boosting_type": "gbdt", # alternatives: 'dart', 'goss', 'rf'
          "metric": 'auc', # evaluation metric
          "verbosity" : -1, # suppress logging
          'random_state': 66, # random seed
         }

Create a DataFrame to store the per-fold feature importances

feature_importances = pd.DataFrame(index=None)

feature_importances['features'] = X.columns

feature_importances
features
0 TransactionAmt
1 ProductCD
2 card1
3 card2
4 card3
... ...
148 V312
149 V313
150 V314
151 V315
152 V317

153 rows × 1 columns

5-fold cross-validation

folds = KFold(n_splits=5)
splits = folds.split(X, y) # generator yielding (train indices, validation indices) for each fold
next(iter(splits)) # peek at the first split; note this consumes it, so the loop below sees only the remaining 4 folds

(array([118108, 118109, 118110, ..., 590537, 590538, 590539]),
array([ 0, 1, 2, ..., 118105, 118106, 118107]))

best_auc = 0
best_model = None

for k, (train_indices, val_indices) in enumerate(splits):
    print("Fold %d\n" % (k + 1))
    X_train_data, X_val_data = X.iloc[train_indices], X.iloc[val_indices] # training / validation features
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices] # training / validation labels

    train_dataset = lgb.Dataset(X_train_data, label=y_train) # training set
    val_dataset = lgb.Dataset(X_val_data, label=y_val) # validation set

    lgb_model = lgb.train(params=params, # hyperparameters
                          train_set=train_dataset, # training data
                          num_boost_round=10000, # maximum number of boosting rounds
                          valid_sets=val_dataset, # validation data
                          valid_names='validation', # name for the validation set
                          early_stopping_rounds=200) # stop if the AUC does not improve for 200 rounds
                          # (LightGBM >= 4.0 removed this argument; pass callbacks=[lgb.early_stopping(200)] instead)

    # record this fold's feature importances
    feature_importances[f'fold_{k+1}'] = lgb_model.feature_importance()

    # predict on the validation set
    y_val_pred = lgb_model.predict(X_val_data)

    # compute the ROC AUC
    roc_auc = roc_auc_score(y_val, y_val_pred)
    print(f" Fold {k + 1} | AUC_ROC : { roc_auc * 100}%")

    # keep the best model across folds
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = lgb_model

# average over the 4 recorded folds (the next() demo above consumed one, hence n_splits - 1)
feature_importances['average'] = feature_importances[[f'fold_{k+1}' for k in range(folds.n_splits-1)]].mean(axis=1)

feature_importances.head()
features fold_1 fold_2 fold_3 fold_4 average
0 TransactionAmt 10758 2337 2571 3088 4688.5
1 ProductCD 478 148 189 179 248.5
2 card1 10241 2791 3168 3450 4912.5
3 card2 8222 2517 2606 3039 4096.0
4 card3 559 214 263 310 336.5
# Plot the 50 most important features
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50),
            x='average',
            y='features')
plt.title("Top 50 feature importances, averaged over {} folds".format(folds.n_splits-1))

[Figure: horizontal bar plot of the top 50 feature importances]

print("The best roc_auc : ", roc_auc)

The best roc_auc : 0.9187823659441293

joblib.dump(best_model, "best_model.pkl") # save the best model to disk
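The saved model can be reloaded in a later session before predicting (a usage sketch):

loaded_model = joblib.load("best_model.pkl")  # restores the trained Booster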

Predicting on the test set

y_test_pred = best_model.predict(X_test) # predicted fraud probabilities
labels = np.round(y_test_pred) # np.round() thresholds the probabilities at 0.5
from collections import Counter
Counter(labels)

Counter({0.0: 498254, 1.0: 8437})
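For a Kaggle-style submission, write out the raw probabilities rather than the rounded labels, since the evaluation metric here is ROC AUC (a sketch; the file name submission.csv is an assumption):

submission = pd.DataFrame({
    "TransactionID": test["TransactionID"],
    "isFraud": y_test_pred,  # predicted probabilities
})
submission.to_csv("submission.csv", index=False)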
