Creditcard_prediction: a small practice project

Required libraries and environment

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn

print('pandas:',pd.__version__)
print('matplotlib:',matplotlib.__version__)
print('numpy:',np.__version__)
print('sklearn:',sklearn.__version__)
pandas: 0.23.4
matplotlib: 2.2.3
numpy: 1.16.4
sklearn: 0.22.2.post1

Load the data and inspect the per-column information

data = pd.read_csv('creditcard.csv')
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26       284807 non-null float64
V27       284807 non-null float64
V28       284807 non-null float64
Amount    284807 non-null float64
Class     284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB

From the output above, no column contains missing values and every column is already numeric, so no discretization is needed.

Next, check the distribution of each feature:

data.describe()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000
mean 94813.859575 3.919560e-15 5.688174e-16 -8.769071e-15 2.782312e-15 -1.552563e-15 2.010663e-15 -1.694249e-15 -1.927028e-16 -3.137024e-15 ... 1.537294e-16 7.959909e-16 5.367590e-16 4.458112e-15 1.453003e-15 1.699104e-15 -3.660161e-16 -1.206049e-16 88.349619 0.001727
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 ... 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000

8 rows × 31 columns

From the table above, features V1 through V28 are on fairly uniform scales. The Time feature is monotonically increasing and unsuitable as a training feature, so it is dropped. Whether the Amount feature needs standardization will be judged by the subsequent training and test accuracy.
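If standardizing Amount turns out to help, a minimal sketch of z-score standardization on a toy stand-in for the Amount column looks like this (the values below are made up for illustration; sklearn's StandardScaler computes the same transform using the population standard deviation):

```python
import pandas as pd

# Toy stand-in for the Amount column; values are illustrative only.
df = pd.DataFrame({'Amount': [5.6, 22.0, 77.165, 250.0]})

# Z-score standardization: subtract the mean, divide by the (population) std,
# giving a column with zero mean and unit variance.
df['Amount_scaled'] = (df['Amount'] - df['Amount'].mean()) / df['Amount'].std(ddof=0)
```

The scaled column can then replace Amount in the feature matrix before training.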

print('0:{:d}, 1:{:d}'.format(sum(data.Class==0),sum(data.Class==1)))
0:284315, 1:492

As for the Class label, it takes only the values 0 (normal) and 1 (fraud), so the labels are clearly very imbalanced.

Building the training and test sets

Because the original labels are highly imbalanced, the test set is built with a 1:1 class ratio so that test accuracy is meaningful: 50 samples of each class, with all remaining samples forming the training set.

# Drop the Time column, which is not used as a feature
data_fixed = data.drop(['Time'], axis=1)

# Shuffle each class separately (note: data_pos holds the normal class 0,
# data_neg holds the fraud class 1)
data_pos = data_fixed[data_fixed['Class'].values == 0].sample(frac=1).reset_index(drop=True)
data_neg = data_fixed[data_fixed['Class'].values == 1].sample(frac=1).reset_index(drop=True)
# The first 50 rows of each class form the balanced test set;
# the remaining rows form the (imbalanced) training set
data_train = pd.concat([data_neg.iloc[50:, :], data_pos.iloc[50:, :]]).sample(frac=1).reset_index(drop=True)
data_test = pd.concat([data_neg.iloc[:50, :], data_pos.iloc[:50, :]]).sample(frac=1).reset_index(drop=True)
data_train.to_csv('creditcard_train.csv')
data_test.to_csv('creditcard_test.csv')
# Last column is the Class label; everything before it is the feature matrix
X_train, y_train = data_train.iloc[:, :-1], data_train.iloc[:, -1]
X_test, y_test = data_test.iloc[:, :-1], data_test.iloc[:, -1]
print('0:{:d}, 1:{:d}'.format(sum(y_test==0),sum(y_test==1)))
0:50, 1:50
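As an aside, the per-class sampling above can also be written more compactly with pandas' GroupBy.sample (available in pandas >= 1.1). A minimal sketch on a toy frame standing in for data_fixed:

```python
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
# Toy frame standing in for data_fixed: 200 normal rows, 20 fraud rows.
df = pd.DataFrame({'x': rng.randn(220), 'Class': [0] * 200 + [1] * 20})

# Draw an equal number of rows from each class for a balanced test set.
test = df.groupby('Class', group_keys=False).sample(n=10, random_state=0)
# Everything not sampled into the test set becomes the training set.
train = df.drop(test.index)

print(sum(test['Class'] == 0), sum(test['Class'] == 1))  # 10 10
```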

Training

First, define a function that trains a model and reports its test-set accuracy and confusion matrix:

from sklearn.metrics import confusion_matrix

def model_train(model):
    # Fit on the training set, then evaluate on the balanced test set
    model = model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_predict)
    accuracy = sum(y_predict == y_test) / len(y_test)
    return accuracy, matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

NBM = [KNeighborsClassifier(n_neighbors=6, n_jobs=8), 
       GaussianNB(), 
       DecisionTreeClassifier(max_depth=5, min_samples_split=5), 
       RandomForestClassifier(n_estimators= 100, max_depth=10, n_jobs=8),
       RandomForestClassifier(n_estimators= 100, max_depth=10, n_jobs=8, class_weight='balanced'),
       xgb.XGBClassifier(tree_method = "hist", n_estimators=100, n_jobs = 8)]
NAME= ["KNN", "GNB", "DCT", "RF", "RF_Balanced", "XGBT"]

for itr, itrname in zip(NBM, NAME):
    acc, con_matrix = model_train(itr)
    print(itrname+' '+str(acc*100)+'%\n',con_matrix)
KNN 83.0%
 [[50  0]
 [17 33]]
GNB 91.0%
 [[49  1]
 [ 8 42]]
DCT 91.0%
 [[50  0]
 [ 9 41]]
RF 91.0%
 [[50  0]
 [ 9 41]]
RF_Balanced 91.0%
 [[50  0]
 [ 9 41]]
XGBT 92.0%
 [[50  0]
 [ 8 42]]

Because the dataset is fairly large, only models that train quickly and support multithreading were chosen. The results are fairly uniform overall, and every model shows the same problem: the normal class (0) is predicted almost perfectly, while all errors fall on the fraud class (1). This is a direct consequence of the label imbalance; since this is fraud prediction, fraud really does make up only a small share of transactions in practice. Because this dataset is quite clean, and everyday experience suggests fraudulent transactions differ markedly from normal ones (that is, the two classes are well separated in feature space), the overall predictive performance is still reasonably good.
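Given the imbalance, plain accuracy understates the problem; recall on the fraud class is more informative. A quick check using the XGBT confusion matrix reported above:

```python
# Unpack the XGBT confusion matrix above: rows are true classes (0, 1),
# columns are predicted classes (0, 1).
tn, fp, fn, tp = 50, 0, 8, 42

recall = tp / (tp + fn)      # fraction of fraud cases actually caught
precision = tp / (tp + fp)   # fraction of flagged transactions that are fraud

print(f'recall={recall:.2f}, precision={precision:.2f}')  # recall=0.84, precision=1.00
```

So even the best model here misses 16% of fraud cases, which is what the shared failure mode in the confusion matrices reflects.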
