欺詐檢測案例AND泰坦尼克號獲救案例

欺詐檢測案例(樣本不平衡,標準化,交叉驗證,模型評估)

# Plot class frequencies to visualize how imbalanced the dataset is
count_classes = pd.value_counts(data['Class'], sort=True).sort_index()
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

# Standardize the Amount column (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler
# FIX: a pandas Series has no .reshape — go through .values first;
# -1 lets numpy infer the row count.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# FIX: drop() takes a list of labels, not two positional labels.
data = data.drop(['Time', 'Amount'], axis=1)

# Under-sampling strategy: keep all fraud rows, sample an equal number of normals.
# FIX: .ix was removed from pandas; use .loc with a boolean column mask.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']
# Count of fraud (Class == 1) rows and the index sets of each class
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index
# Randomly pick as many normal rows as there are fraud rows (no replacement)
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)
# Combine both index sets and slice the balanced sample out of the frame
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.iloc[under_sample_indices, :]
# FIX: the mask must compare the frame's *columns* against 'Class',
# not the frame itself.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Train/test split on the full (imbalanced) data.
# FIX: sklearn.cross_validation was removed; use sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# FIX: "imprt" typo, and sklearn.cross_validation was removed in sklearn 0.20 —
# KFold / cross_val_score now live in sklearn.model_selection.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report

混淆矩陣

# Sweep the decision threshold — a larger threshold is a stricter fraud call.
# FIX: modern sklearn requires an explicit solver that supports an L1 penalty.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Class-probability estimates for the under-sampled test set
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
j = 1
for i in thresholds:
    # Predict "fraud" only when P(class == 1) exceeds the current threshold
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)

    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
plt.show()

過採樣策略:

# Over-sampling: synthesize minority-class samples with SMOTE (training set only,
# so the test set never contains synthetic rows).
# FIX: "import ... import ..." is a syntax error — must be "from ... import".
from imblearn.over_sampling import SMOTE
oversampler = SMOTE(random_state=0)
# FIX: fit_sample was renamed fit_resample in imbalanced-learn >= 0.4.
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

泰坦尼克號獲救案例(缺失值填充,數字字符映射,提取特徵,算法集成)
調用線性迴歸算法

# Cross-validated linear-regression baseline for Titanic survival.
from sklearn.linear_model import LinearRegression
# FIX: sklearn.cross_validation was removed; use sklearn.model_selection.
from sklearn.model_selection import KFold

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
alg = LinearRegression()
# FIX: "titaniic" typo; modern KFold takes n_splits and is applied via .split().
# No shuffle, so folds are deterministic contiguous chunks, as in the original.
kf = KFold(n_splits=3)
# FIX: the original re-bound predictors=[] here, clobbering the feature list;
# this list accumulates per-fold *predictions* instead.
predictions = []
for train, test in kf.split(titanic):
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

import numpy as np
# Stitch the fold predictions into one vector and binarize at 0.5
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# FIX: accuracy is the fraction of matching labels — compare the two vectors
# directly instead of summing a filtered slice of predictions.
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)

調用邏輯迴歸算法 嘗試

# Logistic regression with 3-fold cross-validation.
# FIX: sklearn.cross_validation was removed (use model_selection), and
# "from sklearn linear_model" was missing the dot.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
alg = LogisticRegression(random_state=1)
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# FIX: a bare scores.mean() is a no-op in a script — print it.
print(scores.mean())

調用隨機森林算法 嘗試

# Random forest with 3-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier
# FIX: sklearn.cross_validation was removed; use sklearn.model_selection.
from sklearn import model_selection
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# FIX: modern KFold API — n_splits keyword; pass the splitter itself as cv.
kf = model_selection.KFold(n_splits=3)
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
print(scores.mean())

調整參數

# Re-tuned forest: more trees, larger split/leaf minimums to reduce overfitting
alg=RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)

抽取特徵

# Feature engineering: family size, name length, and a numeric title code.
# FIX: the original indexed empty column names; family size is SibSp + Parch.
titanic["Familysize"] = titanic["SibSp"] + titanic["Parch"]
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))

import re

def get_title(name):
    """Extract the honorific (e.g. 'Mr', 'Miss') that precedes a dot in a name."""
    # Raw string so the backslash reaches the regex engine as a literal-dot escape
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

titles = titanic["Name"].apply(get_title)
# FIX: the file imports pandas as pd elsewhere — "pandas" is not a bound name here.
print(pd.value_counts(titles))
# Map each title string to a small integer code.
# NOTE(review): titles absent from this map stay as strings — extend the map
# (Master, Dr, Rev, ...) before feeding "Title" into a numeric model.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3}
for k, v in title_mapping.items():
    # FIX: the original assigned into an undefined name "title" (missing s)
    titles[titles == k] = v
titanic["Title"] = titles

驗證各特徵的重要性

# Rank feature usefulness with univariate ANOVA F-tests.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# FIX: the original listed empty placeholder column names; use the engineered
# feature set from this case study (adjust to match the available columns).
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare",
              "Embarked", "Familysize", "NameLength", "Title"]
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# FIX: the fitted p-values live on the pvalues_ attribute (trailing underscore);
# -log10 turns small p-values into tall bars.
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

算法集成

# Ensemble: average GBDT and logistic-regression fold probabilities.
import numpy as np
# FIX: these classifiers and the modern KFold were never imported in the file.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# Each entry pairs a model with the feature columns it trains on
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex"]],
]
# FIX: modern KFold API (deterministic, unshuffled folds as before)
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # FIX: "alograms" typo — iterate the algorithms list defined above
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # P(survived) for the held-out fold
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Simple average of the two models, binarized at 0.5
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    # FIX: the original computed each fold's result and discarded it — collect it
    predictions.append(test_predictions)
發佈了95 篇原創文章 · 獲贊 62 · 訪問量 2萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章