# 欺詐檢測案例 — Fraud-detection case study (class imbalance, standardization, cross-validation, model evaluation)
# Plot the class-distribution bar chart to visualize how imbalanced the data
# is between normal (Class == 0) and fraudulent (Class == 1) transactions.
# NOTE: the module-level pd.value_counts() was removed in pandas 2.0;
# call the Series method instead.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# Standardize the raw Amount column so it is on a scale comparable to the
# other (PCA-transformed) features, then drop the columns no longer needed.
from sklearn.preprocessing import StandardScaler

# A pandas Series has no .reshape; get the underlying ndarray with .values
# and reshape to a 2-D column vector (-1 lets numpy infer the row count).
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# drop() takes a *list* of labels; the original drop("Time", "Amount", axis=1)
# raises a TypeError because the second positional argument is `axis`.
data = data.drop(['Time', 'Amount'], axis=1)
# Under-sampling strategy: build a balanced dataset by keeping every fraud row
# plus an equally-sized random sample of normal rows.
# .ix was removed from pandas; use label-based .loc throughout.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Count of fraud (Class == 1) records and their row-index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index

# Randomly pick exactly as many normal rows as there are fraud rows
# (replace=False so no row is chosen twice).
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Merge both index sets and slice out the balanced sub-sample.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# The chosen values are index *labels*, not positions, so .loc (not .iloc)
# is the semantically correct accessor.
under_sample_data = data.loc[under_sample_indices, :]

# The original compared the DataFrame itself against 'Class'; the comparison
# must be against the column names.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Split the FULL (imbalanced) data so the test set reflects the real class
# distribution.  sklearn.cross_validation was removed in scikit-learn 0.20;
# the function now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression  # "imprt" typo fixed
# sklearn.cross_validation was removed in 0.20; use model_selection.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
# 混淆矩陣 — Confusion matrices at varying decision thresholds
# threshold值可以自己指定,值越大越嚴格
# L1 penalty requires the liblinear (or saga) solver in scikit-learn >= 0.22;
# the default lbfgs solver raises ValueError for penalty='l1'.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Per-class probability estimates; thresholding column 1 ourselves lets us
# trade precision for recall instead of using the default 0.5 cut-off.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

# A larger threshold is stricter: a transaction is flagged as fraud only when
# its predicted probability exceeds it.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
# The original loop body was not indented (a SyntaxError); enumerate also
# replaces the manual j counter.
for j, t in enumerate(thresholds, start=1):
    # Predict fraud (1) only when P(class == 1) > t.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > t
    plt.subplot(3, 3, j)
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % t)
plt.show()
# 過採樣策略 — Over-sampling strategy (SMOTE)
# SMOTE over-sampling: synthesize new minority-class samples so the TRAINING
# set is balanced.  Apply only to the training split, never the test split,
# or the evaluation would see synthetic data.
# "import ... import ..." was a SyntaxError; the statement must start with "from".
from imblearn.over_sampling import SMOTE
oversampler = SMOTE(random_state=0)
# fit_sample() was renamed fit_resample() in imbalanced-learn 0.4 and removed later.
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
# 泰坦尼克號獲救案例 — Titanic survival case study (missing-value imputation,
# string-to-number mapping, feature extraction, model ensembling)
# 調用線性迴歸算法 — Linear-regression baseline with manual cross-validation
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was removed in 0.20; KFold now lives in model_selection.
from sklearn.model_selection import KFold

# Feature columns used by the baseline model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
alg = LinearRegression()
# Modern KFold API: n_splits goes to the constructor ("titaniic" typo removed —
# the row count is no longer a constructor argument) and split() yields indices.
kf = KFold(n_splits=3)
# The original reset `predictors = []` here, destroying the feature list and
# leaving `predictions` undefined; the accumulator must be `predictions`.
predictions = []
for train, test in kf.split(titanic):
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

import numpy as np
predictions = np.concatenate(predictions, axis=0)
# Binarize the continuous regression output at 0.5.
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Accuracy = fraction of predictions matching the label.  The original summed
# predictions[mask], which counts 1-valued hits only and is not the accuracy.
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
# 調用邏輯迴歸算法 — Try logistic regression
# sklearn.cross_validation was removed in 0.20; import the scorer directly.
from sklearn.model_selection import cross_val_score
# "from sklearn linear_model" was missing the dot between package and module.
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
# 3-fold cross-validated accuracy on the chosen predictor columns.
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
scores.mean()
# 調用隨機森林算法 — Try a random forest
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed in 0.20; use model_selection.
from sklearn.model_selection import KFold, cross_val_score

alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Modern KFold: n_splits goes to the constructor, the splitter is passed as cv=.
# (The original's random_state had no effect anyway since it did not shuffle.)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)

# 調整參數 — tune parameters: more trees plus stronger split/leaf constraints
# to reduce overfitting.
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
# 抽取特徵 — Feature engineering
# Family size = siblings/spouses + parents/children aboard.  The original
# indexed empty column names (titanic[""]); SibSp and Parch — both already in
# the predictors list above — are the intended summands.
titanic["Familysize"] = titanic["SibSp"] + titanic["Parch"]
# Name length as a crude proxy feature (longer names correlate with titles).
titanic["NameLength"] = titanic["Name"].apply(len)
import re
def get_title(name):
    """Extract the honorific title from a passenger name.

    A title is the first alphabetic word immediately followed by a period,
    e.g. "Braund, Mr. Owen Harris" -> "Mr".  Returns "" when none is found.
    """
    # Raw string: "\." in a plain string is an invalid escape sequence
    # (a DeprecationWarning that becomes a SyntaxError in newer Pythons).
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Map each passenger's honorific title to a small integer code.
titles = titanic["Name"].apply(get_title)
# The file imports pandas as pd elsewhere; the bare name "pandas" was
# undefined.  Use the Series method (pd.value_counts was removed in 2.0).
print(titles.value_counts())
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3}
for k, v in title_mapping.items():
    # The original assigned into an undefined name "title"; the Series is "titles".
    titles[titles == k] = v
titanic["Title"] = titles
# 驗證各特徵的重要性 — Rank feature importance with univariate selection
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Univariate selection: score each predictor against Survived with an ANOVA
# F-test, then plot -log10(p-value) so taller bars mean more informative features.
# The original listed empty-string placeholders, which would raise KeyError;
# these are the columns built above — TODO confirm the intended feature list.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Familysize", "NameLength", "Title"]
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Fitted attributes carry a trailing underscore; "selector.pvalues" does not exist.
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
# 算法集成 — Ensemble gradient boosting with logistic regression
import numpy as np

# Each entry pairs an estimator with the feature columns it trains on.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex"]],
]

# Modern KFold API: the old KFold(n, n_folds=..., random_state=...) signature
# was removed from scikit-learn; split() yields the fold indices.
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # "alograms" was an undefined name; iterate the algorithms list above.
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Probability of survival (positive class) for the held-out fold.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Average the two models' probabilities, then binarize at 0.5.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    # The original discarded each fold's result; collect it so the folds can
    # be concatenated afterwards — TODO confirm against the original tutorial.
    predictions.append(test_predictions)