用戶流失預警案例

1.數據預處理
from __future__ import division

import numpy as np
import pandas as pd

# Load the raw churn data set; 'churn.csv' is resolved relative to the
# current working directory.
churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()
print("Column names:", col_names)  # print the column names

# Preview only the first six and last six columns so the sample stays readable.
to_show = col_names[:6] + col_names[-6:]
print("Sample data:", churn_df[to_show].head(6))

最小-最大歸一化 (min-max normalization)：$x' = \dfrac{x - \min(x)}{\max(x) - \min(x)}$，將特徵縮放到 [0, 1] 區間。

# Binary target: 1 where the 'Churn?' column equals the literal 'True.', else 0.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

# Remove identifier-like columns and the label itself from the feature space.
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# NOTE(review): any remaining non-numeric columns are assumed to be 'yes'/'no'
# flags and are mapped to booleans -- confirm against the actual CSV schema.
text_cols = churn_feat_space.select_dtypes(exclude=['number', 'bool']).columns
if len(text_cols):
    churn_feat_space[text_cols] = churn_feat_space[text_cols] == 'yes'

# Build the numeric feature matrix that the scaler and the classifiers consume.
X = churn_feat_space.to_numpy(dtype=float)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)  # standardise each feature column

from sklearn.model_selection import KFold


def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class predictions from 5-fold cross-validation.

    Args:
        X: Feature matrix that supports indexing with an integer index array.
        y: Label array with a ``copy()`` method, aligned with the rows of ``X``.
        clf_class: Estimator class; instantiated afresh for every fold.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which every entry has been overwritten by the
        prediction made while that sample was in the held-out fold.
    """
    # The splitter is shuffled without a fixed seed, so results vary per run.
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``.

    Args:
        y_true: Array-like of reference labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality as a float.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to arrays so plain lists are compared element-wise rather than
    # collapsing to a single True/False.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "shape mismatch: %s vs %s" % (y_true.shape, y_pred.shape)
        )
    return float(np.mean(y_true == y_pred))
# Report the cross-validated accuracy of each classifier with default settings.
print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))

召回率 (recall)：實際流失的用戶中，模型能檢測出多少個（可由混淆矩陣計算）。
AP / mAP 是爲了解決 P、R、F-measure 只反映單點取值的侷限性而提出的指標（即 P-R 曲線下所圍成的面積）。

def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from 5-fold cross-validation.

    Args:
        X: Feature matrix that supports indexing with an integer index array.
        y: Label array aligned with the rows of ``X``.
        clf_class: Estimator class exposing ``fit`` and ``predict_proba``;
            instantiated afresh for every fold.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A ``(len(y), 2)`` float array; each row holds the probabilities
        predicted while that sample was in the held-out fold.
    """
    # Imported locally so this function does not depend on the module-level
    # import of the removed ``sklearn.cross_validation`` module.
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5, shuffle=True)
    # Two columns: the result buffer is sized for a binary problem, so each
    # training fold must contain both classes for predict_proba to fit it.
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob

import warnings
# Suppress all warnings for the rest of the run (blanket filter).
warnings.filterwarnings('ignore')

# Second column of the probability matrix is used as the churn probability.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# How many samples received each distinct predicted probability.
counts = pd.Series(pred_churn).value_counts()

# For each distinct predicted probability, the observed churn rate among the
# samples that received exactly that prediction.
true_prob = pd.Series(
    {prob: np.mean(is_churn[pred_churn == prob]) for prob in counts.index}
)

counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
# A bare expression only displays in a notebook; print so a script shows it too.
print(counts)

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章