文章目錄
下面用了一個油品相關的數據做了一個決策樹規則挖掘。
1. 導入相應的包並加載數據
import pandas as pd
import numpy as np
# Silence all warnings globally (e.g. chained-assignment warnings below).
# NOTE(review): this hides real problems too -- consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")
# Load the oil-transaction dataset used for the rule-mining exercise.
data = pd.read_excel('oil_data_for_tree.xlsx')
data.head()
data.shape
(50609, 19)
查看下class_new有幾類
# List the distinct values of the class_new label column.
data.class_new.unique()
array(['B', 'E', 'C', 'A', 'D', 'F'], dtype=object)
2. 將數據分爲三類進行處理
org_lst 不需要做特殊變換,直接去重
agg_lst 數值型變量做聚合
dstc_lst 文本型變量做cnt
# NOTE: the columns 'total_oil_cnt' and 'pay_amount_total' are deliberately
# left out of agg_lst here.
org_lst = ['uid','create_dt','oil_actv_dt','class_new','bad_ind']
agg_lst = ['oil_amount','discount_amount','sale_amount','amount','pay_amount','coupon_amount','payment_coupon_amount']
dstc_lst = ['channel_code','oil_code','scene','source_app','call_source']
數據重組
# Work on a copy so the original `data` frame is never mutated.
# Selecting all three column groups in one shot is equivalent to the
# original three-step assembly (same columns, same order) but avoids the
# redundant RHS .copy() calls and repeated column insertion.
df = data[org_lst + agg_lst + dstc_lst].copy()
df.shape
# Check missing values per column.
df.isnull().sum()
3. 填補缺失值並截取數據
下面要對 create_dt 做補全,缺失值用 oil_actv_dt 來填補,並且截取6個月的數據。
需要注意的是:構造變量的時候不能直接對歷史所有數據做累加 ,否則隨着時間推移,變量分佈會有很大的變化。
#這裏對creat_dt做補全,用oil_actv_dt來填補,並且截取6個月的數據。
def time_isna(x, y):
    """Return ``y`` when ``x`` is a missing timestamp, otherwise ``x``.

    Used to backfill ``create_dt`` with ``oil_actv_dt``. Uses ``pd.isnull``
    instead of comparing ``str(x)`` to ``'NaT'``, so it also handles
    ``NaN``/``None`` missing markers, not just the NaT string form.
    """
    return y if pd.isnull(x) else x
# Newest records first within each uid.
df2 = df.sort_values(['uid', 'create_dt'], ascending=False)
# Vectorized backfill: a missing (NaT) create_dt falls back to oil_actv_dt.
# Equivalent to the original row-wise time_isna apply -- NaT is the only
# missing marker in a datetime column -- but runs at C speed.
df2['create_dt'] = df2['create_dt'].fillna(df2['oil_actv_dt'])
# Day gap between activation date and order date.
df2['dtn'] = (df2.oil_actv_dt - df2.create_dt).dt.days
# Keep only orders within 180 days (~6 months) of activation; this drops
# roughly 5000 rows. Capping the window keeps the derived-variable
# distribution stable over time.
df = df2[df2['dtn'] < 180]
df.head()
df.shape
(45039, 18)
4. 去重
對 org_lst 變量求歷史貸款天數的最大間隔,也就是訂單日期和貸款日期最大差值(dtn),並且去重
# Take an explicit copy: the original `df[org_lst]` slice followed by a
# column assignment writes into a view (SettingWithCopy hazard, currently
# masked by the global warnings filter).
base = df[org_lst].copy()
base['dtn'] = df['dtn']
# Newest order first, then keep one row per uid -> the row with the
# largest order/loan date gap (dtn) survives.
base = base.sort_values(by=['uid', 'create_dt'], ascending=False)
base = base.drop_duplicates(['uid'], keep='first')
# After dedup roughly 11k unique uids remain.
base.shape
(11099, 6)
base.head()
5. 變量衍生
gn = pd.DataFrame()

# (suffix, aggregation) pairs applied to every numeric column in agg_lst.
# _cnt: row count; _num: count of values > 0; _tot: nan-safe sum;
# _avg: nan-safe mean; _max/_min: nan-safe extremes; _var: population
# variance (np.nanvar, ddof=0); _t: range (max - min);
# _tar: max(variance / mean, 1) -- labelled "coefficient of variation" in
# the original write-up, though var/mean is strictly an index of dispersion.
_AGG_FUNCS = [
    ('_cnt', lambda s: len(s)),
    ('_num', lambda s: np.where(s > 0, 1, 0).sum()),
    ('_tot', lambda s: np.nansum(s)),
    ('_avg', lambda s: np.nanmean(s)),
    ('_max', lambda s: np.nanmax(s)),
    ('_min', lambda s: np.nanmin(s)),
    ('_var', lambda s: np.nanvar(s)),
    ('_t', lambda s: np.nanmax(s) - np.nanmin(s)),
    ('_tar', lambda s: max(np.nanvar(s) / np.nanmean(s), 1)),
]

# Same aggregations and merge order as the original nine copy-pasted
# stanzas, collapsed into one table-driven loop. Grouping the single
# column (groupby('uid')[i]) instead of the whole frame per statistic
# also avoids re-splitting every column for each aggregate.
for i in agg_lst:
    for suffix, func in _AGG_FUNCS:
        tp = df.groupby('uid')[i].apply(func).reset_index()
        tp.columns = ['uid', i + suffix]
        # First feature frame seeds gn; later ones are left-joined on uid.
        gn = tp if gn.empty else pd.merge(gn, tp, on='uid', how='left')
6. 對dstc_lst變量求distinct個數
gc = pd.DataFrame()
for col in dstc_lst:
    # Number of distinct values (NaN counted as one value) of this
    # categorical field per uid.
    tp = df.groupby('uid').apply(lambda g: len(set(g[col]))).reset_index()
    tp.columns = ['uid', col + '_dstc']
    # Seed gc with the first frame, then left-join the rest on uid.
    # (`gc.empty == True` in the original is the non-idiomatic form of
    # `gc.empty`; the pd.DataFrame(...) wrapper around reset_index was
    # redundant.)
    gc = tp if gc.empty else pd.merge(gc, tp, on='uid', how='left')
7. 將變量組合在一起
# Join the aggregate features and the distinct counts onto the deduplicated
# base table (inner join on uid, same as pd.merge's default).
fn = base.merge(gn, on='uid').merge(gc, on='uid')
fn.shape
(11099, 74)
將缺失值填充爲0
# Features absent for a uid (e.g. no records of a given type) become 0.
fn = fn.fillna(0)
fn.head()
8. 用決策樹進行訓練
# Features: drop identifiers, the two date columns and the label itself.
x = fn.drop(['uid','oil_actv_dt','create_dt','bad_ind','class_new'],axis = 1)
y = fn.bad_ind.copy()
from sklearn import tree
# Shallow tree (depth 2) with large leaf/split minimums so the handful of
# resulting rules are stable enough to serve as policy cut-offs.
# NOTE(review): bad_ind looks like a binary label, yet a Regressor is used
# -- presumably so leaf values are bad rates rather than classes; confirm.
dtree = tree.DecisionTreeRegressor(max_depth = 2,min_samples_leaf = 500,min_samples_split = 5000)
dtree = dtree.fit(x,y)
9. 輸出決策樹圖像,並作出決策
import pydotplus
from IPython.display import Image
# sklearn.externals.six was removed in scikit-learn 0.23+; the stdlib
# io.StringIO is a drop-in replacement here.
from io import StringIO

# Dump the raw dot description of the fitted tree for offline inspection.
with open("dt.dot", "w") as f:
    tree.export_graphviz(dtree, out_file=f)

# Render the tree to an in-memory dot buffer, then to PNG via graphviz.
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data,
                     feature_names=x.columns,
                     class_names=['bad_ind'],
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

import os
# The graphviz 'dot' binary must be on PATH before create_png() runs.
# NOTE: machine-specific path -- adjust for your own installation.
os.environ["PATH"] += os.pathsep + 'D:/軟件/graphviz-2.38/release/bin/'
Image(graph.create_png())
# Overall bad-debt rate across all uids (baseline for the policy split).
sum(fn.bad_ind)/len(fn.bad_ind)
0.04658077304261645
#生成策略
# Turn the tree's split thresholds into three policy tiers.
dff1 = fn.loc[(fn.amount_tot > 48077.5) & (fn.coupon_amount_cnt > 3.5)].copy()
dff1['level'] = 'oil_A'
dff2 = fn.loc[(fn.amount_tot > 48077.5) & (fn.coupon_amount_cnt <= 3.5)].copy()
dff2['level'] = 'oil_B'
dff3 = fn.loc[(fn.amount_tot <= 48077.5)].copy()
dff3['level'] = 'oil_C'
dff1.head()
# DataFrame.append was removed in pandas 2.0; pd.concat with the same
# frame order plus reset_index reproduces the original stacking exactly.
dff1 = pd.concat([dff1, dff2, dff3]).reset_index(drop=True)
dff1.head()
dff1.shape
(11099, 75)
# 'bad_ind' was listed twice in the original selection (duplicate column
# in the report); keep it once.
last = dff1[['class_new', 'level', 'bad_ind', 'uid', 'oil_actv_dt']].copy()
# Truncate timestamps to year-month ('YYYY-MM') for monthly reporting.
# (The trailing .copy() after apply was redundant and has been dropped.)
last['oil_actv_dt'] = last['oil_actv_dt'].apply(lambda x: str(x)[:7])
last.head(5)
last.to_excel('final_report.xlsx', index=False)