電費敏感數據挖掘一: 數據處理與特徵工程

一. 數據篩選

1.1 讀取數據

import numpy as np
import pandas as pd
import csv

# Directory that holds the raw input files (Windows-style relative path).
data_path = r'..\電費敏感預測\rawdata'
# Work-order information
file_jobinfo_train = '01_arc_s_95598_wkst_train.tsv'
file_jobinfo_test = '01_arc_s_95598_wkst_test.tsv'
# Call records
file_comm = '02_s_comm_rec.tsv'
# Receivable electricity-fee information
file_flow_train = '09_arc_a_rcvbl_flow.tsv'
file_flow_test = '09_arc_a_rcvbl_flow_test.tsv'
# Training set
file_label = 'train_label.csv'
# Test set
file_test = 'test_to_predict.csv'

# The prefix is a raw string: in a normal literal '\p' is an invalid escape
# sequence (a warning today, an error in a future Python). The resulting path
# is byte-identical to the one the original expression produced.
train_info = pd.read_csv(data_path + r'\processed_' + file_jobinfo_train,
                         sep='\t', quoting=csv.QUOTE_NONE)
# QUOTE_NONE stops double quotes inside text fields from breaking the parser.

# Drop rows whose CUST_NO is missing. The explicit copy makes the column
# assignment below write to an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
train_info = train_info.loc[train_info.CUST_NO.notnull()].copy()
train_info['CUST_NO'] = train_info.CUST_NO.astype(np.int64)
train_info.head(2)

在這裏插入圖片描述

  • 統計每個用戶的工單數
# Count the work orders of every customer; the result has one row per
# CUST_NO, ordered as value_counts() returns them.
order_counts = train_info.CUST_NO.value_counts()
train = order_counts.rename_axis('CUST_NO').reset_index(name='counts_of_jobinfo')
train.head()

在這裏插入圖片描述

1.2 加入label值

# The label file is read without a header row and its column is named
# CUST_NO; customers found in it get label 1, all others label 0.
temp = pd.read_csv('/'.join((data_path, file_label)), header=None)
temp.columns = ['CUST_NO']
is_listed = train.CUST_NO.isin(temp.CUST_NO)
train['label'] = is_listed.astype(np.int64)
train = train[['CUST_NO', 'label', 'counts_of_jobinfo']]
print(train.shape)
train.head(4)

在這裏插入圖片描述

  • 測試集標籤用-1 表示
# Load the test work orders. The path needs the same '\processed_' prefix
# (including the separator) that the training file uses; without the
# separator the file name was glued directly onto the directory name.
test_info = pd.read_csv(data_path + r'\processed_' + file_jobinfo_test,
                        sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)
test = test_info.CUST_NO.value_counts().to_frame().reset_index()
test.columns = ['CUST_NO', 'counts_of_jobinfo']
# Test customers have no known label; -1 marks them.
test['label'] = -1
test = test[['CUST_NO', 'label', 'counts_of_jobinfo']]
test.head()

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (row indexes are kept as they are).
df = pd.concat([train, test])
del temp, train, test

在這裏插入圖片描述

1.3 只保留僅有一條工單記錄的用戶

# Keep only customers that have exactly one work order, then renumber rows.
has_single_order = df.counts_of_jobinfo == 1
df = df.loc[has_single_order].copy().reset_index(drop=True)

# label == -1 marks test rows; everything else is training data.
is_test_row = df.label == -1
train = df.loc[~is_test_row]
test = df.loc[is_test_row]

positives = int((train.label == 1).sum())
negatives = int((train.label == 0).sum())
print('低敏用戶訓練集: ', len(train))
print('低敏用戶正樣本: ', positives)
print('低敏用戶負樣本: ', negatives)
print('低敏用戶測試集: ', len(test))

# The count column has served its purpose as a filter.
df = df.drop(['counts_of_jobinfo'], axis=1)

低敏用戶訓練集: 401626
低敏用戶正樣本: 13139
低敏用戶負樣本: 388487
低敏用戶測試集: 327437

# Stack the train and test work orders (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0), keep only the customers still present in
# df, and attach each customer's label.
jobinfo = pd.concat([train_info, test_info])
jobinfo = jobinfo.loc[jobinfo.CUST_NO.isin(df.CUST_NO)].reset_index(drop=True)
jobinfo = jobinfo.merge(df[['CUST_NO', 'label']], on='CUST_NO', how='left')
print(jobinfo.shape)
jobinfo.head()

在這裏插入圖片描述

1.4 加載通話數據

comm = pd.read_csv('/'.join((data_path, file_comm)), sep='\t')
print('總數據量: ', comm.shape)
comm = comm.drop_duplicates()
print('去掉重複無用的: ', comm.shape)

# Keep only call records whose APP_NO appears as an ID in jobinfo, rename the
# key to ID and bring in the matching CUST_NO.
in_jobinfo = comm.APP_NO.isin(jobinfo.ID)
comm = comm.loc[in_jobinfo].rename(columns={'APP_NO': 'ID'})
comm = comm.merge(jobinfo[['ID', 'CUST_NO']], on='ID', how='left')
print('可用數據量: ', comm.shape)

# Parse both timestamps value by value.
for date_col in ('REQ_BEGIN_DATE', 'REQ_FINISH_DATE'):
    comm[date_col] = comm[date_col].apply(pd.to_datetime)

# Remove records that start after they finish. The negated form is kept on
# purpose: rows where the comparison is False (including missing timestamps)
# stay in the data.
begins_after_finish = comm.REQ_BEGIN_DATE > comm.REQ_FINISH_DATE
comm = comm.loc[~begins_after_finish]
print('過濾錯誤信息後數據量: ', comm.shape)
df = df.loc[df.CUST_NO.isin(comm.CUST_NO)].copy()

總數據量: (1593088, 8)
去掉重複無用的: (1584351, 8)
可用數據量: (726248, 9)
過濾錯誤信息後數據量: (726242, 9)

# Feature: call duration in seconds, min-max scaled.

comm['holding_time'] = comm['REQ_FINISH_DATE'] - comm['REQ_BEGIN_DATE']
# total_seconds() gives the full duration. The Timedelta attribute `.seconds`
# only returns the sub-day remainder (0..86399), so any call spanning one day
# or more was silently truncated.
comm['holding_time_seconds'] = comm['holding_time'].dt.total_seconds()
df = df.merge(comm[['CUST_NO', 'holding_time_seconds']], how='left', on='CUST_NO')

from sklearn.preprocessing import MinMaxScaler
# fit_transform expects and returns a 2-D array; ravel() turns the single
# scaled column back into 1-D before it is stored.
scaled_seconds = MinMaxScaler().fit_transform(
    df['holding_time_seconds'].values.reshape(-1, 1))
df['holding_time_seconds'] = scaled_seconds.ravel()
del comm
df.head()

在這裏插入圖片描述

# Keep jobinfo in step with df: only customers still present in df remain,
# and the rows are renumbered from zero.
still_present = jobinfo.CUST_NO.isin(df.CUST_NO)
jobinfo = jobinfo.loc[still_present].reset_index(drop=True)

# Rank the rows by CUST_NO; with method='max' tied values all receive the
# highest rank of their group.
cust_no_rank = df['CUST_NO'].rank(method='max')
df['rank_CUST_NO'] = cust_no_rank
df.head()

在這裏插入圖片描述

# Min-max scale the rank column (the scaler works on a 2-D column array).
rank_as_column = df.rank_CUST_NO.values.reshape(-1, 1)
df['rank_CUST_NO'] = MinMaxScaler().fit_transform(rank_as_column)
df.head()

在這裏插入圖片描述

二. 離散值處理

2.1 對離散型數值進行編碼

# One-hot encode two categorical work-order columns. For each one: pull the
# column in from jobinfo, expand it into indicator columns (dummy_na=True adds
# an indicator for missing values), then drop the raw column.
onehot_targets = (
    ('BUSI_TYPE_CODE', 'onehot_BUSI_TYPE_CODE'),
    ('URBAN_RURAL_FLAG', 'onehot_URBAN_RURAL_FLAG'),
)
for raw_col, onehot_prefix in onehot_targets:
    df = df.merge(jobinfo[['CUST_NO', raw_col]], on='CUST_NO', how='left')
    indicators = pd.get_dummies(df[raw_col], prefix=onehot_prefix, dummy_na=True)
    df = pd.concat([df, indicators], axis=1).drop([raw_col], axis=1)
    del indicators

# Power-supply unit code: record the length of its string form. The length is
# taken before the fill below, so a missing code is measured as str(nan).
df = df.merge(jobinfo[['CUST_NO', 'ORG_NO']], on='CUST_NO', how='left')
df['len_of_ORG_NO'] = [len(str(org_no)) for org_no in df.ORG_NO]
df = df.fillna(-1)

2.2 供電單位編碼:正樣本比例與編碼長度

# Labelled rows only (label == -1 marks test rows).
train = df[df.label != -1]

# Share of label == 1 rows within each ORG_NO value. A single groupby replaces
# the per-value loop, which rescanned the whole frame twice for every distinct
# code. Codes that occur only in test rows are absent from the mapping and
# therefore map to NaN, exactly as before.
is_positive = train.label == 1
ratio = is_positive.groupby(train.ORG_NO).mean().to_dict()

df['ratio_ORG_NO'] = df.ORG_NO.map(ratio)
df['ratio_ORG_NO'].head()

在這裏插入圖片描述

# Expand the code length into indicator columns, then drop the raw code and
# its length, which are no longer needed.
temp = pd.get_dummies(df['len_of_ORG_NO'], prefix='onehot_len_of_ORG_NO')
df = pd.concat([df, temp], axis=1).drop(['ORG_NO', 'len_of_ORG_NO'], axis=1)

2.3 時間數據處理

df = df.merge(jobinfo[['CUST_NO', 'HANDLE_TIME']], on='CUST_NO', how='left')

# HANDLE_TIME is split on whitespace: the first token is parsed as the date,
# the second is kept as the time-of-day string.
handle_tokens = df['HANDLE_TIME'].apply(lambda stamp: stamp.split())
df['date'] = handle_tokens.apply(lambda tokens: pd.to_datetime(tokens[0]))
df['time'] = handle_tokens.apply(lambda tokens: tokens[1])

# Calendar parts of the parsed date.
df['month'] = df['date'].apply(lambda when: when.month)
df['day'] = df['date'].apply(lambda when: when.day)

features = ['CUST_NO', 'date', 'time', 'month', 'day']
df[features].head()

在這裏插入圖片描述

# Flag which third of the month the day falls into:
# days 1-10, days 11-20, days 21-31.
tenday_windows = (
    ('is_in_first_tendays', range(1, 11)),
    ('is_in_middle_tendays', range(11, 21)),
    ('is_in_last_tendays', range(21, 32)),
)
for flag_col, window_days in tenday_windows:
    df[flag_col] = 0
    df.loc[df.day.isin(window_days), flag_col] = 1

# Hour is the part of the time string before the first ':'.
df['hour'] = [int(clock.split(':')[0]) for clock in df.time]
df = df.drop(['HANDLE_TIME', 'date', 'time'], axis=1)

2.4 用電方式

# Electricity usage type (ELEC_TYPE)
df = df.merge(jobinfo[['CUST_NO', 'ELEC_TYPE']], on='CUST_NO', how='left')

# Remember which rows are missing BEFORE the fill. Deriving the flag from
# "ELEC_TYPE == 0" after the fill cannot tell a genuine 0 code from a value
# that was missing.
elec_type_missing = df.ELEC_TYPE.isnull()
df.fillna(0, inplace=True)

# First character of the code's string form.
df['head_of_ELEC_TYPE'] = df.ELEC_TYPE.apply(lambda x: str(x)[0])

# Missing-value indicator.
df['is_ELEC_TYPE_NaN'] = elec_type_missing.astype(np.int64)

# 1. Label encoding
from sklearn.preprocessing import LabelEncoder
df['label_encoder_ELEC_TYPE'] = LabelEncoder().fit_transform(df['ELEC_TYPE'])

# 2. Share of label == 1 rows per ELEC_TYPE, computed on labelled rows only
#    (label == -1 marks test rows). One groupby replaces the per-value loop
#    that rescanned the frame for every distinct code.
train = df[df.label != -1]
is_positive = train.label == 1
ratio = is_positive.groupby(train.ELEC_TYPE).mean().to_dict()
df['ratio_ELEC_TYPE'] = df.ELEC_TYPE.map(ratio)
# Codes seen only in test rows have no ratio; they become 0 here.
df.fillna(0, inplace=True)

df[['ratio_ELEC_TYPE', 'head_of_ELEC_TYPE']].head()

在這裏插入圖片描述

# Expand the leading character into indicator columns, then drop the raw code
# and the helper column.
temp = pd.get_dummies(df['head_of_ELEC_TYPE'], prefix='onehot_head_of_ELEC_TYPE')
df = pd.concat([df, temp], axis=1).drop(['ELEC_TYPE', 'head_of_ELEC_TYPE'], axis=1)

2.5 城市編碼

df = df.merge(jobinfo[['CUST_NO', 'CITY_ORG_NO']], on='CUST_NO', how='left')

# Share of label == 1 rows per CITY_ORG_NO, computed on labelled rows only
# (label == -1 marks test rows). groupby skips missing codes, whereas the
# per-value loop divided by zero for a NaN code (NaN never equals itself, so
# its denominator was 0); it also rescanned the frame for every distinct code.
train = df[df.label != -1]
is_positive = train.label == 1
ratio = is_positive.groupby(train.CITY_ORG_NO).mean().to_dict()
df['ratio_CITY_ORG_NO'] = df.CITY_ORG_NO.map(ratio)

# One-hot encode the city code, then drop the raw column.
temp = pd.get_dummies(df.CITY_ORG_NO, prefix='onehot_CITY_ORG_NO')
df = pd.concat([df, temp], axis=1)
df.drop(['CITY_ORG_NO'], axis=1, inplace=True)

2.6 收費信息表數據

train_flow = pd.read_csv(data_path + '/' + file_flow_train, sep='\t')
test_flow = pd.read_csv(data_path + '/' + file_flow_test, sep='\t')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
flow = pd.concat([train_flow, test_flow])
# The fee table calls the customer key CONS_NO; rename it so it matches df.
flow.rename(columns={'CONS_NO': 'CUST_NO'}, inplace=True)
flow.drop_duplicates(inplace=True)
# Only customers that are still in df are of interest.
flow = flow.loc[flow.CUST_NO.isin(df.CUST_NO)].copy()
print(flow.shape)
flow.head()

在這裏插入圖片描述

# Replace negative entries in the quantity/amount columns by their absolute
# value. Series.abs() does this in one vectorised pass instead of calling a
# Python lambda once per row.
for amount_col in ('T_PQ', 'RCVBL_AMT', 'RCVED_AMT', 'OWE_AMT'):
    flow[amount_col] = flow[amount_col].abs()

# Not every customer has rows in the fee table: flag the ones that do.
in_fee_table = df.CUST_NO.isin(flow.CUST_NO)
df['has_biao9'] = in_fee_table.astype(np.int64)

# Number of fee rows per customer (NaN for customers without any).
fee_rows_per_customer = flow.groupby('CUST_NO').size()
df['counts_of_09flow'] = df.CUST_NO.map(fee_rows_per_customer)
df[['CUST_NO', 'counts_of_09flow']].head()

在這裏插入圖片描述

三. 構建統計特徵

from numpy import log

# Group the fee rows by customer once. The original rebuilt this grouping for
# every single feature line.
flow_by_cust = flow.groupby('CUST_NO')

# Statistics computed for each amount column, in output-column order.
_STATS = ('sum', 'mean', 'max', 'min', 'std')


def _log_stat(column, stat):
    """Return log(1 + per-customer `stat` of flow[`column`]) aligned to df rows.

    Customers without fee rows are absent from the aggregate and map to NaN.
    """
    per_customer = flow_by_cust[column].agg(stat)
    return log(df.CUST_NO.map(per_customer) + 1)


# Receivable amount (RCVBL_AMT)
for stat in _STATS:
    df[stat + '_yingshoujine'] = _log_stat('RCVBL_AMT', stat)

# Received amount (RCVED_AMT)
df['sum_shishoujine'] = _log_stat('RCVED_AMT', 'sum')

# How much was underpaid. Both operands are already log-transformed, so this
# is a difference of logs rather than a raw amount.
df['qianfei'] = df['sum_yingshoujine'] - df['sum_shishoujine']

# Total electricity quantity (T_PQ)
for stat in _STATS:
    df[stat + '_T_PQ'] = _log_stat('T_PQ', stat)

# Electricity fee amount (OWE_AMT)
for stat in _STATS:
    df[stat + '_OWE_AMT'] = _log_stat('OWE_AMT', stat)

# Gap between the OWE_AMT total and the receivable total (both log-scaled)
df['dianfei_chae'] = df['sum_OWE_AMT'] - df['sum_yingshoujine']

# Receivable penalty (RCVBL_PENALTY)
for stat in _STATS:
    df[stat + '_RCVBL_PENALTY'] = _log_stat('RCVBL_PENALTY', stat)

# Received penalty (RCVED_PENALTY)
for stat in _STATS:
    df[stat + '_RCVED_PENALTY'] = _log_stat('RCVED_PENALTY', stat)

# Gap between receivable and received penalty totals (both log-scaled)
df['chaduoshao_weiyuejin'] = df['sum_RCVBL_PENALTY'] - df['sum_RCVED_PENALTY']

# Number of distinct RCVBL_YM values (months) each customer has records for
df['nunique_RCVBL_YM'] = df.CUST_NO.map(flow_by_cust.RCVBL_YM.nunique())

# Average number of fee rows per month
df['mean_RCVBL_YM'] = df['counts_of_09flow'] / df['nunique_RCVBL_YM']

# The grouping object holds a reference to `flow`, so it has to go as well
# for the memory to be released.
del train_flow, test_flow, flow, flow_by_cust

存下特徵

import os
import pickle

# NOTE(review): the original created r'..\數據挖掘\電費' but wrote the pickle into
# r'..\電費', a directory it never created. Both directories are ensured here
# so the write cannot fail on a missing folder and the previously created
# folder still appears; confirm which location is the intended one.
legacy_dir = r'..\數據挖掘\電費'
feature_dir = r'..\電費'
feature_path = r'..\電費\statistical_features_1.pkl'

# os.path.isdir accepts relative paths as well; they are resolved against the
# current working directory.
for directory in (legacy_dir, feature_dir):
    if not os.path.isdir(directory):
        os.makedirs(directory)

# The with-block closes the file even if pickling fails.
with open(feature_path, 'wb') as feature_file:
    pickle.dump(df, feature_file)

# Report success only after the file has actually been written.
print('統計特徵搞定!')

統計特徵搞定!

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章