基於客戶提取爲所屬客戶經理的信息

本地處理

#!/usr/bin/python
# -*- coding: utf-8 -*-

# UnicodeDecodeError: 'utf8' codec can't decode byte 0x9a in position 12的暫時解決方法——修改默認encoding
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *  # 爲使用dataframe的方法
import re
from pyspark.sql.types import *
import datetime
import pandas as pd


# Create (or reuse) a local-mode Spark session for the "miniProject" app.
# SparkConf is never imported in this file, so the original SparkConf() call
# raised NameError; the builder API needs only the SparkSession import.
spark = SparkSession.builder.appName("miniProject").master("local").getOrCreate()
sc1 = spark.sparkContext
# Kept so later code that refers to `conf` still finds the active settings.
conf = sc1.getConf()

1. 客戶經理信息簡單查詢

# Account-manager data: the CSV has already been copied to the local disk.
# NOTE(review): the region-code columns load as mixed-type `object`; consider
# passing dtype=str for them so the later joins on string codes match.
cm_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/secret.csv', sep=',', encoding='utf-8')
# Drop the first and last columns by position. `.iloc` replaces the `.ix`
# indexer, which was deprecated and then removed in pandas 1.0.
cm_df = cm_df.iloc[:, 1:-1]
cm_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104467 entries, 0 to 104466
Data columns (total 12 columns):
staff_id             104467 non-null int64
staff_name           104467 non-null object
user_id_card_edit    104467 non-null object
id_county            104431 non-null object
id_province          104431 non-null object
id_city              104431 non-null object
id                   104467 non-null int64
role_id              104467 non-null int64
presona_id           104467 non-null int64
status               104467 non-null int64
city_number          104467 non-null int64
num                  104467 non-null int64
dtypes: int64(7), object(5)
memory usage: 9.6+ MB
cm_df.describe()
staff_id id role_id presona_id status city_number num
count 104467.000000 104467.000000 104467.000000 104467.000000 104467.000000 104467.0 104467.000000
mean 191295.747298 191295.747298 58.994228 3135.484143 1.857151 0.0 -0.000029
std 78064.637688 78064.637688 0.400920 2029.885308 0.349920 0.0 0.031703
min 218.000000 218.000000 0.000000 43.000000 1.000000 0.0 -4.000000
25% 162123.500000 162123.500000 59.000000 1603.000000 2.000000 0.0 0.000000
50% 206533.000000 206533.000000 59.000000 2374.000000 2.000000 0.0 0.000000
75% 247621.500000 247621.500000 59.000000 4828.000000 2.000000 0.0 0.000000
max 294921.000000 294921.000000 155.000000 9002.000000 2.000000 0.0 4.000000
# Region code table (columns xzqhdm and xzqhdm_name).
# NOTE(review): the original note said this CSV was uploaded to the cluster
# and stored in the HDFS root, but the path below is a local file:// path —
# confirm which location is current.
ctcode_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/xz.csv', sep=',', encoding='utf-8')
ctcode_df.head()
ctcode_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3219 entries, 0 to 3218
Data columns (total 2 columns):
xzqhdm         3219 non-null int64
xzqhdm_name    3219 non-null object
dtypes: int64(1), object(1)
memory usage: 50.4+ KB
ctcode_df['xzqhdm'] = ctcode_df['xzqhdm'].astype(str)
ctcode_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3219 entries, 0 to 3218
Data columns (total 2 columns):
xzqhdm         3219 non-null object
xzqhdm_name    3219 non-null object
dtypes: object(2)
memory usage: 50.4+ KB
# Join: left-join the code table three times to attach a place name for the
# county, province and city code of every manager.
# First join (id_county): adds xzqhdm / xzqhdm_name.
tmp_county = pd.merge(left=cm_df, right=ctcode_df, left_on='id_county', right_on='xzqhdm', how='left')


# Second join (id_province): the column names now collide, so pandas applies
# its default suffixes — *_x is the county join, *_y is the province join.
tmp_province = pd.merge(left=tmp_county, right=ctcode_df, left_on='id_province', right_on='xzqhdm', how='left')

# Third join (id_city): no collision any more, so the city columns stay
# unsuffixed (xzqhdm / xzqhdm_name).
tmp_city = pd.merge(left=tmp_province, right=ctcode_df, left_on='id_city', right_on='xzqhdm', how='left')

# Only the names are needed; drop the three duplicated code columns.
tmp_city.drop(['xzqhdm_x','xzqhdm_y','xzqhdm'], axis=1, inplace=True)
tmp_city.head()
# NOTE(review): the name columns end up with far fewer non-null values than
# the code columns they were looked up from — presumably a key dtype mismatch
# (mixed int/str codes vs. str xzqhdm); verify against the raw CSV.
tmp_city.count()
staff_id             104467
staff_name           104467
user_id_card_edit    104467
id_county            104431
id_province          104431
id_city              104431
id                   104467
role_id              104467
presona_id           104467
status               104467
city_number          104467
num                  104467
xzqhdm_name_x         45367
xzqhdm_name_y         65507
xzqhdm_name           51701
dtype: int64
# 不同省份的人數
tmp_city.groupby(['xzqhdm_name_y'])['staff_id'].count()
xzqhdm_name_y
上海市          240
雲南省         1699
內蒙古自治區      1731
北京市           79
吉林省         2200
四川省         4520
天津市          274
寧夏回族自治區      369
安徽省         3906
山東省         5584
山西省         1285
廣東省         3889
廣西壯族自治區     1823
新疆維吾爾自治區      55
江蘇省         4799
江西省         2386
河北省         3110
河南省         4545
浙江省         2202
海南省          337
湖北省         3596
湖南省         2706
甘肅省         1031
福建省         3204
西藏自治區          2
貴州省         1230
遼寧省         2562
重慶市         1347
陝西省         1929
青海省          132
黑龍江省        2735
Name: staff_id, dtype: int64
# Manager head-count per place name, largest first.
# NOTE(review): xzqhdm_name_x comes from the id_county join and the
# unsuffixed xzqhdm_name from the id_city join, so dis_ct_city actually holds
# county-level counts and dis_ct_county holds city-level counts — the two
# variable names look swapped; confirm before relabelling plots.
dis_ct_prov = tmp_city.groupby(['xzqhdm_name_y'])['staff_id'].count().sort_values(ascending=False) 
dis_ct_city = tmp_city.groupby(['xzqhdm_name_x'])['staff_id'].count().sort_values(ascending=False) 
dis_ct_county = tmp_city.groupby(['xzqhdm_name'])['staff_id'].count().sort_values(ascending=False) 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei']  
mpl.rcParams['axes.unicode_minus'] = False  

fig = plt.figure()
dis_ct_prov.plot(kind='bar', )
plt.title('province')
plt.show()
<IPython.core.display.Javascript object>
dis_ct_prov[:10]
xzqhdm_name_y
山東省     5584
江蘇省     4799
河南省     4545
四川省     4520
安徽省     3906
廣東省     3889
湖北省     3596
福建省     3204
河北省     3110
黑龍江省    2735
Name: staff_id, dtype: int64
fig2 = plt.figure()
dis_ct_city.plot(kind='bar', )
plt.title('city')
plt.show()
<IPython.core.display.Javascript object>
dis_ct_city[:10]
xzqhdm_name_x
市中區    241
惠安縣    187
睢寧縣    177
寧海縣    164
沭陽縣    161
灌雲縣    159
興化市    148
仙遊縣    142
臨泉縣    135
沛縣     134
Name: staff_id, dtype: int64
fig3 = plt.figure()
dis_ct_county.plot(kind='bar', )
plt.title('city')
plt.show()
<IPython.core.display.Javascript object>
dis_ct_county[:10]
xzqhdm_name
徐州市    892
阜陽市    618
泉州市    604
漳州市    566
濰坊市    556
淮安市    526
鹽城市    521
湛江市    509
南充市    508
宿遷市    505
Name: staff_id, dtype: int64
# Inspect the managers' employment status (1 = currently employed).
# The original passed the bare name `staff_id` to count(), which is an
# undefined variable (NameError); select the column, then count per status.
tmp_city.groupby('status')['staff_id'].count()

2. 客戶信息處理

#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

# from pyspark.sql import functions as F

def fraud_normal(already_num,cd):
    """Classify one client record as fraud, normal, or to be dropped.

    Args:
        already_num: value compared against 4 (presumably the number of
            instalments already repaid — confirm against the data dictionary).
        cd: value compared against 2 (presumably an overdue indicator —
            confirm against the data dictionary).

    Returns:
        'loan_fraud' when cd >= 2; otherwise 'loan_normal' when
        already_num > 4, and 'drop' in every remaining case.
    """
    # The cd threshold takes priority over the repayment count.
    if cd >= 2:
        return 'loan_fraud'
    return 'loan_normal' if already_num > 4 else 'drop'
# Fraud_normal=F.udf(fraud_normal)

# Load the prepared client table; the CSV has already been copied to the
# local disk.
cl_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/datasource/client.csv', sep=',', encoding='utf-8')
# Drop the first and last columns by position. `.iloc` replaces the `.ix`
# indexer, which was deprecated and then removed in pandas 1.0.
cl_df = cl_df.iloc[:, 1:-1]
cl_df[['cd', 'already_num']].sort_values(['cd', 'already_num']).head()
cd already_num
160569 -8.0 17
114941 -3.0 10
1041 0.0 0
1176 0.0 0
2354 0.0 0
# Peek at the loan_type == 2 rows, the only ones that get labelled below.
cl_df[cl_df['loan_type']==2].head()
# fraud_normal is applied to the loan_type == 2 subset only; the result is
# assigned back by index alignment, so every other row receives NaN in 'flag'.
cl_df['flag'] = cl_df[cl_df['loan_type']==2].apply(lambda x: fraud_normal(x['already_num'],x['cd']), axis=1)
# Removes rows labelled 'drop'. NaN != 'drop' evaluates to True, so the
# unlabelled (NaN) rows are kept at this stage.
cl_df_flaged=cl_df[cl_df['flag']!='drop']
cl_df_flaged.columns, cl_df_flaged.count()
(Index([             u'id',          u'id_num',      u'account_id',
          u'loan_staff_id',       u'education',       u'child_sum',
                 u'is_car',  u'account_number',        u'zs_money',
                u'zipCode',          u'status',             u'amt',
              u'loan_type',              u'cd',            u'year',
        u'intopieces_date',           u'count',   u'total_account',
        u'avg_month_men_a',       u'repay_num',     u'already_num',
             u'user_phone',     u'presona_pid',            u'flag'],
       dtype='object'), id                 771956
 id_num             771956
 account_id         771956
 loan_staff_id      771956
 education          771956
 child_sum          771806
 is_car             771956
 account_number     771956
 zs_money           771956
 zipCode            254976
 status             771956
 amt                771956
 loan_type          771956
 cd                 768909
 year               771956
 intopieces_date    771956
 count              771956
 total_account      771956
 avg_month_men_a    771956
 repay_num          771956
 already_num        771956
 user_phone         752550
 presona_pid        751350
 flag               271373
 dtype: int64)

3. 鏈接經理和客戶

def et_id_sex(x):
    """Extract the sex encoded in a resident ID number.

    Args:
        x: ID number as a string.

    Returns:
        0 (female) when the sex digit is even, 1 (male) when it is odd,
        or None when the length is neither 18 nor 15.
    """
    id_len = len(x)
    if id_len == 18:
        # 18-character IDs: the 17th character carries the sex.
        sex_digit = x[16]
    elif id_len == 15:
        # 15-character IDs: the last character carries the sex.
        sex_digit = x[-1]
    else:
        return None
    return 0 if float(sex_digit) % 2 == 0 else 1
import datetime

def et_id_age(x):
    """Derive an age from the birth date embedded in a resident ID number.

    Args:
        x: ID number as a string.

    Returns:
        Whatever calculate_age returns for the embedded birth date, or None
        when the length is neither 18 nor 15.
    """
    if len(x) == 18:
        # Characters 7-14 are the birth date as YYYYMMDD. The original slice
        # x[6:15] was one character too long.
        return calculate_age(x[6:14])

    if len(x) == 15:
        # 15-character (first-generation) IDs store the birth date as YYMMDD
        # with an implied 19xx century. calculate_age expects 8 characters,
        # so the bare 6-character slice made it fail on an empty day field.
        return calculate_age('19' + x[6:12])

    return None

def calculate_age(input_born, today=(2018, 5, 10)):
    """Return an age rounded to the nearest year from a YYYYMMDD string.

    The age is the difference in calendar years, adjusted by comparing the
    day-of-year of the birth date with that of the reference date: a gap of
    183 days or more moves the result one year up (birthday long past) or
    one year down (birthday still far ahead).

    Args:
        input_born: birth date as a string; only the first 8 characters
            (YYYYMMDD) are read.
        today: reference date as a (year, month, day) sequence.

    Returns:
        The rounded age as an int, or None when the birth date is malformed,
        is not a real calendar date, or is not strictly before `today`
        (in that last case 'error_date' is also printed).
    """
    try:
        month = int(input_born[4:6])
        day = int(input_born[6:8])
    except (TypeError, ValueError):
        # Non-numeric or truncated input: treat like any other invalid date.
        return None

    if not (1 <= month <= 12 and 1 <= day <= 31):
        return None

    try:
        born = datetime.date(int(input_born[0:4]), month, day)
    except (TypeError, ValueError):
        # Impossible dates (e.g. Feb 30, year 0) used to abort the caller's
        # whole .apply(); report them as missing instead.
        return None

    today = datetime.date(today[0], today[1], today[2])

    if today <= born:
        print('error_date')
        return None

    years = today.year - born.year
    # Day-of-year difference; same value as subtracting the previous Dec 31
    # from each date, but also valid for year 1.
    sub_days = today.timetuple().tm_yday - born.timetuple().tm_yday

    if sub_days >= 0:
        # Birthday already passed this year.
        return years if sub_days < 183 else years + 1

    # Birthday still to come this year.
    return years if -sub_days < 183 else years - 1

3.1 客戶經理數據讀入與處理

# Region code table, with the code converted to a string key.
# NOTE(review): the original note said this CSV lives in the cluster's HDFS
# root, but the path is a local one — confirm.
ctcode_df = pd.read_csv('/home/hadoop/xxx/project_0509_khjl/datasource/xz.csv', sep=',', encoding='utf-8')
ctcode_df['xzqhdm'] = ctcode_df['xzqhdm'].astype(str)

# Account-manager data (CSV already copied to the local disk).
# The three region-code columns are read as strings: left to type inference
# they load with mixed int/str values (pandas emits a DtypeWarning), and the
# non-string codes can never match the string keys of ctcode_df in the joins.
cm_df = pd.read_csv('/home/hadoop/xxx/project_0509_khjl/datasource/manager.csv', sep=',', encoding='utf-8',
                    dtype={'id_county': str, 'id_province': str, 'id_city': str})
# Drop the first and last columns by position (.ix was removed in pandas 1.0).
cm_df = cm_df.iloc[:, 1:-1]

# Attach place names: the county join yields *_x, the province join *_y,
# and the city join the unsuffixed xzqhdm / xzqhdm_name columns.
tmp_county = pd.merge(left=cm_df, right=ctcode_df, left_on='id_county', right_on='xzqhdm', how='left')

tmp_province = pd.merge(left=tmp_county, right=ctcode_df, left_on='id_province', right_on='xzqhdm', how='left')

tmp_city = pd.merge(left=tmp_province, right=ctcode_df, left_on='id_city', right_on='xzqhdm', how='left')

# Sex derived from the ID number.
tmp_city['sex'] = tmp_city['user_id_card_edit'].apply(et_id_sex)

# Age derived from the ID number.
tmp_city['age'] = tmp_city['user_id_card_edit'].apply(et_id_age)

# Rename the manager's employment-status column so it does not clash with
# the client table's own `status` column after the later join.
tmp_city.rename(columns={u'status':'status_m'}, inplace=True)

cm_df_location = tmp_city.drop(['xzqhdm_x','xzqhdm_y','xzqhdm'], axis=1, inplace=False)
/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)


error_date

3.2 鏈接經理與客戶信息

# Left join from managers to their clients (staff_id == loan_staff_id):
# a manager with several clients appears once per client, and a manager with
# no matching client is kept with NaN in all client columns.
joint_cm_cl = pd.merge(left=cm_df_location, right=cl_df_flaged, left_on='staff_id', right_on='loan_staff_id', how='left')
joint_cm_cl.tail()
cl_df_flaged['loan_staff_id'].dtypes
dtype('int64')
cm_df_location['staff_id'].dtypes
dtype('int64')
cm_df_location.sort_values('staff_id').tail()
cl_df_flaged.sort_values('loan_staff_id').tail()
joint_cm_cl.count()
staff_id             740250
staff_name           740250
user_id_card_edit    740250
id_county            740214
id_province          740214
id_city              740214
id_x                 740250
role_id              740250
presona_id           740250
status_m             740250
city_number          740250
num                  740250
xzqhdm_name_x        314714
xzqhdm_name_y        461990
xzqhdm_name          362371
sex                  740214
age                  740122
id_y                 690472
id_num               690472
account_id           690472
loan_staff_id        690472
education            690472
child_sum            690359
is_car               690472
account_number       690472
zs_money             690472
zipCode              221233
status               690472
amt                  690472
loan_type            690472
cd                   687819
year                 690472
intopieces_date      690472
count                690472
total_account        690472
avg_month_men_a      690472
repay_num            690472
already_num          690472
user_phone           690472
presona_pid          689456
flag                 236757
dtype: int64
cl_df_flaged.count()
id                 771956
id_num             771956
account_id         771956
loan_staff_id      771956
education          771956
child_sum          771806
is_car             771956
account_number     771956
zs_money           771956
zipCode            254976
status             771956
amt                771956
loan_type          771956
cd                 768909
year               771956
intopieces_date    771956
count              771956
total_account      771956
avg_month_men_a    771956
repay_num          771956
already_num        771956
user_phone         752550
presona_pid        751350
flag               271373
dtype: int64
joint_cm_cl.groupby('flag').agg('count')
staff_id staff_name user_id_card_edit id_county id_province id_city id_x role_id presona_id status_m ... cd year intopieces_date count total_account avg_month_men_a repay_num already_num user_phone presona_pid
flag
loan_fraud 64186 64186 64186 64186 64186 64186 64186 64186 64186 64186 ... 64186 64186 64186 64186 64186 64186 64186 64186 64186 64104
loan_normal 172571 172571 172571 172571 172571 172571 172571 172571 172571 172571 ... 172569 172571 172571 172571 172571 172571 172571 172571 172571 172352

2 rows × 40 columns

# Spot-check which rows in this label range carry a flag. `.loc` slices by
# index label with both ends included, as `.ix` did here before it was
# removed in pandas 1.0.
joint_cm_cl.loc[584100:584129,'flag'].notnull()
584100     True
584101     True
584102    False
584103    False
584104     True
584105    False
584106     True
584107    False
584108    False
584109    False
584110     True
584111    False
584112    False
584113     True
584114    False
584115    False
584116    False
584117    False
584118    False
584119    False
584120    False
584121    False
584122     True
584123    False
584124     True
584125    False
584126     True
584127    False
584128     True
584129     True
Name: flag, dtype: bool

3.3 選取感興趣數據一

feature_use1 = ['staff_id','id_province', 'sex', 'status_m', 'age', 'num', 'presona_id', 'city_number','flag']
dataset_use1 = joint_cm_cl.loc[:, feature_use1]
dataset_use1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 740250 entries, 0 to 740249
Data columns (total 9 columns):
staff_id       740250 non-null int64
id_province    740214 non-null object
sex            740214 non-null float64
status_m       740250 non-null int64
age            740122 non-null float64
num            740250 non-null int64
presona_id     740250 non-null int64
city_number    740250 non-null int64
flag           236757 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 56.5+ MB
dataset_use1.describe()
/home/hadoop/anaconda2/lib/python2.7/site-packages/numpy/lib/function_base.py:4291: RuntimeWarning: Invalid value encountered in percentile
  interpolation=interpolation)
staff_id sex status_m age num presona_id city_number
count 740250.000000 740214.000000 740250.000000 740122.000000 740250.000000 740250.000000 740250.0
mean 206828.474852 0.578088 1.715630 28.350092 -0.000009 3265.831038 0.0
std 49171.210610 0.493865 0.451114 4.882471 0.018777 2031.866055 0.0
min 218.000000 0.000000 1.000000 18.000000 -4.000000 43.000000 0.0
25% 184968.000000 NaN 1.000000 NaN 0.000000 1676.000000 0.0
50% 214036.000000 NaN 2.000000 NaN 0.000000 2457.000000 0.0
75% 238753.000000 NaN 2.000000 NaN 0.000000 5110.000000 0.0
max 294921.000000 1.000000 2.000000 1716.000000 4.000000 9002.000000 0.0

經驗證,其中sex=NaN的客戶經理,溯源其身份證,通過函數驗證身份證信息屬假的,故考慮去除sex=NaN的客戶經理信息(相應的flag也是NaN)

# Drop observations whose sex is NaN.
dataset_use1 = dataset_use1[dataset_use1['sex'].notnull()]
# Drop observations whose flag is NaN. The original note attributes these to
# cd or already_num being None when the flag was generated.
# NOTE(review): flag is only assigned for loan_type == 2 rows and the
# manager/client join is a left join, so those look like the likelier
# sources of NaN flags — confirm.
dataset_use1 = dataset_use1[dataset_use1['flag'].notnull()]
# Drop observations whose age is NaN (inspect them first, then filter).
dataset_use1[dataset_use1['age'].isnull()]
dataset_use1 = dataset_use1[dataset_use1['age'].notnull()]
# Sanity check: no NaN flags should remain, so this should be empty.
dataset_use1[dataset_use1['flag'].isnull()].groupby('staff_id').count()['sex']
Series([], Name: sex, dtype: int64)
# 計算客戶經理對應的客戶的違約情況,0-1,無違約,100%違約。獲取客戶經理對應的客戶數量
def groupby_calcu(data_df, goal_id='staff_id', flag_id='flag'):
    """Collapse a per-client table to one row per `goal_id` with fraud stats.

    Args:
        data_df: DataFrame with at least the columns `goal_id`, `flag_id`
            and 'sex' (the non-null 'sex' values are what is counted).
        goal_id: column identifying the group (one output row per value).
        flag_id: column holding the label; rows equal to 'loan_fraud' are
            counted as fraud.

    Returns:
        DataFrame sorted by `goal_id`, containing the first row of every
        group from `data_df` plus two columns:
        '<flag_id>_perc' (float32, fraud rows / all rows, 0 to 1) and
        'num_client' (int32, number of rows in the group).
    """
    ratio_col = '%s_perc' % (flag_id)

    # Rows per group, and fraud rows per group (absent groups count as 0).
    totals = data_df.groupby(goal_id)['sex'].count()
    fraud_rows = data_df[data_df[flag_id] == 'loan_fraud']
    frauds = fraud_rows.groupby(goal_id)['sex'].count()

    stats = pd.DataFrame({'num_client': totals.astype('int32')})
    fraud_count = frauds.reindex(stats.index).fillna(0).astype('float32')

    # One vectorised division replaces the per-row `.ix` loop: `.ix` no
    # longer exists in pandas >= 1.0 and the loop was O(n) Python-level work.
    stats[ratio_col] = (fraud_count / stats['num_client']).astype('float32')

    # reset_index() turns the group key into an ordinary column, so the frame
    # does not carry `goal_id` as both index name and column (which pandas
    # rejects as ambiguous when merging).
    stats = stats.reset_index()[[goal_id, ratio_col, 'num_client']]

    unique_data_df = data_df.drop_duplicates(goal_id).sort_values(goal_id)
    return pd.merge(left=unique_data_df, right=stats, on=goal_id, how='left')
 
# Aggregate to one row per manager. The original passed
# dataset_use1.ix[:,], a select-everything slice through the `.ix` indexer
# that was removed in pandas 1.0; passing the frame itself is equivalent.
dataset_use2 = groupby_calcu(dataset_use1)
dataset_use2.head()
staff_id id_province sex status_m age num presona_id city_number flag flag_perc num_client
0 4735 350000 0.0 2 28.0 0 112 0 loan_fraud 1.0 1
1 4857 620000 0.0 2 29.0 0 43 0 loan_fraud 1.0 2
2 5365 340000 1.0 2 32.0 0 141 0 loan_normal 0.0 2
3 5373 410000 1.0 2 28.0 0 142 0 loan_normal 0.0 2
4 5910 410000 0.0 2 25.0 0 1110 0 loan_normal 0.5 4
# w_num1 = dataset_use1.groupby('staff_id').count()['sex']
# w_num1 = w_num1.rename('whole_count')
# f_num1 = dataset_use1[dataset_use1['flag']=='fraud_count'].groupby('staff_id').count()['sex']
# f_num1 = f_num1.rename('f_count')

# wf_df = pd.concat([w_num1, f_num1], axis=1, join='outer')
# wf_df['f_count'] = wf_df['f_count'].fillna(value=int(0), inplace=False)

# wf_df['whole_count'] = wf_df['whole_count'].astype('int32')
# wf_df['f_count'] = wf_df['f_count'].astype('float32')
# wf_df.dtypes
dataset_use2.describe()
staff_id sex status_m age num presona_id city_number flag_perc num_client
count 39374.000000 39374.000000 39374.000000 39374.000000 39374.000000 39374.000000 39374.0 39374.000000 39374.000000
mean 219081.539468 0.616727 1.799055 27.900797 0.000000 3240.945421 0.0 0.268053 6.012851
std 42961.619859 0.486190 0.400712 4.126361 0.014254 2052.084694 0.0 0.313801 7.872544
min 4735.000000 0.000000 1.000000 19.000000 -1.000000 43.000000 0.0 0.000000 1.000000
25% 196255.250000 0.000000 2.000000 25.000000 0.000000 1676.000000 0.0 0.000000 1.000000
50% 226812.500000 1.000000 2.000000 28.000000 0.000000 2449.000000 0.0 0.181818 3.000000
75% 249347.750000 1.000000 2.000000 30.000000 0.000000 4961.000000 0.0 0.444444 7.000000
max 286704.000000 1.000000 2.000000 52.000000 2.000000 9002.000000 0.0 1.000000 103.000000
# Encode the flag label as 0/1
def flag_pd(x):
    """Encode a flag label as an integer.

    Returns 1 when x equals 'loan_normal' and 0 for any other value.
    """
    return 1 if x == 'loan_normal' else 0

dataset_use2['flag_01'] = dataset_use2['flag'].apply(flag_pd)

3.3選取感興趣的字段二——爲建模準備

# Keep the modelling columns; reset_index() also adds the old index as an
# 'index' column.
dataset_use3 = dataset_use2[['sex','age','num_client', 'status_m', 'presona_id', 'id_province', 'flag_perc']].reset_index()
# Binarise the fraud ratio: <= 0.5 -> 0, > 0.5 -> 1.
# The original used chained indexing (df[mask]['col'] = v), which assigns
# into a temporary copy and leaves dataset_use3 unchanged (pandas raises
# SettingWithCopyWarning); .loc writes to the frame itself.
dataset_use3.loc[dataset_use3["flag_perc"] <= 0.5, 'flag_perc'] = 0
dataset_use3.loc[dataset_use3["flag_perc"] > 0.5, 'flag_perc'] = 1
/home/hadoop/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/hadoop/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
dataset_use3.dtypes, dataset_use3.head()
(index            int64
 sex            float64
 age            float64
 num_client       int32
 status_m         int64
 presona_id       int64
 id_province     object
 flag_perc      float32
 dtype: object,
    index  sex   age  num_client  status_m  presona_id id_province  flag_perc
 0      0  0.0  28.0           1         2         112      350000        1.0
 1      1  0.0  29.0           2         2          43      620000        1.0
 2      2  1.0  32.0           2         2         141      340000        0.0
 3      3  1.0  28.0           2         2         142      410000        0.0
 4      4  0.0  25.0           4         2        1110      410000        0.5)
# Cast categorical features to str so get_dummies one-hot encodes them;
# age stays numeric.
dataset_use3['sex'] = dataset_use3['sex'].astype(int).astype(str)
dataset_use3['age'] = dataset_use3['age'].astype(int)
dataset_use3['status_m'] = dataset_use3['status_m'].astype(int).astype(str)
# NOTE(review): presona_id is cast to str here but is not among the columns
# passed to get_dummies below — confirm whether it was meant to be encoded.
dataset_use3['presona_id'] = dataset_use3['presona_id'].astype(str)
dataset_use3['id_province'] = dataset_use3['id_province'].astype(int).astype(str)
# astype(int) truncates toward zero, so any fractional flag_perc still
# present at this point becomes 0.
dataset_use3['flag_perc'] = dataset_use3['flag_perc'].astype(int)

# One-hot encode the string columns; numeric columns pass through unchanged.
data_tmp_onehot = pd.get_dummies(dataset_use3[['sex','age', 'num_client', 'status_m', 'id_province']])
# Re-attach the target column.
data_tmp_onehot['flag_perc'] = dataset_use3['flag_perc']
data_tmp_onehot.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39374 entries, 0 to 39373
Data columns (total 38 columns):
age                   39374 non-null int64
num_client            39374 non-null int32
sex_0                 39374 non-null float64
sex_1                 39374 non-null float64
status_m_1            39374 non-null float64
status_m_2            39374 non-null float64
id_province_110000    39374 non-null float64
id_province_120000    39374 non-null float64
id_province_130000    39374 non-null float64
id_province_140000    39374 non-null float64
id_province_150000    39374 non-null float64
id_province_210000    39374 non-null float64
id_province_220000    39374 non-null float64
id_province_230000    39374 non-null float64
id_province_310000    39374 non-null float64
id_province_320000    39374 non-null float64
id_province_330000    39374 non-null float64
id_province_340000    39374 non-null float64
id_province_350000    39374 non-null float64
id_province_360000    39374 non-null float64
id_province_370000    39374 non-null float64
id_province_400000    39374 non-null float64
id_province_410000    39374 non-null float64
id_province_420000    39374 non-null float64
id_province_430000    39374 non-null float64
id_province_440000    39374 non-null float64
id_province_450000    39374 non-null float64
id_province_460000    39374 non-null float64
id_province_500000    39374 non-null float64
id_province_510000    39374 non-null float64
id_province_520000    39374 non-null float64
id_province_530000    39374 non-null float64
id_province_610000    39374 non-null float64
id_province_620000    39374 non-null float64
id_province_630000    39374 non-null float64
id_province_640000    39374 non-null float64
id_province_650000    39374 non-null float64
flag_perc             39374 non-null int64
dtypes: float64(35), int32(1), int64(2)
memory usage: 11.3 MB

3.4繪圖與描述統計分析

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib notebook

3.4.1單變量的分佈情況

flg = plt.figure()
<IPython.core.display.Javascript object>

<img src="

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章