信用卡客戶畫像(年齡,性別,收入,支出 和信用卡種的關係)

import os
import pandas as pd

os.chdir(r'H:\2019-2-3新華書店筆記以及資料\資料\HW5')

card=pd.read_csv(r"card.csv",encoding="gbk") #讀取數據
disp=pd.read_csv(r"disp.csv",encoding="gbk")
clients=pd.read_csv(r"clients.csv",encoding="gbk")

card.head() #查看數據分佈

  card_id disp_id issued type
0 1005 9285 1993-11-07 普通卡
1 104 588 1994-01-19 普通卡
2 747 4915 1994-02-05 普通卡
3 70 439 1994-02-08 普通卡
4 577 3687 1994-02-15 普通卡

disp.head()

  disp_id client_id account_id type
0 1 1 1 所有者
1 2 2 2 所有者
2 3 3 2 用戶
3 4 4 3 所有者
4 5 5 3 用戶

clients.head()

  client_id sex birth_date district_id
0 1 1970-12-13 18
1 2 1945-02-04 1
2 3 1940-10-09 1
3 4 1956-12-01 5
4 5 1960-07-03 5

import sqlite3 # sqlite3相當於輕量版,更多功能可使用SQLAlchemy

con = sqlite3.connect(':memory:') # 數據庫連接,使用sql連接查詢表

card.to_sql('card', con)

disp.to_sql('disp', con)

clients.to_sql('clients', con)

car_sql='''select a.*,c.sex,c.birth_date,c.district_id from card a left join disp b on a.disp_id=b.disp_id left join clients c on b.client_id=c.client_id where b.type="所有者"'''

card_t=pd.read_sql(car_sql, con)

card_t.head()

  index card_id disp_id issued type sex birth_date district_id
0 0 1005 9285 1993-11-07 普通卡 1968-01-28 74
1 1 104 588 1994-01-19 普通卡 1960-10-20 61
2 2 747 4915 1994-02-05 普通卡 1963-07-19 40
3 3 70 439 1994-02-08 普通卡 1968-09-12 51
4 4 577 3687 1994-02-15 普通卡 1972-02-06 49

from pylab import mpl

mpl.rcParams['font.sans-serif'] = ['SimHei'] # 指定默認字體

mpl.rcParams['axes.unicode_minus'] = False # 解決保存圖像是負號'-'顯示爲方塊的問題

card_t['type'].value_counts().plot(kind='pie',autopct='%.1f%%') #查看數據分佈 餅圖

<matplotlib.axes._subplots.AxesSubplot at 0x167ec390>

 

#1、髮卡趨勢
#總體髮卡趨勢
from  datetime  import  *  
card_t['issued_date']=pd.to_datetime(card_t['issued'])
card_t['issued_year']=card_t['issued_date'].map(lambda x:x.year)
card_t.card_id.groupby(card_t['issued_year']).count().plot(kind="bar") #年齡和髮卡的關係
<matplotlib.axes._subplots.AxesSubplot at 0x9354358>

#2、不同卡的分佈

card_t.type.value_counts().plot(kind="pie",autopct='%.1f%%') #卡種分佈

<matplotlib.axes._subplots.AxesSubplot at 0x93aa8d0>

 

pd.crosstab(card_t.issued_year,card_t.type).plot(kind = 'bar') #卡種和年齡的關係分佈
<matplotlib.axes._subplots.AxesSubplot at 0x940cb00>

 

t1=pd.crosstab(card_t.issued_year,card_t.type) #卡種和年齡佔比對比

t1["sum1"]=t1.sum(1)

t2=t1.div(t1.sum1,axis = 0)

t2.drop("sum1",1).plot(kind = 'bar',stacked= True)

<matplotlib.axes._subplots.AxesSubplot at 0xa8b5d30>

 

import matplotlib.pyplot as plt
labels=["青年卡","普通卡","金卡"]
y1=t1.loc[:,"青年卡"]#.astype('int')
y2=t1.loc[:,"普通卡"]#.astype('int')
y3=t1.loc[:,"金卡"]#.astype('int')
x=t1.index#.astype('int')
plt.stackplot(x,y1,y2,y3,labels = labels) # 圖表類型
plt.title('髮卡趨勢')
plt.ylabel('髮卡量')
plt.legend(loc = 'upper left')# 標籤位置
plt.show()

 

?plt.stackplot(x,y1,y2,y3,labels = labels) 
pd.crosstab(card_t.type,card_t.sex).plot(kind='bar')#性別和卡種數量對比
<matplotlib.axes._subplots.AxesSubplot at 0xa9960b8>

 

xb=pd.crosstab(card_t.type,card_t.sex) #性別和卡種百分比對比
xb.div(xb.sum(1),axis=0).plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xaa24860>

 

from stack2dim import *
stack2dim(card_t,'type','sex') #調用之前寫的庫,查看卡種和性別關係(柱子粗細代表數量,縱座標代表百分比)

 

card_t['age']=(pd.to_datetime(card_t['issued'])-pd.to_datetime(card_t['birth_date']))
card_t['age1']=card_t['age'].map(lambda x:x.days/365)
import seaborn as sns
sns.boxplot(x='type',y='age1',data=card_t) #查看何須圖,年齡和卡種分佈
<matplotlib.axes._subplots.AxesSubplot at 0xc104940>

 


 
trans=pd.read_csv(r"trans.csv",encoding="gbk")
trans.to_sql('trans', con)
card_t.to_sql('card_t', con)
D:\anaconda\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
D:\anaconda\lib\site-packages\pandas\io\sql.py:450: UserWarning: the 'timedelta' type is not supported, and will be written as integer values (ns frequency) to the database.
  chunksize=chunksize, dtype=dtype)

 

trans.head()
  trans_id account_id date type operation amount balance k_symbol bank account
0 695247 2378 1993-01-01 信貸資金 $700 $700 NaN NaN NaN
1 171812 576 1993-01-01 信貸資金 $900 $900 NaN NaN NaN
2 207264 704 1993-01-01 信貸資金 $1,000 $1,000 NaN NaN NaN
3 1117247 3818 1993-01-01 信貸資金 $600 $600 NaN NaN NaN
4 579373 1972 1993-01-02 信貸資金 $400 $400 NaN NaN NaN

card_t.head()

  index card_id disp_id issued type sex birth_date district_id issued_date issued_year age age1
0 0 1005 9285 1993-11-07 普通卡 1968-01-28 74 1993-11-07 1993 9415 days 25.794521
1 1 104 588 1994-01-19 普通卡 1960-10-20 61 1994-01-19 1994 12144 days 33.271233
2 2 747 4915 1994-02-05 普通卡 1963-07-19 40 1994-02-05 1994 11159 days 30.572603
3 3 70 439 1994-02-08 普通卡 1968-09-12 51 1994-02-08 1994 9280 days 25.424658
4 4 577 3687 1994-02-15 普通卡 1972-02-06 49 1994-02-15 1994 8045 days 22.041096

car_sql='''

select a.card_id,a.issued,a.type,c.type as t_type,c.amount,c.balance,c.date as t_date
  from card as a
  left join disp as b on a.disp_id=b.disp_id
  left join trans as c on b.account_id=c.account_id
  where b.type="所有者"
  order by a.card_id,c.date
'''  #使用sql連接表
card_t2=pd.read_sql(car_sql, con)
card_t2.head()
  card_id issued type t_type amount balance t_date
0 1 1998-10-16 金卡 $900 $900 1996-11-24
1 1 1998-10-16 金卡 $33975 $34875 1996-12-07
2 1 1998-10-16 金卡 $6,600 $28275 1996-12-24
3 1 1998-10-16 金卡 $30 $28305 1996-12-31
4 1 1998-10-16 金卡 $22650 $50955 1997-01-07

 

card_t2['issued']=pd.to_datetime(card_t2['issued'])
card_t2['t_date']=pd.to_datetime(card_t2['t_date'])
card_t2.head()
  card_id issued type t_type amount balance t_date
0 1 1998-10-16 金卡 $900 $900 1996-11-24
1 1 1998-10-16 金卡 $33975 $34875 1996-12-07
2 1 1998-10-16 金卡 $6,600 $28275 1996-12-24
3 1 1998-10-16 金卡 $30 $28305 1996-12-31
4 1 1998-10-16 金卡 $22650 $50955 1997-01-07

 

card_t2['balance2'] = card_t2['balance'].map(lambda x: int(''.join(x[1:].split(','))))
card_t2['amount2'] = card_t2['amount'].map(lambda x: int(''.join(x[1:].split(','))))
import datetime #辦卡前一年的平均餘額
card_t3 = card_t2[card_t2.issued>card_t2.t_date][card_t2.issued<card_t2.t_date+datetime.timedelta(days=365)]
card_t3.head()
D:\anaconda\lib\site-packages\ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  
  card_id issued type t_type amount balance t_date balance2 amount2
47 1 1998-10-16 金卡 $13600 $57950 1997-10-20 57950 13600
48 1 1998-10-16 金卡 $15 $58197 1997-10-31 58197 15
49 1 1998-10-16 金卡 $262 $58212 1997-10-31 58212 262
50 1 1998-10-16 金卡 $4,880 $53317 1997-11-06 53317 4880
51 1 1998-10-16 金卡 $22650 $75967 1997-11-07 75967 22650

 

#card_t3["avg_balance"] = card_t3.groupby('card_id')['balance2'].mean() #生成的series  下面agg生成的dataframe
card_t4=card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean')])
card_t4.head()
#card_t4.to_sql('card_t4', con)
    avg_balance
type card_id  
普通卡 2 38898.610169
4 54222.060976
7 46888.702381
9 55854.223881
10 51387.500000

 

card_t3.groupby(['type','card_id'])['balance2'].agg([('avg_balance','mean'),'count']).head()
    avg_balance count
type card_id    
普通卡 2 38898.610169 59
4 54222.060976 82
7 46888.702381 84
9 55854.223881 67
10 51387.500000 66

 

card_t5=card_t4.reset_index()  #爲透視表添加行的名,type類型不變
#card_t5=pd.read_sql('select * from card_t4', con)
sns.boxplot(x = 'type', y = 'avg_balance', data = card_t5)
<matplotlib.axes._subplots.AxesSubplot at 0x21afb358>

 

#%%6、不同類型卡的持卡人在辦卡前一年內的平均收入對比
type_dict = {'借':'out','貸':'income'}
card_t3['type1'] = card_t3.t_type.map(type_dict)
card_t6= card_t3.groupby(['type','card_id','type1'])[['amount2']].sum()
card_t6.head()
card_t6.to_sql('card_t6', con)
card_t7=card_t6.reset_index()
#card_t7=pd.read_sql('select * from card_t6', con)
card_t7.to_sql('card_t7', con)
card_t8=pd.read_sql('select * from card_t7 where type1="income"', con)
card_t6.head()
      amount2
type card_id type1  
普通卡 2 income 193911
out 196384
4 income 474142
out 357224
7 income 299797

 

sns.boxplot(x = 'type', y = 'amount2', data = card_t8)  #收入和卡種的關係
<matplotlib.axes._subplots.AxesSubplot at 0x15fd6588>

 

card_t8.head()
  index type card_id type1 amount2
0 0 普通卡 2 income 193911
1 2 普通卡 4 income 474142
2 4 普通卡 7 income 299797
3 6 普通卡 9 income 322548
4 8 普通卡 10 income 644560

 

card_t9=pd.read_sql('select * from card_t7 where type1="out"', con) #支出和卡種的關係
sns.boxplot(x = 'type', y = 'amount2', data = card_t9)
<matplotlib.axes._subplots.AxesSubplot at 0xa993a58>

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章