用數據分析細分用戶:RFM分析
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import seaborn as sns
import matplotlib. pyplot as plt
from pylab import *
mpl. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
from warnings import filterwarnings
filterwarnings( 'ignore' )
path= '../RFM模型/data.csv'
df= pd. read_csv( path)
df. head( )
InvoiceNo
StockCode
Description
Quantity
InvoiceDate
UnitPrice
CustomerID
Country
0
536365
85123A
WHITE HANGING HEART T-LIGHT HOLDER
6
12/1/2010 8:26
2.55
17850.0
United Kingdom
1
536365
71053
WHITE METAL LANTERN
6
12/1/2010 8:26
3.39
17850.0
United Kingdom
2
536365
84406B
CREAM CUPID HEARTS COAT HANGER
8
12/1/2010 8:26
2.75
17850.0
United Kingdom
3
536365
84029G
KNITTED UNION FLAG HOT WATER BOTTLE
6
12/1/2010 8:26
3.39
17850.0
United Kingdom
4
536365
84029E
RED WOOLLY HOTTIE WHITE HEART.
6
12/1/2010 8:26
3.39
17850.0
United Kingdom
數據清洗
去除重複數據
df= df. drop_duplicates( )
處理異常數據
1.快速查看統計信息
df. describe( )
Quantity
UnitPrice
CustomerID
count
536641.000000
536641.000000
401604.000000
mean
9.620029
4.632656
15281.160818
std
219.130156
97.233118
1714.006089
min
-80995.000000
-11062.060000
12346.000000
25%
1.000000
1.250000
13939.000000
50%
3.000000
2.080000
15145.000000
75%
10.000000
4.130000
16784.000000
max
80995.000000
38970.000000
18287.000000
df. loc[ df[ 'UnitPrice' ] < 0 ] . UnitPrice. count( )
2
df. loc[ df[ 'UnitPrice' ] < 0 , [ 'UnitPrice' , 'Description' ] ]
UnitPrice
Description
299983
-11062.06
Adjust bad debt
299984
-11062.06
Adjust bad debt
df= df[ df[ 'UnitPrice' ] >= 0 ]
2、統計缺失值
df. isnull( ) . sum ( )
InvoiceNo 0
StockCode 0
Description 1454
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 135035
Country 0
dtype: int64
df. isnull( ) . sum ( ) / df. shape[ 0 ] * 100
InvoiceNo 0.000000
StockCode 0.000000
Description 0.270946
Quantity 0.000000
InvoiceDate 0.000000
UnitPrice 0.000000
CustomerID 25.163098
Country 0.000000
dtype: float64
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 536639 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 536639 non-null object
1 StockCode 536639 non-null object
2 Description 535185 non-null object
3 Quantity 536639 non-null int64
4 InvoiceDate 536639 non-null object
5 UnitPrice 536639 non-null float64
6 CustomerID 401604 non-null float64
7 Country 536639 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 36.8+ MB
df= df[ ~ ( df. CustomerID. isnull( ) ) ]
df[ 'InvoiceDate' ] = pd. to_datetime( df[ 'InvoiceDate' ] )
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 401604 non-null object
1 StockCode 401604 non-null object
2 Description 401604 non-null object
3 Quantity 401604 non-null int64
4 InvoiceDate 401604 non-null datetime64[ns]
5 UnitPrice 401604 non-null float64
6 CustomerID 401604 non-null float64
7 Country 401604 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB
print ( '最大日期是:' , df[ 'InvoiceDate' ] . max ( ) )
print ( '最小日期是:' , df[ 'InvoiceDate' ] . min ( ) )
最大日期是: 2011-12-09 12:50:00
最小日期是: 2010-12-01 08:26:00
RFM模型
import datetime
df[ 'Sales' ] = df[ 'Quantity' ] * df[ 'UnitPrice' ]
groupby是分組,agg是用來聚合。agg用法
下面的sum,count,max都是groupby後數據的內置函數,可以直接用。sum表示求和,count表示計數,max表示求最大值。
df_group= df. groupby( 'CustomerID' )
df_rfm= df_group. agg( { 'Sales' : 'sum' , 'Quantity' : 'count' , 'InvoiceDate' : 'max' } )
dt.days:可以直接得到天數(注意這裏減法是帶時分秒的)。
df_rfm[ 'DateDiff' ] = ( pd. to_datetime( '2012-01-01' ) - df_rfm[ 'InvoiceDate' ] ) . dt. days
df_rfm= df_rfm. drop( 'InvoiceDate' , axis= 1 )
df_rfm. head( )
Sales
Quantity
DateDiff
CustomerID
12346.0
0.00
2
347
12347.0
4310.00
182
24
12348.0
1797.24
31
97
12349.0
1757.55
73
40
12350.0
334.40
17
332
rmd = df_rfm[ 'DateDiff' ] . median( )
fmd = df_rfm[ 'Quantity' ] . median( )
mmd = df_rfm[ 'Sales' ] . median( )
rmd, fmd, mmd
(72.0, 41.0, 644.0700000000002)
對8類用戶進行定義:與最近一次購物到現在的時間間隔均值rmd,最近一段時間內的購物頻次fmd,最近一段時間內的購物花費總額mmd分別進行比較,將滿足不同條件的用戶歸爲不同的類別。
def customer_type ( frame) :
customer_type = [ ]
for i in range ( len ( frame) ) :
if frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要價值用戶' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要喚回用戶' )
elif frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要深耕用戶' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要挽留用戶' )
elif frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '潛力用戶' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '一般維持用戶' )
elif frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '新用戶' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '流失用戶' )
frame[ 'classification' ] = customer_type
customer_type( df_rfm)
df_rfm. groupby( by= 'classification' ) . size( )
classification
一般維持用戶 184
新用戶 524
流失用戶 1276
潛力用戶 202
重要價值用戶 1337
重要喚回用戶 480
重要挽留用戶 209
重要深耕用戶 160
dtype: int64
fig, ax = plt. subplots( figsize= ( 12 , 8 ) )
sns. countplot( y= "classification" , order= df_rfm[ 'classification' ] . value_counts( ) . index , data= df_rfm, color= '#3c7f99' )
plt. box( False )
fig. text( x= 0.04 , y= 0.90 , s= ' 不同價值的客戶數量 ' ,
fontsize= 20 , weight= 'bold' )
plt. tick_params( axis= 'both' , which= 'major' , labelsize= 14 )
ax. xaxis. grid( which= 'both' , linewidth= 0.5 , color= '#3c7f99' )
plt. xlabel( '' )
plt. ylabel( '' )
con= list ( df_rfm. groupby( 'classification' ) . classification. count( ) . values)
con= sorted ( con, reverse= True )
for x, y in enumerate ( con) :
plt. text( y+ 0.1 , x, '%s' % y, va= 'center' , size= 14 )
plt. show( )