利用python建立用戶RFM模型

用數據分析細分用戶:RFM分析

# 加載必要的庫
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
from warnings import filterwarnings
filterwarnings('ignore') 
# 導入數據
path='../RFM模型/data.csv'
df=pd.read_csv(path)
df.head()
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom

數據清洗

去除重複數據

df=df.drop_duplicates()

處理異常數據

1.快速查看統計信息

df.describe()
Quantity UnitPrice CustomerID
count 536641.000000 536641.000000 401604.000000
mean 9.620029 4.632656 15281.160818
std 219.130156 97.233118 1714.006089
min -80995.000000 -11062.060000 12346.000000
25% 1.000000 1.250000 13939.000000
50% 3.000000 2.080000 15145.000000
75% 10.000000 4.130000 16784.000000
max 80995.000000 38970.000000 18287.000000
#統計UnitPrice有多少異常的
df.loc[df['UnitPrice']<0].UnitPrice.count()
2
# 查看這2行的Description是什麼
df.loc[df['UnitPrice']<0,['UnitPrice','Description']]
UnitPrice Description
299983 -11062.06 Adjust bad debt
299984 -11062.06 Adjust bad debt
# 刪除UnitPrice小於0的數據
df=df[df['UnitPrice']>=0]

2、統計缺失值

# 統計缺失值
df.isnull().sum()
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135035
Country             0
dtype: int64
# 統計缺失值的佔比
df.isnull().sum()/df.shape[0]*100
InvoiceNo       0.000000
StockCode       0.000000
Description     0.270946
Quantity        0.000000
InvoiceDate     0.000000
UnitPrice       0.000000
CustomerID     25.163098
Country         0.000000
dtype: float64
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 536639 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    536639 non-null  object 
 1   StockCode    536639 non-null  object 
 2   Description  535185 non-null  object 
 3   Quantity     536639 non-null  int64  
 4   InvoiceDate  536639 non-null  object 
 5   UnitPrice    536639 non-null  float64
 6   CustomerID   401604 non-null  float64
 7   Country      536639 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 36.8+ MB
# Drop the rows that have no CustomerID. The explicit .copy() makes df an
# independent frame, so the column assignment below is not a chained
# assignment on a filtered slice of the original data.
df = df.loc[df['CustomerID'].notnull()].copy()
# Parse InvoiceDate from text into datetime values
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    401604 non-null  object        
 1   StockCode    401604 non-null  object        
 2   Description  401604 non-null  object        
 3   Quantity     401604 non-null  int64         
 4   InvoiceDate  401604 non-null  datetime64[ns]
 5   UnitPrice    401604 non-null  float64       
 6   CustomerID   401604 non-null  float64       
 7   Country      401604 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB
# 查看數據日期區間
print('最大日期是:',df['InvoiceDate'].max())
print('最小日期是:',df['InvoiceDate'].min())
最大日期是: 2011-12-09 12:50:00
最小日期是: 2010-12-01 08:26:00

RFM模型

import datetime
# 添加一列Sales
df['Sales']=df['Quantity']*df['UnitPrice']

groupby是分組,agg是用來聚合。agg用法
下面的sum,count,max都是groupby後數據的內置函數,可以直接用。sum表示求和,count表示計數,max表示求最大值。

# 按用戶ID進行分組
df_group=df.groupby('CustomerID')
df_rfm=df_group.agg({'Sales':'sum','Quantity':'count','InvoiceDate':'max'})

dt.days:取時間差中的整數天數。注意這裏相減時InvoiceDate是帶時分秒的,不足一天的部分會被捨去。

df_rfm['DateDiff']=(pd.to_datetime('2012-01-01') - df_rfm['InvoiceDate']).dt.days

df_rfm=df_rfm.drop('InvoiceDate',axis=1)
df_rfm.head()
Sales Quantity DateDiff
CustomerID
12346.0 0.00 2 347
12347.0 4310.00 182 24
12348.0 1797.24 31 97
12349.0 1757.55 73 40
12350.0 334.40 17 332
rmd = df_rfm['DateDiff'].median()
fmd = df_rfm['Quantity'].median()
mmd = df_rfm['Sales'].median()
rmd,fmd,mmd
(72.0, 41.0, 644.0700000000002)

對8類用戶進行定義:將每位用戶最近一次購物距今的天數、購物頻次、購物花費總額,分別與對應的中位數rmd、fmd、mmd進行比較,將滿足不同條件的用戶歸爲不同的類別。

def customer_type(frame, r_threshold=None, f_threshold=None, m_threshold=None):
    """Label every customer in *frame* with one of the eight RFM segments.

    Columns are read by position, matching the layout of ``df_rfm``:
    column 0 is the monetary total, column 1 the purchase count and
    column 2 the number of days since the last purchase. A customer is
    "recent" when the day count is <= the R threshold, "frequent" when the
    purchase count is >= the F threshold and "high value" when the monetary
    total is >= the M threshold; the combination of the three flags selects
    the segment name.

    Args:
        frame: DataFrame with at least those three leading columns. It is
            modified in place: a ``classification`` column is added or
            overwritten.
        r_threshold: Cut-off for column 2. Defaults to the module-level
            ``rmd`` when omitted.
        f_threshold: Cut-off for column 1. Defaults to the module-level
            ``fmd`` when omitted.
        m_threshold: Cut-off for column 0. Defaults to the module-level
            ``mmd`` when omitted.

    Returns:
        None. The result is written to ``frame['classification']``.

    Raises:
        ValueError: If a row cannot be placed on either side of a threshold
            (a missing value in the data or a NaN threshold).
    """
    # Fall back to the notebook-level medians so the existing
    # ``customer_type(df_rfm)`` call keeps working unchanged.
    r_cut = rmd if r_threshold is None else r_threshold
    f_cut = fmd if f_threshold is None else f_threshold
    m_cut = mmd if m_threshold is None else m_threshold

    monetary = frame.iloc[:, 0]
    frequency = frame.iloc[:, 1]
    recency = frame.iloc[:, 2]

    is_recent = recency <= r_cut
    is_frequent = frequency >= f_cut
    is_high_value = monetary >= m_cut

    # NaN compares False in both directions, so a row that is on neither
    # side of a threshold cannot be assigned to any segment.
    comparable = (
        (is_recent | (recency > r_cut))
        & (is_frequent | (frequency < f_cut))
        & (is_high_value | (monetary < m_cut))
    )
    if not comparable.all():
        raise ValueError(
            'cannot classify %d row(s): missing R/F/M value or threshold'
            % int((~comparable).sum())
        )

    # Keys are (recent, frequent, high value).
    segment_names = {
        (True, True, True): '重要價值用戶',
        (False, True, True): '重要喚回用戶',
        (True, False, True): '重要深耕用戶',
        (False, False, True): '重要挽留用戶',
        (True, True, False): '潛力用戶',
        (False, True, False): '一般維持用戶',
        (True, False, False): '新用戶',
        (False, False, False): '流失用戶',
    }
    flags = zip(is_recent.tolist(), is_frequent.tolist(), is_high_value.tolist())
    frame['classification'] = [segment_names[key] for key in flags]
customer_type(df_rfm)
df_rfm.groupby(by='classification').size()
classification
一般維持用戶     184
新用戶        524
流失用戶      1276
潛力用戶       202
重要價值用戶    1337
重要喚回用戶     480
重要挽留用戶     209
重要深耕用戶     160
dtype: int64
# Horizontal bar chart of the number of customers in each segment,
# largest segment at the top.
fig, ax = plt.subplots(figsize=(12, 8))
segment_order = df_rfm['classification'].value_counts().index
sns.countplot(
    y="classification",
    order=segment_order,
    data=df_rfm,
    color='#3c7f99',
    ax=ax,
)

# Remove the frame and axis labels; keep light vertical grid lines.
ax.set_frame_on(False)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.xaxis.grid(which='both', linewidth=0.5, color='#3c7f99')
ax.set_xlabel('')
ax.set_ylabel('')

# The title is positioned in figure coordinates; the padding spaces are part of the text.
fig.text(x=0.04, y=0.90, s='                                  不同價值的客戶數量                       ', 
         fontsize=20, weight='bold')

# Segment sizes in descending order, i.e. the same order as the bars.
con = sorted(df_rfm.groupby('classification')['classification'].count().tolist(), reverse=True)

# Print each count just to the right of the end of its bar.
for bar_pos, bar_count in enumerate(con):
    ax.text(bar_count + 0.1, bar_pos, str(bar_count), va='center', size=14)
plt.show()

![不同價值的客戶數量](output_30_0.png)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章