利用python建立用戶RFM模型

用數據分析細分用戶：RFM分析

# 加載必要的庫
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
from warnings import filterwarnings
filterwarnings('ignore')

# Load the raw transaction records from the CSV file.
path = '../RFM模型/data.csv'
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
df.head(5)

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom

數據清洗

去除重複數據

# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

處理異常數據

1.快速查看統計信息

# Summary statistics of the numeric columns, used to spot abnormal values
# (e.g. negative minimums).
df.describe()

	Quantity	UnitPrice	CustomerID
count	536641.000000	536641.000000	401604.000000
mean	9.620029	4.632656	15281.160818
std	219.130156	97.233118	1714.006089
min	-80995.000000	-11062.060000	12346.000000
25%	1.000000	1.250000	13939.000000
50%	3.000000	2.080000	15145.000000
75%	10.000000	4.130000	16784.000000
max	80995.000000	38970.000000	18287.000000

# Count the rows whose UnitPrice is abnormal (negative).
df.loc[df['UnitPrice'].lt(0), 'UnitPrice'].count()

# Look at the Description of those rows to understand what they are.
df.loc[df['UnitPrice'].lt(0), ['UnitPrice', 'Description']]

	UnitPrice	Description
299983	-11062.06	Adjust bad debt
299984	-11062.06	Adjust bad debt

# Keep only the rows whose UnitPrice is zero or positive.
df = df.loc[df['UnitPrice'].ge(0)]

2、統計缺失值

# Number of missing values in each column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135035
Country             0
dtype: int64

# Share of missing values in each column, as a percentage of all rows.
df.isna().sum() / len(df) * 100

InvoiceNo       0.000000
StockCode       0.000000
Description     0.270946
Quantity        0.000000
InvoiceDate     0.000000
UnitPrice       0.000000
CustomerID     25.163098
Country         0.000000
dtype: float64

# Print column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 536639 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    536639 non-null  object 
 1   StockCode    536639 non-null  object 
 2   Description  535185 non-null  object 
 3   Quantity     536639 non-null  int64  
 4   InvoiceDate  536639 non-null  object 
 5   UnitPrice    536639 non-null  float64
 6   CustomerID   401604 non-null  float64
 7   Country      536639 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 36.8+ MB

# Remove the rows whose CustomerID is missing.
# .copy() makes the result an independent DataFrame, so columns can be
# added or overwritten on it without chained-assignment ambiguity
# (pandas' SettingWithCopyWarning).
df = df[df['CustomerID'].notnull()].copy()

# Convert InvoiceDate from string to datetime.
# The raw values look like '12/1/2010 8:26' (month/day/year hour:minute);
# giving the format explicitly rules out a day/month mix-up and avoids
# format guessing during parsing.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')

# Confirm the new dtype of InvoiceDate.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    401604 non-null  object        
 1   StockCode    401604 non-null  object        
 2   Description  401604 non-null  object        
 3   Quantity     401604 non-null  int64         
 4   InvoiceDate  401604 non-null  datetime64[ns]
 5   UnitPrice    401604 non-null  float64       
 6   CustomerID   401604 non-null  float64       
 7   Country      401604 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB

# Report the latest and earliest invoice timestamps in the data.
print('最大日期是：', df.InvoiceDate.max())
print('最小日期是：', df.InvoiceDate.min())

最大日期是： 2011-12-09 12:50:00
最小日期是： 2010-12-01 08:26:00

RFM模型

import datetime

# Add a Sales column: amount of each invoice line (quantity times unit price).
df['Sales'] = df['Quantity'].mul(df['UnitPrice'])

groupby用於分組，agg用於對每個分組進行聚合。agg用法：
下面的sum、count、max都是groupby後數據的內置聚合函數，可以直接以字符串形式傳入：sum表示求和，count表示計數，max表示求最大值。

# Group the invoice lines by customer and aggregate one row per customer:
#   Sales       -> total amount (sum)
#   Quantity    -> number of non-null Quantity entries, i.e. invoice lines
#                  (a count, not the number of units bought)
#   InvoiceDate -> most recent purchase timestamp (max)
df_group = df.groupby(by='CustomerID')
df_rfm = df_group.agg({
    'Sales': 'sum',
    'Quantity': 'count',
    'InvoiceDate': 'max',
})

dt.days：可以直接得到天數（注意這裏減法是帶時分秒的）。

# Whole days between each customer's last purchase and the fixed reference date 2012-01-01.
df_rfm['DateDiff'] = (pd.Timestamp('2012-01-01') - df_rfm['InvoiceDate']).dt.days

# Drop the raw InvoiceDate column and preview the resulting table.
df_rfm = df_rfm.drop(columns='InvoiceDate')
df_rfm.head()

	Sales	Quantity	DateDiff
CustomerID
12346.0	0.00	2	347
12347.0	4310.00	182	24
12348.0	1797.24	31	97
12349.0	1757.55	73	40
12350.0	334.40	17	332

# Median of each dimension: recency (DateDiff), frequency (Quantity), monetary (Sales).
rmd = df_rfm.DateDiff.median()
fmd = df_rfm.Quantity.median()
mmd = df_rfm.Sales.median()
(rmd, fmd, mmd)

(72.0, 41.0, 644.0700000000002)

對8類用戶進行定義：將每位用戶最近一次購物距今的時間間隔（DateDiff）、購物頻次（Quantity）、購物花費總額（Sales），分別與對應的中位數rmd、fmd、mmd進行比較，將滿足不同條件的用戶歸爲不同的類別。

def customer_type(frame, r_threshold=None, f_threshold=None, m_threshold=None):
    """Label every row of *frame* with one of eight RFM customer segments.

    Columns are read by position: column 0 is the monetary value, column 1
    the purchase frequency and column 2 the recency (days since the last
    purchase). Each value is compared with its threshold:

    * recency   <= threshold  -> "recent"
    * frequency >= threshold  -> "frequent"
    * monetary  >= threshold  -> "high spend"

    The combination of the three flags selects the segment label, which is
    written to a new ``classification`` column of *frame* (in place).

    Args:
        frame: DataFrame whose first three columns are monetary, frequency
            and recency, in that order.
        r_threshold: Recency cut-off. Defaults to the module-level ``rmd``.
        f_threshold: Frequency cut-off. Defaults to the module-level ``fmd``.
        m_threshold: Monetary cut-off. Defaults to the module-level ``mmd``.

    Returns:
        None. *frame* is modified in place.

    Raises:
        ValueError: If any of the three columns contains a missing value,
            because such a row cannot be assigned to a segment.
    """
    # Module-level medians are only looked up when no explicit threshold is given.
    r_cut = rmd if r_threshold is None else r_threshold
    f_cut = fmd if f_threshold is None else f_threshold
    m_cut = mmd if m_threshold is None else m_threshold

    monetary = frame.iloc[:, 0]
    frequency = frame.iloc[:, 1]
    recency = frame.iloc[:, 2]

    # A NaN fails every comparison, so it would match no segment at all.
    if monetary.isnull().any() or frequency.isnull().any() or recency.isnull().any():
        raise ValueError(
            'customer_type: the first three columns must not contain missing values'
        )

    # Key: (is_recent, is_frequent, is_high_spend)
    segments = {
        (True, True, True): '重要價值用戶',
        (False, True, True): '重要喚回用戶',
        (True, False, True): '重要深耕用戶',
        (False, False, True): '重要挽留用戶',
        (True, True, False): '潛力用戶',
        (False, True, False): '一般維持用戶',
        (True, False, False): '新用戶',
        (False, False, False): '流失用戶',
    }

    # Compare whole columns at once; tolist() yields plain Python bools for the lookup.
    is_recent = (recency <= r_cut).tolist()
    is_frequent = (frequency >= f_cut).tolist()
    is_high_spend = (monetary >= m_cut).tolist()

    frame['classification'] = [
        segments[flags] for flags in zip(is_recent, is_frequent, is_high_spend)
    ]

# Label every customer in place, then count the customers in each segment.
customer_type(df_rfm)
df_rfm.groupby('classification').size()

classification
一般維持用戶     184
新用戶        524
流失用戶      1276
潛力用戶       202
重要價值用戶    1337
重要喚回用戶     480
重要挽留用戶     209
重要深耕用戶     160
dtype: int64

# Horizontal bar chart of the number of customers per segment,
# ordered from the largest segment to the smallest.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    y="classification",
    order=df_rfm['classification'].value_counts().index,
    data=df_rfm,
    color='#3c7f99',
)
plt.box(False)
fig.text(
    x=0.04,
    y=0.90,
    s='                                  不同價值的客戶數量                       ',
    fontsize=20,
    weight='bold',
)
plt.tick_params(axis='both', which='major', labelsize=14)
ax.xaxis.grid(which='both', linewidth=0.5, color='#3c7f99')
plt.xlabel('')
plt.ylabel('')

# Segment sizes in descending order, i.e. the same order as the bars.
con = sorted(df_rfm['classification'].value_counts().values, reverse=True)

# Write each count just to the right of the end of its bar.
for x, y in enumerate(con):
    plt.text(y + 0.1, x, '%s' % y, va='center', size=14)
plt.show()

利用python建立用戶RFM模型

數據清洗

去除重複數據

處理異常數據

1.快速查看統計信息

2、統計缺失值

RFM模型

k-means用戶劃分

office各種插件

汽車保險客戶分類問題

python 進行文本情感分析

Hive 窗口函數over()

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結