w4_聚類分析_airbnb_參考代碼

Airbnb數據字典

在這裏插入圖片描述

#調包
import pandas as pd
import seaborn as sns#更方便直接視圖,查看結果
import matplotlib.pyplot as plt#調參更加靈活
%matplotlib inline#用於jupter視圖語句
#數據導入
airbnb=pd.read_csv('w3_airbnb.csv')
# Inspect the column dtypes
# Variable groups: user profile, the user's relationship with Airbnb, app language, destination country, booking channel
# There are 2 date variables here; they are processed later
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6752 entries, 0 to 6751
Data columns (total 14 columns):
age                     6752 non-null int64
date_account_created    6752 non-null object
date_first_booking      6752 non-null object
gender                  6752 non-null object
Language_EN             6752 non-null int64
Language_ZH             6752 non-null int64
Country_US              6752 non-null int64
Country_EUR             6752 non-null int64
android                 6752 non-null int64
moweb                   6752 non-null int64
web                     6752 non-null int64
ios                     6752 non-null int64
Married                 6752 non-null int64
Children                6752 non-null int64
dtypes: int64(11), object(3)
memory usage: 738.6+ KB
# Preview the first rows of the user data
airbnb.head()
age date_account_created date_first_booking gender Language_EN Language_ZH Country_US Country_EUR android moweb web ios Married Children
0 33 1/7/2010 1/8/2010 F 1 0 0 0 1 0 1 0 1 1
1 30 1/10/2010 1/11/2010 M 1 0 1 0 1 0 1 0 1 2
2 30 1/19/2010 1/21/2010 F 1 0 1 0 1 0 1 0 1 1
3 30 2/3/2010 2/4/2010 F 1 0 1 0 1 0 1 0 1 1
4 32 2/7/2010 2/7/2010 F 1 0 1 0 1 0 1 0 1 2
# Univariate analysis
# Summary statistics of the numeric variables
airbnb.describe()
age Language_EN Language_ZH Country_US Country_EUR android moweb web ios Married Children
count 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000 6752.000000
mean 47.791321 0.972156 0.006961 0.713270 0.162767 0.658472 0.340640 0.900770 0.064425 0.796949 1.535841
std 146.177746 0.164537 0.083147 0.452268 0.369180 0.474257 0.473959 0.298993 0.245527 0.402300 0.841394
min 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 28.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 1.000000
50% 33.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 1.000000
75% 42.000000 1.000000 0.000000 1.000000 0.000000 1.000000 1.000000 1.000000 0.000000 1.000000 2.000000
max 2014.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.000000
# The minimum age is 2 and the maximum is 2014, which are data errors.
# Clean the data by keeping only users aged 18 to 80 (both bounds inclusive).
age_in_range=airbnb['age'].between(18,80)
airbnb=airbnb[age_in_range]
airbnb['age'].describe()
count    6607.000000
mean       35.982443
std        10.896507
min        18.000000
25%        28.000000
50%        33.000000
75%        41.000000
max        80.000000
Name: age, dtype: float64
# Adjusting the categorical (date) variables
# Goal: compute the time from account creation to 2019
# Step 1: convert the account-creation date to datetime format
airbnb['date_account_created']=pd.to_datetime(airbnb['date_account_created'])
airbnb.info()
# The date_account_created column changed from object to datetime64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 14 columns):
age                     6607 non-null int64
date_account_created    6607 non-null datetime64[ns]
date_first_booking      6607 non-null object
gender                  6607 non-null object
Language_EN             6607 non-null int64
Language_ZH             6607 non-null int64
Country_US              6607 non-null int64
Country_EUR             6607 non-null int64
android                 6607 non-null int64
moweb                   6607 non-null int64
web                     6607 non-null int64
ios                     6607 non-null int64
Married                 6607 non-null int64
Children                6607 non-null int64
dtypes: datetime64[ns](1), int64(11), object(2)
memory usage: 774.3+ KB
# Step 2: extract the year and store 2019 minus the account-creation year in
# the new variable year_since_account_created.
# The vectorized .dt.year accessor replaces the per-row apply(lambda ...).
airbnb['year_since_account_created']=2019-airbnb['date_account_created'].dt.year
airbnb.year_since_account_created.describe()
# The shortest time since registration is 5 years and the longest is 9 years
count    6607.000000
mean        6.034812
std         0.961253
min         5.000000
25%         5.000000
50%         6.000000
75%         7.000000
max         9.000000
Name: year_since_account_created, dtype: float64
# Compute the time from the user's first booking to 2019
# Step 1: convert the first-booking date to datetime format
airbnb['date_first_booking']=pd.to_datetime(airbnb['date_first_booking'])
# Step 2: extract the year and store 2019 minus the first-booking year in the
# new variable year_since_first_booking.
# The vectorized .dt.year accessor replaces the per-row apply(lambda ...).
airbnb['year_since_first_booking']=2019-airbnb['date_first_booking'].dt.year
airbnb.year_since_first_booking.describe()
# The shortest time since the first booking is 4 years and the longest is 9 years
count    6607.000000
mean        5.910095
std         0.990769
min         4.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: year_since_first_booking, dtype: float64
# Convert the categorical variable (gender) into dummy variables.
# The column is named explicitly so that only gender is encoded; without
# `columns`, get_dummies encodes every object/category column it finds.
airbnb=pd.get_dummies(airbnb,columns=['gender'])
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 18 columns):
age                           6607 non-null int64
date_account_created          6607 non-null datetime64[ns]
date_first_booking            6607 non-null datetime64[ns]
Language_EN                   6607 non-null int64
Language_ZH                   6607 non-null int64
Country_US                    6607 non-null int64
Country_EUR                   6607 non-null int64
android                       6607 non-null int64
moweb                         6607 non-null int64
web                           6607 non-null int64
ios                           6607 non-null int64
Married                       6607 non-null int64
Children                      6607 non-null int64
year_since_account_created    6607 non-null int64
year_since_first_booking      6607 non-null int64
gender_F                      6607 non-null uint8
gender_M                      6607 non-null uint8
gender_U                      6607 non-null uint8
dtypes: datetime64[ns](2), int64(13), uint8(3)
memory usage: 845.2 KB
# Drop the two date variables, selected by dtype.
# drop() is given the column labels explicitly rather than a DataFrame.
airbnb.drop(airbnb.select_dtypes(['datetime64']).columns,axis=1,inplace=True)
# Data preparation is complete
# Pick five variables as the clustering dimensions
# NOTE: from here on the working frame is airbnb_5 rather than airbnb.
# .copy() makes airbnb_5 an independent frame, so assigning the cluster column
# to it later does not trigger pandas' SettingWithCopyWarning.
airbnb_5=airbnb[['age','web','moweb','ios','android']].copy()
# Standardize the data with scale from sklearn.preprocessing
from sklearn.preprocessing import scale
x=pd.DataFrame(scale(airbnb_5))
# Build the model
# Use sklearn's cluster module
from sklearn import cluster
# First try 3 clusters. n_init is pinned to 10 (the value shown in the fitted
# model's repr) so results do not depend on the library's default.
model=cluster.KMeans(n_clusters=3,n_init=10,random_state=10)
model.fit(x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
# Extract the fitted labels and store them as a new column to inspect the clustering result
airbnb_5['cluster']=model.labels_
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
# Preview the first 20 rows of airbnb_5
airbnb_5.head(20)
age web moweb ios android cluster
0 33 1 0 0 1 1
1 30 1 0 0 1 1
2 30 1 0 0 1 1
3 30 1 0 0 1 1
4 32 1 0 0 1 1
5 46 1 1 0 0 0
6 30 1 0 0 1 1
7 46 1 0 0 1 1
9 33 1 0 0 1 1
10 45 1 0 0 1 1
11 32 1 1 0 0 0
12 46 1 0 0 1 1
13 29 1 0 0 1 1
14 29 1 0 0 1 1
16 33 1 1 0 0 0
17 37 1 0 0 1 1
18 28 1 0 0 1 1
19 41 1 0 0 1 1
21 30 1 1 0 0 0
22 35 1 0 0 1 1
# Draw a scatter plot to inspect the clustering result
# x-axis is age, y-axis is ios (whether the iOS client is used), and the hue is the cluster label
sns.scatterplot(x='age',y='ios',hue='cluster',data=airbnb_5)
<matplotlib.axes._subplots.AxesSubplot at 0x15e7a28c940>

在這裏插入圖片描述

# Model evaluation and tuning
# Use groupby to evaluate how well each variable separates the clusters
airbnb_5.groupby(['cluster'])['age'].describe()
count mean std min 25% 50% 75% max
cluster
0 2108.0 34.911290 9.866273 18.0 28.0 32.0 39.0 78.0
1 4072.0 36.871316 11.519153 18.0 29.0 34.0 43.0 80.0
2 427.0 32.793911 8.263822 18.0 27.0 31.0 36.0 70.0
# Summary statistics of the ios flag within each cluster
airbnb_5.groupby(['cluster'])['ios'].describe()
count mean std min 25% 50% 75% max
cluster
0 2108.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 4072.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 427.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0
# Evaluate the model with the silhouette score
from sklearn import metrics
# Cluster label of every sample. The labels of the already fitted model are
# reused; calling fit_predict(x) would fit the same model a second time.
x_cluster=model.labels_
# A higher score means samples sit closer to their own cluster and farther
# from the other clusters; a lower score means the clusters overlap more.
score=metrics.silhouette_score(x,x_cluster)
print(score)
0.6359835014766492
# Save the cluster centers of the current model to CSV.
# The model was fitted on x, so the centers are in the standardized feature space.
centers=pd.DataFrame(model.cluster_centers_)
centers.to_csv('center_3.csv')
# Split the users into 5 clusters. n_init is pinned to 10 (the value shown in
# the fitted model's repr) so results do not depend on the library's default.
model=cluster.KMeans(n_clusters=5,n_init=10,random_state=10)
model.fit(x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)
# Save the cluster centers of the refitted model to CSV
centers=pd.DataFrame(model.cluster_centers_)
centers.to_csv('center_5.csv')
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章