Airbnb數據字典
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
% matplotlib inline
# Load the Airbnb user table from a CSV given by relative path, then print
# its schema (column names, non-null counts, dtypes) as a first sanity check.
airbnb = pd.read_csv(filepath_or_buffer='w3_airbnb.csv')
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6752 entries, 0 to 6751
Data columns (total 14 columns):
age 6752 non-null int64
date_account_created 6752 non-null object
date_first_booking 6752 non-null object
gender 6752 non-null object
Language_EN 6752 non-null int64
Language_ZH 6752 non-null int64
Country_US 6752 non-null int64
Country_EUR 6752 non-null int64
android 6752 non-null int64
moweb 6752 non-null int64
web 6752 non-null int64
ios 6752 non-null int64
Married 6752 non-null int64
Children 6752 non-null int64
dtypes: int64(11), object(3)
memory usage: 738.6+ KB
# Preview the first five rows (5 is the default, spelled out here).
airbnb.head(n=5)
age
date_account_created
date_first_booking
gender
Language_EN
Language_ZH
Country_US
Country_EUR
android
moweb
web
ios
Married
Children
0
33
1/7/2010
1/8/2010
F
1
0
0
0
1
0
1
0
1
1
1
30
1/10/2010
1/11/2010
M
1
0
1
0
1
0
1
0
1
2
2
30
1/19/2010
1/21/2010
F
1
0
1
0
1
0
1
0
1
1
3
30
2/3/2010
2/4/2010
F
1
0
1
0
1
0
1
0
1
1
4
32
2/7/2010
2/7/2010
F
1
0
1
0
1
0
1
0
1
2
# Summary statistics for every numeric column; the quartiles listed are the
# pandas defaults, written out explicitly.
airbnb.describe(percentiles=[0.25, 0.5, 0.75])
age
Language_EN
Language_ZH
Country_US
Country_EUR
android
moweb
web
ios
Married
Children
count
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
6752.000000
mean
47.791321
0.972156
0.006961
0.713270
0.162767
0.658472
0.340640
0.900770
0.064425
0.796949
1.535841
std
146.177746
0.164537
0.083147
0.452268
0.369180
0.474257
0.473959
0.298993
0.245527
0.402300
0.841394
min
2.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
25%
28.000000
1.000000
0.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
1.000000
1.000000
50%
33.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
1.000000
75%
42.000000
1.000000
0.000000
1.000000
0.000000
1.000000
1.000000
1.000000
0.000000
1.000000
2.000000
max
2014.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
3.000000
# Keep only plausible adult ages, 18..80 inclusive on both ends.  The raw
# column holds junk values (the summary above reports min 2 and max 2014).
# One boolean mask replaces two successive full-frame filters, and .copy()
# makes the result an independent DataFrame so the column assignments that
# follow operate on a real frame rather than on a slice of the original.
airbnb = airbnb[airbnb['age'].between(18, 80)].copy()
airbnb.age.describe()
count 6607.000000
mean 35.982443
std 10.896507
min 18.000000
25% 28.000000
50% 33.000000
75% 41.000000
max 80.000000
Name: age, dtype: float64
# Turn the account-creation date strings into timestamps, then re-print the
# schema to confirm the column's dtype changed.
parsed_created = pd.to_datetime(airbnb['date_account_created'])
airbnb['date_account_created'] = parsed_created
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 14 columns):
age 6607 non-null int64
date_account_created 6607 non-null datetime64[ns]
date_first_booking 6607 non-null object
gender 6607 non-null object
Language_EN 6607 non-null int64
Language_ZH 6607 non-null int64
Country_US 6607 non-null int64
Country_EUR 6607 non-null int64
android 6607 non-null int64
moweb 6607 non-null int64
web 6607 non-null int64
ios 6607 non-null int64
Married 6607 non-null int64
Children 6607 non-null int64
dtypes: datetime64[ns](1), int64(11), object(2)
memory usage: 774.3+ KB
# Account age in whole calendar years, measured against 2019 (the reference
# year is hard-coded in this analysis).  The .dt accessor extracts the year
# for the whole column at once instead of calling a Python lambda per row.
airbnb['year_since_account_created'] = 2019 - airbnb['date_account_created'].dt.year
airbnb.year_since_account_created.describe()
count 6607.000000
mean 6.034812
std 0.961253
min 5.000000
25% 5.000000
50% 6.000000
75% 7.000000
max 9.000000
Name: year_since_account_created, dtype: float64
# Parse the first-booking date strings into timestamps.
airbnb['date_first_booking'] = pd.to_datetime(airbnb['date_first_booking'])

# Whole calendar years between the first booking and 2019 (hard-coded
# reference year).  Vectorised via .dt.year rather than a per-row lambda.
airbnb['year_since_first_booking'] = 2019 - airbnb['date_first_booking'].dt.year
airbnb.year_since_first_booking.describe()
count 6607.000000
mean 5.910095
std 0.990769
min 4.000000
25% 5.000000
50% 6.000000
75% 6.000000
max 9.000000
Name: year_since_first_booking, dtype: float64
# One-hot encode the remaining categorical columns; the schema printed next
# shows `gender` replaced by gender_F / gender_M / gender_U.
airbnb = pd.get_dummies(data=airbnb)
airbnb.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 6751
Data columns (total 18 columns):
age 6607 non-null int64
date_account_created 6607 non-null datetime64[ns]
date_first_booking 6607 non-null datetime64[ns]
Language_EN 6607 non-null int64
Language_ZH 6607 non-null int64
Country_US 6607 non-null int64
Country_EUR 6607 non-null int64
android 6607 non-null int64
moweb 6607 non-null int64
web 6607 non-null int64
ios 6607 non-null int64
Married 6607 non-null int64
Children 6607 non-null int64
year_since_account_created 6607 non-null int64
year_since_first_booking 6607 non-null int64
gender_F 6607 non-null uint8
gender_M 6607 non-null uint8
gender_U 6607 non-null uint8
dtypes: datetime64[ns](2), int64(13), uint8(3)
memory usage: 845.2 KB
from sklearn import cluster
from sklearn.preprocessing import scale

# Remove the raw timestamp columns.  Pass the column labels explicitly
# instead of handing drop() a whole DataFrame.
datetime_columns = airbnb.select_dtypes(include=['datetime64']).columns
airbnb = airbnb.drop(columns=datetime_columns)

# Features used for clustering: age plus the four device/channel flags.
# .copy() gives an independent frame, so adding a 'cluster' column to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
airbnb_5 = airbnb[['age', 'web', 'moweb', 'ios', 'android']].copy()

# Standardise each feature (zero mean, unit variance) so that age does not
# dominate the 0/1 flags in the Euclidean distances k-means relies on.
x = pd.DataFrame(scale(airbnb_5))

# Fixed random_state keeps the clustering reproducible across runs.
model = cluster.KMeans(n_clusters=3, random_state=10)
model.fit(x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=10, tol=0.0001, verbose=0)
# Attach each row's cluster label.  assign() builds a new DataFrame rather
# than writing into what pandas may consider a slice of `airbnb`, which is
# what produced the SettingWithCopyWarning on the in-place column assignment.
airbnb_5 = airbnb_5.assign(cluster=model.labels_)
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
# Show the first 20 rows with their assigned cluster.
airbnb_5.head(n=20)
age
web
moweb
ios
android
cluster
0
33
1
0
0
1
1
1
30
1
0
0
1
1
2
30
1
0
0
1
1
3
30
1
0
0
1
1
4
32
1
0
0
1
1
5
46
1
1
0
0
0
6
30
1
0
0
1
1
7
46
1
0
0
1
1
9
33
1
0
0
1
1
10
45
1
0
0
1
1
11
32
1
1
0
0
0
12
46
1
0
0
1
1
13
29
1
0
0
1
1
14
29
1
0
0
1
1
16
33
1
1
0
0
0
17
37
1
0
0
1
1
18
28
1
0
0
1
1
19
41
1
0
0
1
1
21
30
1
1
0
0
0
22
35
1
0
0
1
1
# Scatter of age against the iOS flag, coloured by cluster label.
sns.scatterplot(data=airbnb_5, x='age', y='ios', hue='cluster')
<matplotlib.axes._subplots.AxesSubplot at 0x15e7a28c940>
# Age distribution (count, mean, std, quartiles) within each cluster.
airbnb_5.groupby(by=['cluster'])['age'].describe()
count
mean
std
min
25%
50%
75%
max
cluster
0
2108.0
34.911290
9.866273
18.0
28.0
32.0
39.0
78.0
1
4072.0
36.871316
11.519153
18.0
29.0
34.0
43.0
80.0
2
427.0
32.793911
8.263822
18.0
27.0
31.0
36.0
70.0
# Distribution of the iOS flag within each cluster.
airbnb_5.groupby(by=['cluster'])['ios'].describe()
count
mean
std
min
25%
50%
75%
max
cluster
0
2108.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1
4072.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2
427.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
from sklearn import metrics

# `model` has already been fitted on `x`, so reuse its stored labels instead
# of refitting with fit_predict(); with the fixed random_state the labels are
# the same and the redundant k-means run is avoided.
x_cluster = model.labels_

# Mean silhouette coefficient over all samples (range -1..1, higher means
# tighter, better-separated clusters).
score = metrics.silhouette_score(x, x_cluster)
print(score)
0.6359835014766492
# Save the 3-cluster centroids (in the standardised feature space of `x`).
centers = pd.DataFrame(data=model.cluster_centers_)
centers.to_csv(path_or_buf='center_3.csv')

# Repeat the clustering with five clusters, same seed for reproducibility.
model = cluster.KMeans(random_state=10, n_clusters=5)
model.fit(x)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
random_state=10, tol=0.0001, verbose=0)
# Save the centroids of the currently fitted model (five clusters at this
# point in the notebook), again in the standardised feature space.
centers = pd.DataFrame(data=model.cluster_centers_)
centers.to_csv(path_or_buf='center_5.csv')