小費數據集來源於 Python 第三方庫 seaborn 自帶的示例數據(`tips`)。
一:數據導入
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import seaborn as sns #導入seaborn庫中自帶數據
In [6]:
# Load the "tips" example dataset through seaborn into a DataFrame.
tips = sns.load_dataset('tips')
# Preview the first five rows to check that the columns loaded as expected.
tips.head()
Out[6]:
| | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
In [7]:
二:定義問題:小費金額與消費總額、顧客性別、星期幾等因素之間有何關聯?
三:數據清洗:
# Check the size of the dataset as (rows, columns).
tips.shape
Out[7]:
(244, 7)
In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
tips.describe()
Out[8]:
| | total_bill | tip | size |
---|---|---|---|
count | 244.000000 | 244.000000 | 244.000000 |
mean | 19.785943 | 2.998279 | 2.569672 |
std | 8.902412 | 1.383638 | 0.951100 |
min | 3.070000 | 1.000000 | 1.000000 |
25% | 13.347500 | 2.000000 | 2.000000 |
50% | 17.795000 | 2.900000 | 2.000000 |
75% | 24.127500 | 3.562500 | 3.000000 |
max | 50.810000 | 10.000000 | 6.000000 |
In [9]:
# Print each column's dtype and non-null count; used here to check for missing values.
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill 244 non-null float64
tip 244 non-null float64
sex 244 non-null category
smoker 244 non-null category
day 244 non-null category
time 244 non-null category
size 244 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.2 KB
In [10]:
四:數據探索:
# Scatter plot: shows how two numeric variables relate to each other.
# Here the tip amount is plotted against the total bill to look for an association.
# The `.plot.scatter` accessor is equivalent to `plot(kind='scatter')`.
tips.plot.scatter(x='total_bill', y='tip')
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470bc30be0>
In [42]:
# Line plot
# A line plot shows the trend between two series; drawn with the plot() method.
# The rows are sorted by `total_bill` first: a line plot joins points in row
# order, so with unsorted x values the line zigzags back and forth across the
# axes instead of showing how `tip` changes as `total_bill` grows.
# `sort_values` returns a new frame, so `tips` itself is left unchanged.
tips.sort_values('total_bill').plot(x='total_bill', y='tip')
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c727dd8>
In [11]:
# Relationship between sex and tip (visualised with a bar chart further down).
# Start by computing the mean tip for each value of the `sex` column;
# this cell handles the rows where sex == 'Male'.
# A single `.loc` selects the rows and the `tip` column in one step.
male_tip = tips.loc[tips['sex'] == 'Male', 'tip'].mean()
male_tip
Out[11]:
3.0896178343949052
In [14]:
# Mean tip over the rows where sex == 'Female'.
female_tip = tips.loc[tips['sex'] == 'Female', 'tip'].mean()
female_tip
Out[14]:
2.833448275862069
In [15]:
# A Series is a one-dimensional array-like object holding a set of index labels
# and a set of values; think of it as an array with labels attached.
# Built from a dict here: the keys become the index, in insertion order.
s = Series({'male': male_tip, 'female': female_tip})
s
Out[15]:
male 3.089618
female 2.833448
dtype: float64
In [16]:
# Bar chart
# `.plot.bar()` is the accessor form of `plot(kind='bar')`.
s.plot.bar()
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c209278>
In [40]:
# Horizontal bar chart (handy when there are many categories)
# `.plot.barh()` is the accessor form of `plot(kind='barh')`.
s.plot.barh()
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c6569b0>
In [41]:
# Stacked bar chart (handy when there are many categories)
# Drawn as a horizontal bar chart with the `stacked` option switched on;
# `alpha=0.5` makes the bars half transparent.
# NOTE(review): `s` is a single Series, so `stacked=True` presumably has nothing
# to stack here — a multi-column DataFrame would be needed to show it; confirm.
s.plot.barh(stacked=True, alpha=0.5)
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c6d1b00>
In [39]:
# `unique()` lists the distinct values found in a column.
# Here: the distinct values of the `day` column.
tips['day'].unique()
Out[39]:
[Sun, Sat, Thur, Fri]
Categories (4, object): [Sun, Sat, Thur, Fri]
In [24]:
# The following cells build the inputs for a bar chart of the mean tip per day.
# Mean tip over the rows where day == 'Sun'.
Sun_tip = tips.loc[tips['day'] == 'Sun', 'tip'].mean()
Sun_tip
Out[24]:
3.255131578947369
In [27]:
# Mean tip over the rows where day == 'Sat'.
Sat_tip = tips.loc[tips['day'] == 'Sat', 'tip'].mean()
Sat_tip
Out[27]:
2.993103448275862
In [26]:
# Mean tip over the rows where day == 'Thur'.
Thur_tip = tips.loc[tips['day'] == 'Thur', 'tip'].mean()
Thur_tip
Out[26]:
2.771451612903226
In [25]:
# Mean tip over the rows where day == 'Fri'.
Fri_tip = tips.loc[tips['day'] == 'Fri', 'tip'].mean()
Fri_tip
Out[25]:
2.734736842105263
In [30]:
# Collect the per-day mean tips into one labelled Series for plotting.
# The values must be the `*_tip` variables computed in the cells above: the bare
# names `Sun`, `Sat`, `Thur`, `Fri` are never defined in this notebook, so using
# them raises NameError when the notebook is run top to bottom on a fresh kernel.
day_tip = Series(
    [Sun_tip, Sat_tip, Thur_tip, Fri_tip],
    index=['Sun_tip', 'Sat_tip', 'Thur_tip', 'Fri_tip'],
)
day_tip
Out[30]:
Sun_tip 3.255132
Sat_tip 2.993103
Thur_tip 2.771452
Fri_tip 2.734737
dtype: float64
In [31]:
# Bar chart of the mean tip for each day.
day_tip.plot.bar()
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c261eb8>
In [32]:
# Tip share: tip divided by (total_bill + tip).
# The result is stored as a ratio; it is not multiplied by 100.
tips['percent_tip'] = tips['tip'].div(tips['total_bill'].add(tips['tip']))
# Show the first eight rows to inspect the new column.
tips.head(8)
Out[32]:
| | total_bill | tip | sex | smoker | day | time | size | percent_tip |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.056111 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.138333 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.142799 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.122638 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.128014 |
5 | 25.29 | 4.71 | Male | No | Sun | Dinner | 4 | 0.157000 |
6 | 8.77 | 2.00 | Male | No | Sun | Dinner | 2 | 0.185701 |
7 | 26.88 | 3.12 | Male | No | Sun | Dinner | 4 | 0.104000 |
In [33]:
# Density plot
# Kernel density estimation approximates the distribution of the data as a
# combination of kernels (presumably Gaussian kernels by default — TODO confirm).
# `.plot.kde()` is the accessor form of `plot(kind='kde')`.
tips['percent_tip'].plot.kde()
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c2cdbe0>
In [38]:
# Histogram
# A histogram shows a frequency distribution; the y axis can be counts or
# proportions, and the overall shape of the distribution can be read off it.
# Drawn with the `hist` method: `bins` sets how many intervals the values are
# split into (default 10) and `grid` toggles the grid lines on the chart.
tips['tip'].hist(bins=10,grid=False)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x2470c518358>