EDA探索性數據分析-- 共享單車數據

一. 查看數據

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category = DeprecationWarning)

# Load the bike-sharing training set from the working directory.
data = pd.read_csv('train.csv')

# Report the (rows, columns) dimensions of the loaded frame.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

# Optional deeper inspection, left disabled:
# data.info()
# data.dtypes

# Preview the first rows (shown as the cell output in a notebook).
data.head()

在這裏插入圖片描述

二. 特徵組合

  • 從datetime列中可以提取出日期,時間,月份與周幾等信息(是否工作日已經給出)
  • 去掉datetime這列,因爲我們已經拿出來有用的了
  • 將類別型變量(hour, weekday, month, season, weather, holiday, workingday)定義成category類型
# Derive new features from the timestamp column.

data['datetime'] = pd.to_datetime(data['datetime'])

timestamps = data['datetime'].dt
data['date'] = timestamps.date
data['hour'] = timestamps.hour
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` yields the
# same English day names ('Monday' ... 'Sunday').
data['weekday'] = timestamps.day_name()
data['month'] = timestamps.month

# Replace the numeric season / weather codes with readable labels.
data['season'] = data.season.map({1:'Spring',2:'Summer',3:'Fall',4:'Winter'})
data['weather'] = data.weather.map({1:'clear + few clouds + partly cloudy + partly cloudy',
                        2:'mist + cloudy,mist + broken clouds,mist + few clouds,mist',
                        3:'light snow,light rain + thunderstorm + scattered clouds,light rain + scattered clouds',
                        4:'heavy rain + ice pallets + thunderstorm + mist,snow + fog'})

# Cast the discrete features to the pandas `category` dtype.

CategoryList = ['hour','weekday','month','season','weather','holiday','workingday']
for variable in CategoryList:
    data[variable] = data[variable].astype('category')

# Drop the raw datetime column now that its useful parts are extracted.

data = data.drop(['datetime'], axis = 1)
data.head(2)

在這裏插入圖片描述

三. 變量的類型

# Count how many columns of `data` fall under each dtype family.
float_count = data.select_dtypes(include='float').shape[1]
int_count = data.select_dtypes(include='int64').shape[1]
object_count = data.select_dtypes(include='object').shape[1]
category_count = data.select_dtypes(include='category').shape[1]

# Tabulate the counts, largest first, so the bars are drawn in that order.
dataType = pd.DataFrame(
    [
        ('float_count', float_count),
        ('int_count', int_count),
        ('object_count', object_count),
        ('category_count', category_count),
    ],
    columns=['Type', 'Count'],
).sort_values(by='Count', ascending=False)

# Bar chart of the number of columns per dtype family.
sns.set_style('darkgrid')
plt.figure(figsize=(8, 6))
sns.barplot(x='Type', y='Count', data=dataType)
plt.xlabel('VariableType')
plt.ylabel('Count')
plt.title('Variable DataType Count')

在這裏插入圖片描述

四. 離羣點

4.1 觀察變量

# 3x2 grid of box plots: the overall distribution of `count`, then `count`
# split by season, hour, working day, month and weather.
fig, axes = plt.subplots(nrows = 3, ncols = 2, figsize = (12, 12))

# A categorical column is drawn in its category order, which for string
# categories is lexical (clear, heavy rain, light snow, mist). Hard-coded tick
# labels in severity order would therefore label the wrong boxes, so pin the
# order explicitly and derive the short tick labels from that same order.
weather_prefixes = ['clear', 'mist', 'light snow', 'heavy rain']
weather_levels = list(data['weather'].dropna().unique())
weather_pairs = [(level, prefix.title())
                 for prefix in weather_prefixes
                 for level in weather_levels if level.startswith(prefix)]
weather_order = [level for level, _ in weather_pairs]
weather_ticks = [tick for _, tick in weather_pairs]

sns.boxplot(data = data, y = 'count', orient = 'v', ax = axes[0][0])
sns.boxplot(data = data, y = 'count', x = 'season',     orient = 'v', ax = axes[0][1])
sns.boxplot(data = data, y = 'count', x = 'hour',       orient = 'v', ax = axes[1][0])
sns.boxplot(data = data, y = 'count', x = 'workingday', orient = 'v', ax = axes[1][1])
sns.boxplot(data = data, y = 'count', x = 'month',      orient = 'v', ax = axes[2][0])
sns.boxplot(data = data, y = 'count', x = 'weather',    orient = 'v', ax = axes[2][1],
            order = weather_order)

# Each title names the feature actually shown on that subplot.
axes[0, 0].set(ylabel = 'Count', title = 'Box Plot On Count')
axes[0, 1].set(ylabel = 'Count', xlabel = 'Season',     title = 'Box Plot On Count Across Season')
axes[1, 0].set(ylabel = 'Count', xlabel = 'Hour',       title = 'Box Plot On Count Across Hour')
axes[1, 1].set(ylabel = 'Count', xlabel = 'WorkingDay', title = 'Box Plot On Count Across Working Day')
axes[2, 0].set(ylabel = 'Count', xlabel = 'Month',      title = 'Box Plot On Count Across Month')
axes[2, 1].set(ylabel = 'Count', xlabel = 'Weather',    title = 'Box Plot On Count Across Weather',
              xticklabels = weather_ticks)
plt.tight_layout()

在這裏插入圖片描述

4.2 剔除離羣點

# Keep only the rows whose `count` lies within three standard deviations of
# the mean; everything further out is treated as an outlier.
count_col = data['count']
deviation = (count_col - count_col.mean()).abs()
threshold = 3 * count_col.std()
WithoutOutliers = data[deviation <= threshold]

# Compare the frame dimensions before and after the filter.
print('Shape of the before outliers:', data.shape)
print('Shape of the after outliers:', WithoutOutliers.shape)
WithoutOutliers.head(3)

在這裏插入圖片描述

五. 變量間相關係數

5.1 Heatmap

# Pairwise correlations between the numeric weather/usage columns.
datacorr = data[["temp","atemp","casual","registered","humidity","windspeed","count"]].corr()

# Hide the strict upper triangle (it mirrors the lower one). An explicit
# boolean mask is used instead of a float copy of the correlation matrix:
# relying on the truthiness of the correlation values would leave any cell
# whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones(datacorr.shape, dtype = bool), k = 1)

plt.figure(figsize = (10, 8))
sns.heatmap(datacorr, mask = mask, vmax = 0.8, square = True, annot = True)

在這裏插入圖片描述

5.2 Seaborn.regplot

# Scatter plot with fitted regression line of `count` against each continuous
# weather feature, one panel per feature.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 4))
for feature, panel in zip(('temp', 'windspeed', 'humidity'), (ax1, ax2, ax3)):
    # Small markers keep the dense scatter readable.
    sns.regplot(x=feature, y='count', data=data, ax=panel, scatter_kws={'s': 4})

在這裏插入圖片描述

六. 標籤與特徵變化可視化

# Four stacked panels showing how the average rental count varies by month,
# by hour per season, by hour per weekday, and by hour per user type.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows = 4, figsize = (12, 20))
monthOrder=['January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December']
weekOrder=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Panel 1: mean count per month. The rows stay in calendar order so the bars
# line up with the month-name tick labels applied below.
monthdata = data.groupby('month')['count'].mean().reset_index()
sns.barplot(data = monthdata, x = 'month', y = 'count', ax = ax1)
ax1.set(xlabel = 'Month', ylabel = 'Count', title = 'Average Count by Month')
ax1.set_xticklabels(monthOrder)

# Panel 2: mean count per hour, one line per season.
hourdata = data.groupby(['hour', 'season'])['count'].mean().reset_index()
sns.pointplot(data = hourdata, x = 'hour', y = 'count', hue = 'season', ax = ax2)
ax2.set(xlabel = 'Hour', ylabel = 'Count', title = 'Average Count by Hour of Season')

# Panel 3: mean count per hour, one line per weekday (Monday first).
weekdata = data.groupby(['hour', 'weekday'], sort = True)['count'].mean().reset_index()
sns.pointplot(data = weekdata, x = 'hour', y = 'count', hue = 'weekday', ax = ax3,
              hue_order = weekOrder, palette = sns.color_palette("hls", 7))
ax3.set(xlabel = 'Hour', ylabel = 'Count', title = 'Average Count by Hour of Weekday')

# Panel 4: casual vs. registered users. pandas.melt() is the inverse of
# pandas.pivot(): it stacks the two user-type columns into long form
# ('variable' holds the column name, 'value' holds the count).
casuals = pd.melt(data[['hour', 'casual', 'registered']], id_vars = ['hour'], value_vars = ['casual', 'registered'])
casualdata = casuals.groupby(['hour', 'variable'], sort = True)['value'].mean().reset_index()
sns.pointplot(data = casualdata, x = 'hour', y = 'value', hue = 'variable', ax = ax4)
ax4.set(xlabel = 'Hour', ylabel = 'Count', title = 'Average Count by Hour of User Type')

plt.tight_layout()

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章