一. 查看数据
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import warnings
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Load the training set, report its (rows, columns) shape and preview it.
data = pd.read_csv(filepath_or_buffer='train.csv')
print(data.shape)
data.head()
二. 特征组合
从datetime列中可以提取出日期,时间,月份与周几(已经给出),是否工作日等信息
提取完成后去掉datetime这列,因为有用的信息已经全部取出
将这些类别型变量转换成category类型
# Parse the timestamp column once, then derive calendar features from it.
data['datetime'] = pd.to_datetime(data['datetime'])
data['date'] = data['datetime'].dt.date
data['hour'] = data['datetime'].dt.hour
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` yields the
# same English day names ('Monday' ... 'Sunday').
data['weekday'] = data['datetime'].dt.day_name()
data['month'] = data['datetime'].dt.month

# Replace the integer codes with readable labels.
data['season'] = data.season.map({1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'})
data['weather'] = data.weather.map({
    1: 'clear + few clouds + partly cloudy + partly cloudy',
    2: 'mist + cloudy,mist + broken clouds,mist + few clouds,mist',
    3: 'light snow,light rain + thunderstorm + scattered clouds,light rain + scattered clouds',
    4: 'heavy rain + ice pallets + thunderstorm + mist,snow + fog',
})

# Store the discrete features as pandas categoricals.
CategoryList = ['hour', 'weekday', 'month', 'season', 'weather', 'holiday', 'workingday']
for variable in CategoryList:
    data[variable] = data[variable].astype('category')

# The raw timestamp is no longer needed once its parts have been extracted.
data = data.drop(columns=['datetime'])
data.head(2)
三. 变量的类型
# Count how many columns fall into each dtype family.
float_count = data.select_dtypes(include='float').shape[1]
int_count = data.select_dtypes(include='int64').shape[1]
object_count = data.select_dtypes(include='object').shape[1]
category_count = data.select_dtypes(include='category').shape[1]

# Tabulate the counts, largest first, so the bars come out in descending order.
dataType = pd.DataFrame({
    'Type': ['float_count', 'int_count', 'object_count', 'category_count'],
    'Count': [float_count, int_count, object_count, category_count],
})
dataType = dataType.sort_values(by='Count', ascending=False)

# Bar chart of the number of columns per dtype family.
sns.set_style('darkgrid')
plt.figure(figsize=(8, 6))
sns.barplot(x='Type', y='Count', data=dataType)
plt.xlabel('VariableType')
plt.ylabel('Count')
plt.title('Variable DataType Count')
四. 离群点
4.1 观察变量
# 3x2 grid of box plots: the overall distribution of `count`, then `count`
# broken down by each categorical feature.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))

# Overall distribution of the target.
sns.boxplot(data=data, y='count', orient='v', ax=axes[0, 0])
axes[0, 0].set(ylabel='Count', title='Box Plot On Count')

# One panel per feature; each title names the feature actually plotted
# (previously every panel was titled "... Acorss Season").
feature_panels = [
    (axes[0, 1], 'season', 'Season'),
    (axes[1, 0], 'hour', 'Hour'),
    (axes[1, 1], 'workingday', 'WorkingDay'),
    (axes[2, 0], 'month', 'Month'),
]
for panel, column, label in feature_panels:
    sns.boxplot(data=data, y='count', x=column, orient='v', ax=panel)
    panel.set(ylabel='Count', xlabel=label,
              title='Box Plot On Count Across ' + label)

# Weather panel. The categories of `weather` are long descriptions and the
# boxes are drawn in category order, which is alphabetical rather than the
# order of the short tick labels. Build the plotting order and the short
# labels together, mildest to most severe, so every label sits under the
# box it describes.
weather_names = [
    ('clear', 'Clear'),
    ('mist', 'Mist'),
    ('light snow', 'Light Snow'),
    ('heavy rain', 'Heavy Rain'),
]
weather_order = []
weather_ticks = []
for prefix, short_name in weather_names:
    for level in data['weather'].cat.categories:
        if level.startswith(prefix):
            weather_order.append(level)
            weather_ticks.append(short_name)

sns.boxplot(data=data, y='count', x='weather', order=weather_order,
            orient='v', ax=axes[2, 1])
axes[2, 1].set(ylabel='Count', xlabel='Weather',
               title='Box Plot On Count Across Weather')
axes[2, 1].set_xticklabels(weather_ticks)

plt.tight_layout()
4.2 剔除离群点
# Keep only the rows whose `count` lies within three standard deviations of
# the mean; everything further out is treated as an outlier.
count_mean = data['count'].mean()
count_std = data['count'].std()
within_three_sigma = (data['count'] - count_mean).abs() <= 3 * count_std
WithoutOutliers = data[within_three_sigma]

# Compare the frame size before and after filtering.
print('Shape of the before outliers:', data.shape)
print('Shape of the after outliers:', WithoutOutliers.shape)
WithoutOutliers.head(3)
五. 变量间相关系数
5.1 Heatmap
# Pairwise correlations between the numeric features and the target.
datacorr = data[["temp", "atemp", "casual", "registered", "humidity", "windspeed", "count"]].corr()

# Explicit boolean mask: True hides a cell. Only the strict upper triangle is
# hidden, so the diagonal and lower triangle stay visible. The previous mask
# reused the correlation values themselves and relied on their truthiness,
# which would leave an exactly-zero correlation unmasked.
mask = np.triu(np.ones(datacorr.shape, dtype=bool), k=1)

plt.figure(figsize=(10, 8))
sns.heatmap(datacorr, mask=mask, vmax=0.8, square=True, annot=True)
5.2 Seaborn.regplot
# Scatter plot with a fitted regression line of `count` against each
# continuous weather feature, one panel per feature.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 4))
for feature, axis in zip(('temp', 'windspeed', 'humidity'), (ax1, ax2, ax3)):
    sns.regplot(x=feature, y='count', data=data, ax=axis, scatter_kws={'s': 4})
六. 标签与特征变化可视化
# Four stacked panels showing how the average rental count varies by month,
# by hour per season, by hour per weekday, and by hour per user type.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, figsize=(12, 20))
monthOrder = ['January', 'February', 'March', 'April', 'May', 'June',
              'July', 'August', 'September', 'October', 'November', 'December']
weekOrder = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Panel 1: mean count per month.
monthdata = data.groupby('month')['count'].mean().reset_index()
sns.barplot(data=monthdata, x='month', y='count', ax=ax1)
ax1.set(xlabel='Month', ylabel='Count', title='Average Count by Month')
# NOTE(review): assumes all twelve months are present, in calendar order.
ax1.set_xticklabels(monthOrder)

# Panel 2: mean count per hour, one line per season.
# `join=True` is the pointplot default and the keyword is deprecated in
# recent seaborn releases, so it is no longer passed.
hourdata = data.groupby(['hour', 'season'])['count'].mean().reset_index()
sns.pointplot(data=hourdata, x='hour', y='count', hue='season', ax=ax2)
ax2.set(xlabel='Hour', ylabel='Count', title='Average Count by Hour of Season')

# Panel 3: mean count per hour, one line per weekday (Monday first).
weekdata = data.groupby(['hour', 'weekday'], sort=True)['count'].mean().reset_index()
sns.pointplot(data=weekdata, x='hour', y='count', hue='weekday', ax=ax3,
              hue_order=weekOrder, palette=sns.color_palette("hls", 7))
ax3.set(xlabel='Hour', ylabel='Count', title='Average Count by Hour of Weekday')

# Panel 4: mean count per hour, casual vs. registered users. The two count
# columns are melted into long form so they can be used as a hue variable.
casuals = pd.melt(data[['hour', 'casual', 'registered']], id_vars=['hour'],
                  value_vars=['casual', 'registered'])
casualdata = casuals.groupby(['hour', 'variable'], sort=True)['value'].mean().reset_index()
sns.pointplot(data=casualdata, x='hour', y='value', hue='variable', ax=ax4)
ax4.set(xlabel='Hour', ylabel='Count', title='Average Count by Hour of User Type')

plt.tight_layout()