數據來源
https://fivethirtyeight.com/politics/
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Absolute path of the flights CSV on the author's machine.
link = '/Users/bennyrhys/Desktop/數據分析可視化-數據集/homework/usa_flights.csv'
# Load the CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=link)
# Preview the first five rows.
df.head(n=5)
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
0 |
02/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-19.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
381.0 |
1 |
03/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-39.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
358.0 |
2 |
04/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-12.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
385.0 |
3 |
05/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-8.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
389.0 |
4 |
06/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
25.0 |
0 |
2475 |
0.0 |
0.0 |
0.0 |
25.0 |
0.0 |
424.0 |
# Preview the last five rows of the loaded flights table.
df.tail(n=5)
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
201659 |
10/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-16.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
77.0 |
201660 |
11/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-4.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
87.0 |
201661 |
12/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-7.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
82.0 |
201662 |
13/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
23.0 |
0 |
407 |
3.0 |
0.0 |
0.0 |
20.0 |
0.0 |
103.0 |
201663 |
14/01/2015 0:00 |
NK |
188 |
OAK |
LAS |
-7.0 |
0 |
407 |
NaN |
NaN |
NaN |
NaN |
NaN |
82.0 |
# (row count, column count) of the flights table.
df.shape
(201664, 14)
延誤的判斷標準:arr_delay > 0 即視為延誤
依到達延誤時間(arr_delay)遞減排序,取前十名
# Order flights by arrival delay, largest first, and keep the first ten rows.
df.sort_values(by='arr_delay', ascending=False).head(10)
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
11073 |
11/01/2015 0:00 |
AA |
1595 |
AUS |
DFW |
1444.0 |
0 |
190 |
1444.0 |
0.0 |
0.0 |
0.0 |
0.0 |
59.0 |
10214 |
13/01/2015 0:00 |
AA |
1487 |
OMA |
DFW |
1392.0 |
0 |
583 |
1392.0 |
0.0 |
0.0 |
0.0 |
0.0 |
117.0 |
12430 |
03/01/2015 0:00 |
AA |
1677 |
MEM |
DFW |
1384.0 |
0 |
432 |
1380.0 |
0.0 |
0.0 |
4.0 |
0.0 |
104.0 |
8443 |
04/01/2015 0:00 |
AA |
1279 |
OMA |
DFW |
1237.0 |
0 |
583 |
1222.0 |
0.0 |
15.0 |
0.0 |
0.0 |
102.0 |
10328 |
05/01/2015 0:00 |
AA |
1495 |
EGE |
DFW |
1187.0 |
0 |
721 |
1019.0 |
0.0 |
168.0 |
0.0 |
0.0 |
127.0 |
36570 |
04/01/2015 0:00 |
DL |
1435 |
MIA |
MSP |
1174.0 |
0 |
1501 |
1174.0 |
0.0 |
0.0 |
0.0 |
0.0 |
231.0 |
36495 |
04/01/2015 0:00 |
DL |
1367 |
ROC |
ATL |
1138.0 |
0 |
749 |
1112.0 |
0.0 |
0.0 |
26.0 |
0.0 |
171.0 |
59072 |
14/01/2015 0:00 |
DL |
1687 |
SAN |
MSP |
1084.0 |
0 |
1532 |
1070.0 |
0.0 |
0.0 |
14.0 |
0.0 |
240.0 |
32173 |
05/01/2015 0:00 |
AA |
970 |
LAS |
LAX |
1042.0 |
0 |
236 |
1033.0 |
0.0 |
9.0 |
0.0 |
0.0 |
66.0 |
56488 |
12/01/2015 0:00 |
DL |
2117 |
ATL |
COS |
1016.0 |
0 |
1184 |
1016.0 |
0.0 |
0.0 |
0.0 |
0.0 |
193.0 |
計算延誤與未延誤航班所佔的比例(先查看取消航班的數量)
# Count how many flights carry each value of the `cancelled` flag.
df.loc[:, 'cancelled'].value_counts()
0 196873
1 4791
Name: cancelled, dtype: int64
# Flag a flight as delayed when its arrival delay is strictly positive.
# The vectorized comparison replaces the per-row lambda; a missing (NaN)
# arr_delay compares as False, exactly as it did with the lambda.
df['delayed'] = df['arr_delay'] > 0
# Preview the table with the new boolean column appended.
df.head()
|
flight_date |
unique_carrier |
flight_num |
origin |
dest |
arr_delay |
cancelled |
distance |
carrier_delay |
weather_delay |
late_aircraft_delay |
nas_delay |
security_delay |
actual_elapsed_time |
delayed |
0 |
02/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-19.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
381.0 |
False |
1 |
03/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-39.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
358.0 |
False |
2 |
04/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-12.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
385.0 |
False |
3 |
05/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
-8.0 |
0 |
2475 |
NaN |
NaN |
NaN |
NaN |
NaN |
389.0 |
False |
4 |
06/01/2015 0:00 |
AA |
1 |
JFK |
LAX |
25.0 |
0 |
2475 |
0.0 |
0.0 |
0.0 |
25.0 |
0.0 |
424.0 |
True |
# Number of flights per value of the boolean `delayed` flag.
delay_data = df.loc[:, 'delayed'].value_counts()
delay_data
False 103037
True 98627
Name: delayed, dtype: int64
# Inspect the container type returned by value_counts().
type(delay_data)
pandas.core.series.Series
# Share of flights that arrived late: count labelled True over the total count.
# The count is looked up by its boolean label instead of the integer keys 0/1:
# integer keys on a boolean-labelled Series only work through positional
# fallback, so the result depended on which class value_counts() listed first.
# .get(..., 0) also covers the case where no flight is delayed.
delay_data.get(True, 0) / delay_data.sum()
0.4890659711202793
每一個航空公司延誤的情況
# Group flights by carrier code and by the boolean `delayed` flag.
delay_group = df.groupby(by=['unique_carrier', 'delayed'])
delay_group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ff50710>
# Number of flights in each (carrier, delayed) group.
delay_group.size()
unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64
# Reshape the per-group counts into one row per carrier with a False column
# (not delayed) and a True column (delayed).
# fill_value=0 keeps integer counts when a carrier has no flights in one of
# the two classes; a plain unstack() would leave NaN there and turn the
# columns into floats.
df_delay = delay_group.size().unstack(fill_value=0)
df_delay
delayed |
False |
True |
unique_carrier |
|
|
AA |
8912 |
9841 |
AS |
3527 |
2104 |
B6 |
4832 |
4401 |
DL |
17719 |
9803 |
EV |
10596 |
11371 |
F9 |
1103 |
1848 |
HA |
1351 |
1354 |
MQ |
4692 |
8060 |
NK |
1550 |
2133 |
OO |
9977 |
10804 |
UA |
7885 |
8624 |
US |
7850 |
6353 |
VX |
1254 |
781 |
WN |
21789 |
21150 |
import matplotlib.pyplot as plt
# Default line plot: one line per `delayed` value, carriers along the x axis.
df_delay.plot(kind='line')
<matplotlib.axes._subplots.AxesSubplot at 0x1210efb50>
# Render the figure created so far.
plt.show()
# Horizontal stacked bars: one bar per carrier, split into not-delayed / delayed counts.
df_delay.plot.barh(stacked=True, figsize=(16, 6), colormap='winter')
<matplotlib.axes._subplots.AxesSubplot at 0x11c9e2290>
透視表(pivot_table)功能:按日期與航空公司彙總各數值欄位的平均值
# Mean of every numeric/boolean column per flight date (rows) and carrier (columns).
# The value columns are selected explicitly: with `values` left unset,
# pivot_table also tries to average the text columns (origin, dest), which
# recent pandas (2.x) rejects with a TypeError instead of silently dropping them.
flights_by_carrier = df.pivot_table(
    index='flight_date',
    columns='unique_carrier',
    values=df.select_dtypes(include=['number', 'bool']).columns.tolist(),
)
# Preview the first five dates of the pivoted table.
flights_by_carrier.head()
|
actual_elapsed_time |
... |
weather_delay |
unique_carrier |
AA |
AS |
B6 |
DL |
EV |
F9 |
HA |
MQ |
NK |
OO |
... |
EV |
F9 |
HA |
MQ |
NK |
OO |
UA |
US |
VX |
WN |
flight_date |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
02/01/2015 0:00 |
176.852122 |
182.872117 |
174.580475 |
155.151703 |
103.325014 |
148.674603 |
108.654709 |
99.608205 |
156.775439 |
101.953434 |
... |
0.214118 |
0.000000 |
7.621849 |
4.956916 |
0.0 |
1.154150 |
0.816867 |
0.131429 |
4.413793 |
0.674080 |
03/01/2015 0:00 |
177.679298 |
189.126126 |
178.595474 |
161.668481 |
107.364508 |
158.116667 |
108.158416 |
102.381295 |
162.085106 |
107.836902 |
... |
1.605061 |
0.454545 |
0.000000 |
4.263838 |
0.0 |
1.369444 |
1.317901 |
2.938053 |
5.350000 |
1.442974 |
04/01/2015 0:00 |
178.200938 |
184.766376 |
179.517287 |
156.963620 |
104.893505 |
149.746888 |
104.878641 |
109.936095 |
162.289753 |
106.465820 |
... |
1.160754 |
0.374269 |
0.000000 |
9.286834 |
0.0 |
1.006859 |
2.996965 |
1.350000 |
6.414634 |
1.116999 |
05/01/2015 0:00 |
176.660858 |
178.226328 |
174.088000 |
142.719584 |
103.107938 |
152.104839 |
103.091787 |
108.128505 |
164.081560 |
103.141727 |
... |
4.005384 |
0.586957 |
0.000000 |
12.048822 |
0.0 |
2.732057 |
8.422122 |
0.116838 |
3.312500 |
1.370968 |
06/01/2015 0:00 |
171.155474 |
173.200483 |
175.029326 |
144.458049 |
100.694926 |
146.739837 |
100.168317 |
100.343423 |
165.410909 |
99.287270 |
... |
2.685092 |
4.866667 |
2.666667 |
7.971370 |
0.0 |
2.753521 |
6.551102 |
6.155660 |
13.615385 |
2.797213 |
5 rows × 154 columns