# Load the flights dataset from the CSV file into a DataFrame and display it.
flights = pd.read_csv(filepath_or_buffer='data/flights.csv')
flights
|
MONTH |
DAY |
WEEKDAY |
AIRLINE |
ORG_AIR |
DEST_AIR |
SCHED_DEP |
DEP_DELAY |
AIR_TIME |
DIST |
SCHED_ARR |
ARR_DELAY |
DIVERTED |
CANCELLED |
0 |
1 |
1 |
4 |
WN |
LAX |
SLC |
1625 |
58.0 |
94.0 |
590 |
1905 |
65.0 |
0 |
0 |
1 |
1 |
1 |
4 |
UA |
DEN |
IAD |
823 |
7.0 |
154.0 |
1452 |
1333 |
-13.0 |
0 |
0 |
2 |
1 |
1 |
4 |
MQ |
DFW |
VPS |
1305 |
36.0 |
85.0 |
641 |
1453 |
35.0 |
0 |
0 |
3 |
1 |
1 |
4 |
AA |
DFW |
DCA |
1555 |
7.0 |
126.0 |
1192 |
1935 |
-7.0 |
0 |
0 |
4 |
1 |
1 |
4 |
WN |
LAX |
MCI |
1720 |
48.0 |
166.0 |
1363 |
2225 |
39.0 |
0 |
0 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
58487 |
12 |
31 |
4 |
AA |
SFO |
DFW |
515 |
5.0 |
166.0 |
1464 |
1045 |
-19.0 |
0 |
0 |
58488 |
12 |
31 |
4 |
F9 |
LAS |
SFO |
1910 |
13.0 |
71.0 |
414 |
2050 |
4.0 |
0 |
0 |
58489 |
12 |
31 |
4 |
OO |
SFO |
SBA |
1846 |
-6.0 |
46.0 |
262 |
1956 |
-5.0 |
0 |
0 |
58490 |
12 |
31 |
4 |
WN |
MSP |
ATL |
525 |
39.0 |
124.0 |
907 |
855 |
34.0 |
0 |
0 |
58491 |
12 |
31 |
4 |
OO |
SFO |
BOI |
859 |
5.0 |
73.0 |
522 |
1146 |
-1.0 |
0 |
0 |
58492 rows × 14 columns
# Use pivot_table to total the cancelled flights for every airline (rows)
# and origin airport (columns); combinations with no data are filled with 0.
# The trailing round(2) is kept from the original; the sums shown in the
# output are integers, so it appears to have no visible effect.
fp = flights.pivot_table(
    values='CANCELLED',
    index='AIRLINE',
    columns='ORG_AIR',
    aggfunc='sum',
    fill_value=0,
).round(2)
fp.head()
ORG_AIR |
ATL |
DEN |
DFW |
IAH |
LAS |
LAX |
MSP |
ORD |
PHX |
SFO |
AIRLINE |
|
|
|
|
|
|
|
|
|
|
AA |
3 |
4 |
86 |
3 |
3 |
11 |
3 |
35 |
4 |
2 |
AS |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
B6 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
DL |
28 |
1 |
0 |
0 |
1 |
1 |
4 |
0 |
1 |
2 |
EV |
18 |
6 |
27 |
36 |
0 |
0 |
6 |
53 |
0 |
0 |
# A groupby aggregation cannot reproduce the pivot table in one step.
# First group by every column that served as the pivot's index or columns,
# then sum CANCELLED within each (AIRLINE, ORG_AIR) group.
fg = flights.groupby(by=['AIRLINE', 'ORG_AIR'])['CANCELLED'].agg('sum')
fg.head()
'''
AIRLINE ORG_AIR
AA ATL 3
DEN 4
DFW 86
IAH 3
LAS 3
Name: CANCELLED, dtype: int64
'''
# Then unstack the ORG_AIR index level so that its values become the column
# labels; combinations missing from the grouped result are filled with 0.
fg_unstack = fg.unstack(level='ORG_AIR', fill_value=0)
fg_unstack.head()
ORG_AIR |
ATL |
DEN |
DFW |
IAH |
LAS |
LAX |
MSP |
ORD |
PHX |
SFO |
AIRLINE |
|
|
|
|
|
|
|
|
|
|
AA |
3 |
4 |
86 |
3 |
3 |
11 |
3 |
35 |
4 |
2 |
AS |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
B6 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
DL |
28 |
1 |
0 |
0 |
1 |
1 |
4 |
0 |
1 |
2 |
EV |
18 |
6 |
27 |
36 |
0 |
0 |
6 |
53 |
0 |
0 |
# Check whether the two approaches produce the same table.
fg_unstack = fg.unstack(level='ORG_AIR', fill_value=0)
fp.equals(other=fg_unstack)
# True
import numpy as np
# Build a more involved pivot table first: two row-index levels
# (AIRLINE, MONTH), two column levels (ORG_AIR, CANCELLED), two value columns
# (DEP_DELAY, DIST) and two aggregations; empty cells are filled with 0.
# The aggregations are given by name ('mean', 'sum') rather than as NumPy
# callables: this matches the groupby reproduction later in the file and
# avoids the callable-aggfunc deprecation warning in recent pandas releases.
fp2 = flights.pivot_table(index=['AIRLINE', 'MONTH'],
                          columns=['ORG_AIR', 'CANCELLED'],
                          values=['DEP_DELAY', 'DIST'],
                          aggfunc=['mean', 'sum'],
                          fill_value=0)
fp2.head()
|
mean |
|
... |
sum |
|
|
DEP_DELAY |
... |
DIST |
|
ORG_AIR |
ATL |
DEN |
DFW |
IAH |
LAS |
... |
LAX |
MSP |
ORD |
PHX |
SFO |
|
CANCELLED |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
... |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
AIRLINE |
MONTH |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AA |
1 |
-3.250000 |
0 |
7.062500 |
0 |
11.977591 |
-3.0 |
9.750000 |
0 |
32.375000 |
0 |
... |
135921 |
2475 |
7281 |
0 |
129334 |
0 |
21018 |
0 |
33483 |
0 |
2 |
-3.000000 |
0 |
5.461538 |
0 |
8.756579 |
0.0 |
1.000000 |
0 |
-3.055556 |
0 |
... |
113483 |
5454 |
5040 |
0 |
120572 |
5398 |
17049 |
868 |
32110 |
2586 |
3 |
-0.166667 |
0 |
7.666667 |
0 |
15.383784 |
0.0 |
10.900000 |
0 |
12.074074 |
0 |
... |
131836 |
1744 |
14471 |
0 |
127072 |
802 |
25770 |
0 |
43580 |
0 |
4 |
0.071429 |
0 |
20.266667 |
0 |
10.501493 |
0.0 |
6.933333 |
0 |
27.241379 |
0 |
... |
170285 |
0 |
4541 |
0 |
152154 |
4718 |
17727 |
0 |
51054 |
0 |
5 |
5.777778 |
0 |
23.466667 |
0 |
16.798780 |
0.0 |
3.055556 |
0 |
2.818182 |
0 |
... |
167484 |
0 |
6298 |
0 |
110864 |
1999 |
11164 |
0 |
40233 |
0 |
5 rows × 80 columns
# Reproduce the pivot table above with groupby + unstack.
# The value columns are selected with a list (double brackets): indexing a
# groupby object with a bare tuple of column names is rejected by current
# pandas versions.
# After agg() the column levels are (value, aggregation); swaplevel puts the
# aggregation name on top so the layout follows the pivot table's.
# NOTE(review): the output shown below still has NaN in some mean/DEP_DELAY
# cells where the pivot table shows 0 -- presumably groups whose DEP_DELAY
# values are all missing; chain .fillna(0) if an exact match is required.
(
    flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR', 'CANCELLED'])[['DEP_DELAY', 'DIST']]
    .agg(['mean', 'sum'])
    .unstack(['ORG_AIR', 'CANCELLED'], fill_value=0)
    .swaplevel(0, 1, axis='columns')
    .head()
)
|
mean |
|
... |
sum |
|
|
DEP_DELAY |
... |
DIST |
|
ORG_AIR |
ATL |
DEN |
DFW |
IAH |
LAS |
... |
LAX |
MSP |
ORD |
PHX |
SFO |
|
CANCELLED |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
... |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
AIRLINE |
MONTH |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
AA |
1 |
-3.250000 |
0.0 |
7.062500 |
0.0 |
11.977591 |
-3.0 |
9.750000 |
0.0 |
32.375000 |
0.0 |
... |
135921 |
2475 |
7281 |
0 |
129334 |
0 |
21018 |
0 |
33483 |
0 |
2 |
-3.000000 |
NaN |
5.461538 |
NaN |
8.756579 |
NaN |
1.000000 |
NaN |
-3.055556 |
NaN |
... |
113483 |
5454 |
5040 |
0 |
120572 |
5398 |
17049 |
868 |
32110 |
2586 |
3 |
-0.166667 |
NaN |
7.666667 |
0.0 |
15.383784 |
NaN |
10.900000 |
0.0 |
12.074074 |
0.0 |
... |
131836 |
1744 |
14471 |
0 |
127072 |
802 |
25770 |
0 |
43580 |
0 |
4 |
0.071429 |
0.0 |
20.266667 |
0.0 |
10.501493 |
NaN |
6.933333 |
0.0 |
27.241379 |
0.0 |
... |
170285 |
0 |
4541 |
0 |
152154 |
4718 |
17727 |
0 |
51054 |
0 |
5 |
5.777778 |
0.0 |
23.466667 |
NaN |
16.798780 |
NaN |
3.055556 |
NaN |
2.818182 |
0.0 |
... |
167484 |
0 |
6298 |
0 |
110864 |
1999 |
11164 |
0 |
40233 |
0 |
5 rows × 80 columns