用 groupby 配合 unstack,可以实现与 pivot_table 相同的效果

# Load the flights dataset from the CSV file into a DataFrame and display it.
flights = pd.read_csv('data/flights.csv')
flights
  MONTH DAY WEEKDAY AIRLINE ORG_AIR DEST_AIR SCHED_DEP DEP_DELAY AIR_TIME DIST SCHED_ARR ARR_DELAY DIVERTED CANCELLED
0 1 1 4 WN LAX SLC 1625 58.0 94.0 590 1905 65.0 0 0
1 1 1 4 UA DEN IAD 823 7.0 154.0 1452 1333 -13.0 0 0
2 1 1 4 MQ DFW VPS 1305 36.0 85.0 641 1453 35.0 0 0
3 1 1 4 AA DFW DCA 1555 7.0 126.0 1192 1935 -7.0 0 0
4 1 1 4 WN LAX MCI 1720 48.0 166.0 1363 2225 39.0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58487 12 31 4 AA SFO DFW 515 5.0 166.0 1464 1045 -19.0 0 0
58488 12 31 4 F9 LAS SFO 1910 13.0 71.0 414 2050 4.0 0 0
58489 12 31 4 OO SFO SBA 1846 -6.0 46.0 262 1956 -5.0 0 0
58490 12 31 4 WN MSP ATL 525 39.0 124.0 907 855 34.0 0 0
58491 12 31 4 OO SFO BOI 859 5.0 73.0 522 1146 -1.0 0 0

58492 rows × 14 columns

# Use pivot_table to total the cancelled flights for each airline (rows)
# at each origin airport (columns); pairs with no flights are filled with 0.
fp = flights.pivot_table(
    values='CANCELLED',
    index='AIRLINE',
    columns='ORG_AIR',
    aggfunc='sum',
    fill_value=0,
).round(2)
fp.head()
ORG_AIR ATL DEN DFW IAH LAS LAX MSP ORD PHX SFO
AIRLINE                    
AA 3 4 86 3 3 11 3 35 4 2
AS 0 0 0 0 0 0 0 0 0 0
B6 0 0 0 0 0 0 0 0 0 1
DL 28 1 0 0 1 1 4 0 1 2
EV 18 6 27 36 0 0 6 53 0 0
# A groupby aggregation alone cannot reproduce this table directly.
# First group by every column that the pivot table uses as index or columns,
# then sum the CANCELLED flag within each (AIRLINE, ORG_AIR) group.
fg = (
    flights
    .groupby(by=['AIRLINE', 'ORG_AIR'])['CANCELLED']
    .sum()
)
fg.head()
'''
AIRLINE  ORG_AIR
AA       ATL         3
         DEN         4
         DFW        86
         IAH         3
         LAS         3
Name: CANCELLED, dtype: int64
'''
# Then unstack the ORG_AIR index level so that its values become the column
# labels; combinations missing from the grouped result are filled with 0.
fg_unstack = fg.unstack(level='ORG_AIR', fill_value=0)
fg_unstack.head()
ORG_AIR ATL DEN DFW IAH LAS LAX MSP ORD PHX SFO
AIRLINE                    
AA 3 4 86 3 3 11 3 35 4 2
AS 0 0 0 0 0 0 0 0 0 0
B6 0 0 0 0 0 0 0 0 0 1
DL 28 1 0 0 1 1 4 0 1 2
EV 18 6 27 36 0 0 6 53 0 0
# Check whether the two approaches produce the same table.
# fg_unstack is rebuilt here so that this cell can be run on its own.
fg_unstack = fg.unstack(level='ORG_AIR', fill_value=0)
fp.equals(fg_unstack)
# True
import numpy as np
# Build a somewhat more complex pivot table first: rows are (AIRLINE, MONTH),
# columns are (ORG_AIR, CANCELLED), and both the mean and the sum of DEP_DELAY
# and DIST are computed for every cell.
# The aggregations are passed by name ('mean', 'sum') rather than as the NumPy
# callables: the resulting column labels are the same, and recent pandas
# releases warn when np.mean / np.sum are passed to an aggregation.
fp2 = flights.pivot_table(index=['AIRLINE', 'MONTH'],
                          columns=['ORG_AIR', 'CANCELLED'],
                          values=['DEP_DELAY', 'DIST'],
                          aggfunc=['mean', 'sum'],
                          fill_value=0)
fp2.head()
  mean   ... sum
    DEP_DELAY ... DIST
  ORG_AIR ATL DEN DFW IAH LAS ... LAX MSP ORD PHX SFO
  CANCELLED 0 1 0 1 0 1 0 1 0 1 ... 0 1 0 1 0 1 0 1 0 1
AIRLINE MONTH                                          
AA 1 -3.250000 0 7.062500 0 11.977591 -3.0 9.750000 0 32.375000 0 ... 135921 2475 7281 0 129334 0 21018 0 33483 0
2 -3.000000 0 5.461538 0 8.756579 0.0 1.000000 0 -3.055556 0 ... 113483 5454 5040 0 120572 5398 17049 868 32110 2586
3 -0.166667 0 7.666667 0 15.383784 0.0 10.900000 0 12.074074 0 ... 131836 1744 14471 0 127072 802 25770 0 43580 0
4 0.071429 0 20.266667 0 10.501493 0.0 6.933333 0 27.241379 0 ... 170285 0 4541 0 152154 4718 17727 0 51054 0
5 5.777778 0 23.466667 0 16.798780 0.0 3.055556 0 2.818182 0 ... 167484 0 6298 0 110864 1999 11164 0 40233 0

5 rows × 80 columns

# Reproduce the pivot table above with groupby and unstack.
# Steps:
#   1. group by every column used as index or columns in the pivot table;
#   2. select the value columns with a *list* (double brackets) -- selecting
#      them with a bare tuple, ['DEP_DELAY', 'DIST'], is rejected by
#      pandas 2.0 and later;
#   3. aggregate with both mean and sum;
#   4. move ORG_AIR and CANCELLED from the row index into the columns;
#   5. swap the two outermost column levels so that the aggregation name
#      comes before the value-column name, as in the pivot table.
# NOTE: in the printed result some mean cells are NaN where the pivot table
# shows 0; unstack's fill_value does not replace them. Append .fillna(0) to
# the chain if zeros are wanted there as well.
(
    flights
    .groupby(['AIRLINE', 'MONTH', 'ORG_AIR', 'CANCELLED'])[['DEP_DELAY', 'DIST']]
    .agg(['mean', 'sum'])
    .unstack(['ORG_AIR', 'CANCELLED'], fill_value=0)
    .swaplevel(0, 1, axis='columns')
    .head()
)
  mean   ... sum
    DEP_DELAY ... DIST
  ORG_AIR ATL DEN DFW IAH LAS ... LAX MSP ORD PHX SFO
  CANCELLED 0 1 0 1 0 1 0 1 0 1 ... 0 1 0 1 0 1 0 1 0 1
AIRLINE MONTH                                          
AA 1 -3.250000 0.0 7.062500 0.0 11.977591 -3.0 9.750000 0.0 32.375000 0.0 ... 135921 2475 7281 0 129334 0 21018 0 33483 0
2 -3.000000 NaN 5.461538 NaN 8.756579 NaN 1.000000 NaN -3.055556 NaN ... 113483 5454 5040 0 120572 5398 17049 868 32110 2586
3 -0.166667 NaN 7.666667 0.0 15.383784 NaN 10.900000 0.0 12.074074 0.0 ... 131836 1744 14471 0 127072 802 25770 0 43580 0
4 0.071429 0.0 20.266667 0.0 10.501493 NaN 6.933333 0.0 27.241379 0.0 ... 170285 0 4541 0 152154 4718 17727 0 51054 0
5 5.777778 0.0 23.466667 NaN 16.798780 NaN 3.055556 NaN 2.818182 0.0 ... 167484 0 6298 0 110864 1999 11164 0 40233 0

5 rows × 80 columns

猜你喜欢

转载自blog.csdn.net/weixin_48135624/article/details/114157129
今日推荐