import pandas as pd
# Load the college dataset, group it by STABBR and RELAFFIL, and compute
# count/min/max for the UGDS and SATMTMID columns (per the original note:
# statistics about undergraduates and SAT math scores).
college = pd.read_csv('data/college.csv')
# Select multiple columns from the GroupBy with a *list* (double brackets).
# The bare-tuple form ['UGDS', 'SATMTMID'] was deprecated in pandas 0.25 and
# raises an error in pandas >= 1.0.
cg = (
    college.groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATMTMID']]
    .agg(['count', 'min', 'max'])
    .head(6)  # keep only the first six groups so the displayed table stays small
)
cg
|
|
UGDS |
SATMTMID |
|
|
count |
min |
max |
count |
min |
max |
STABBR |
RELAFFIL |
|
|
|
|
|
|
AK |
0 |
7 |
109.0 |
12865.0 |
0 |
NaN |
NaN |
1 |
3 |
27.0 |
275.0 |
1 |
503.0 |
503.0 |
AL |
0 |
71 |
12.0 |
29851.0 |
13 |
420.0 |
590.0 |
1 |
18 |
13.0 |
3033.0 |
8 |
400.0 |
560.0 |
AR |
0 |
68 |
18.0 |
21405.0 |
9 |
427.0 |
565.0 |
1 |
14 |
20.0 |
4485.0 |
7 |
495.0 |
600.0 |
# Both row-index levels already have names, but the column levels do not.
# Use rename_axis to give the two column levels names.
column_level_names = ['AGG_COLS', 'AGG_FUNCS']
cg = cg.rename_axis(column_level_names, axis=1)
cg
|
AGG_COLS |
UGDS |
SATMTMID |
|
AGG_FUNCS |
count |
min |
max |
count |
min |
max |
STABBR |
RELAFFIL |
|
|
|
|
|
|
AK |
0 |
7 |
109.0 |
12865.0 |
0 |
NaN |
NaN |
1 |
3 |
27.0 |
275.0 |
1 |
503.0 |
503.0 |
AL |
0 |
71 |
12.0 |
29851.0 |
13 |
420.0 |
590.0 |
1 |
18 |
13.0 |
3033.0 |
8 |
400.0 |
560.0 |
AR |
0 |
68 |
18.0 |
21405.0 |
9 |
427.0 |
565.0 |
1 |
14 |
20.0 |
4485.0 |
7 |
495.0 |
600.0 |
# Move the AGG_FUNCS column level into the row index.
cg.stack(level='AGG_FUNCS').head(5)
|
|
AGG_COLS |
UGDS |
SATMTMID |
STABBR |
RELAFFIL |
AGG_FUNCS |
|
|
AK |
0 |
count |
7.0 |
0.0 |
min |
109.0 |
NaN |
max |
12865.0 |
NaN |
1 |
count |
3.0 |
1.0 |
min |
27.0 |
503.0 |
# By default, stack places the moved column level at the innermost position of
# the row index; swaplevel can be used to change the level order.
(
    cg.stack(level='AGG_FUNCS')
    .swaplevel('AGG_FUNCS', 'STABBR', axis=0)
    .head(5)
)
|
|
AGG_COLS |
UGDS |
SATMTMID |
AGG_FUNCS |
RELAFFIL |
STABBR |
|
|
count |
0 |
AK |
7.0 |
0.0 |
min |
0 |
AK |
109.0 |
NaN |
max |
0 |
AK |
12865.0 |
NaN |
count |
1 |
AK |
3.0 |
1.0 |
min |
1 |
AK |
27.0 |
503.0 |
# Building on the previous step, additionally sort both axes with sort_index.
# (swaplevel reorders the levels of a hierarchical index.)
(
    cg.stack(level='AGG_FUNCS')
    .swaplevel('AGG_FUNCS', 'STABBR', axis=0)
    .sort_index(level='RELAFFIL', axis=0)
    .sort_index(level='AGG_COLS', axis=1)
    .head(6)
)
|
|
AGG_COLS |
SATMTMID |
UGDS |
AGG_FUNCS |
RELAFFIL |
STABBR |
|
|
count |
0 |
AK |
0.0 |
7.0 |
AL |
13.0 |
71.0 |
AR |
9.0 |
68.0 |
max |
0 |
AK |
NaN |
12865.0 |
AL |
590.0 |
29851.0 |
AR |
565.0 |
21405.0 |
# Stack one level while unstacking others: AGG_FUNCS moves from the columns
# into the rows, and RELAFFIL / STABBR move from the rows into the columns.
cg.stack(level='AGG_FUNCS').unstack(level=['RELAFFIL', 'STABBR'])
AGG_COLS |
UGDS |
SATMTMID |
RELAFFIL |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
1 |
STABBR |
AK |
AK |
AL |
AL |
AR |
AR |
AK |
AK |
AL |
AL |
AR |
AR |
AGG_FUNCS |
|
|
|
|
|
|
|
|
|
|
|
|
count |
7.0 |
3.0 |
71.0 |
18.0 |
68.0 |
14.0 |
0.0 |
1.0 |
13.0 |
8.0 |
9.0 |
7.0 |
min |
109.0 |
27.0 |
12.0 |
13.0 |
18.0 |
20.0 |
NaN |
503.0 |
420.0 |
400.0 |
427.0 |
495.0 |
max |
12865.0 |
275.0 |
29851.0 |
3033.0 |
21405.0 |
4485.0 |
NaN |
503.0 |
590.0 |
560.0 |
565.0 |
600.0 |
# Stacking every column level returns a Series.
cg.stack(level=['AGG_FUNCS', 'AGG_COLS']).head(n=12)
'''
STABBR RELAFFIL AGG_FUNCS AGG_COLS
AK 0 count UGDS 7.0
SATMTMID 0.0
min UGDS 109.0
max UGDS 12865.0
1 count UGDS 3.0
SATMTMID 1.0
min UGDS 27.0
SATMTMID 503.0
max UGDS 275.0
SATMTMID 503.0
AL 0 count UGDS 71.0
SATMTMID 13.0
dtype: float64
'''
# Remove the names of every level on both the row index and the column index.
(
    cg.rename_axis([None, None], axis=0)
    .rename_axis([None, None], axis=1)
)
|
|
UGDS |
SATMTMID |
|
|
count |
min |
max |
count |
min |
max |
AK |
0 |
7 |
109.0 |
12865.0 |
0 |
NaN |
NaN |
1 |
3 |
27.0 |
275.0 |
1 |
503.0 |
503.0 |
AL |
0 |
71 |
12.0 |
29851.0 |
13 |
420.0 |
590.0 |
1 |
18 |
13.0 |
3033.0 |
8 |
400.0 |
560.0 |
AR |
0 |
68 |
18.0 |
21405.0 |
9 |
427.0 |
565.0 |
1 |
14 |
20.0 |
4485.0 |
7 |
495.0 |
600.0 |