import pandas as pd
# Read the college dataset with the institution name as the row index,
# keeping only the undergraduate demographic columns (names containing 'UGDS_').
def usecol_func(x: str) -> bool:
    """Return True for the column names that should be loaded from the CSV.

    Written to be passed as the ``usecols`` callable of ``pd.read_csv``.

    Args:
        x: A column name.

    Returns:
        True if the name contains ``'UGDS_'`` or is exactly ``'INSTNM'``.
    """
    # A def is used instead of a name-bound lambda so the function carries a
    # meaningful __name__ in tracebacks and reprs (PEP 8 recommendation).
    return 'UGDS_' in x or x == 'INSTNM'
# Load only the columns accepted by the `usecols` callable and make the
# INSTNM column the row index (`index_col`).
college = pd.read_csv(
    'data/college.csv',
    usecols=usecol_func,
    index_col='INSTNM',
)
# Preview the first rows of the loaded frame.
college.head()
|
UGDS_WHITE |
UGDS_BLACK |
UGDS_HISP |
UGDS_ASIAN |
UGDS_AIAN |
UGDS_NHPI |
UGDS_2MOR |
UGDS_NRA |
UGDS_UNKN |
INSTNM |
|
|
|
|
|
|
|
|
|
Alabama A & M University |
0.0333 |
0.9353 |
0.0055 |
0.0019 |
0.0024 |
0.0019 |
0.0000 |
0.0059 |
0.0138 |
University of Alabama at Birmingham |
0.5922 |
0.2600 |
0.0283 |
0.0518 |
0.0022 |
0.0007 |
0.0368 |
0.0179 |
0.0100 |
Amridge University |
0.2990 |
0.4192 |
0.0069 |
0.0034 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.2715 |
University of Alabama in Huntsville |
0.6988 |
0.1255 |
0.0382 |
0.0376 |
0.0143 |
0.0002 |
0.0172 |
0.0332 |
0.0350 |
Alabama State University |
0.0158 |
0.9208 |
0.0121 |
0.0019 |
0.0010 |
0.0006 |
0.0098 |
0.0243 |
0.0137 |
# Use stack() to turn all of the horizontal column names into a vertical
# (inner) level of the row index.
college_stacked = college.stack()
college_stacked.head(18)
# In other words, stack() pushes the column-index level down into the row
# index; as the output below shows, the result is a float64 Series.
INSTNM
Alabama A & M University UGDS_WHITE 0.0333
UGDS_BLACK 0.9353
UGDS_HISP 0.0055
UGDS_ASIAN 0.0019
UGDS_AIAN 0.0024
UGDS_NHPI 0.0019
UGDS_2MOR 0.0000
UGDS_NRA 0.0059
UGDS_UNKN 0.0138
University of Alabama at Birmingham UGDS_WHITE 0.5922
UGDS_BLACK 0.2600
UGDS_HISP 0.0283
UGDS_ASIAN 0.0518
UGDS_AIAN 0.0022
UGDS_NHPI 0.0007
UGDS_2MOR 0.0368
UGDS_NRA 0.0179
UGDS_UNKN 0.0100
dtype: float64
# unstack() reverses the operation: it moves a row-index level back into the
# column index, giving the wide layout again.
college_stacked.unstack().head()
|
UGDS_WHITE |
UGDS_BLACK |
UGDS_HISP |
UGDS_ASIAN |
UGDS_AIAN |
UGDS_NHPI |
UGDS_2MOR |
UGDS_NRA |
UGDS_UNKN |
INSTNM |
|
|
|
|
|
|
|
|
|
Alabama A & M University |
0.0333 |
0.9353 |
0.0055 |
0.0019 |
0.0024 |
0.0019 |
0.0000 |
0.0059 |
0.0138 |
University of Alabama at Birmingham |
0.5922 |
0.2600 |
0.0283 |
0.0518 |
0.0022 |
0.0007 |
0.0368 |
0.0179 |
0.0100 |
Amridge University |
0.2990 |
0.4192 |
0.0069 |
0.0034 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.2715 |
University of Alabama in Huntsville |
0.6988 |
0.1255 |
0.0382 |
0.0376 |
0.0143 |
0.0002 |
0.0172 |
0.0332 |
0.0350 |
Alabama State University |
0.0158 |
0.9208 |
0.0121 |
0.0019 |
0.0010 |
0.0006 |
0.0098 |
0.0243 |
0.0137 |
# Alternative approach: melt first, then pivot. Load the data again, this
# time without choosing a row index.
# melt() "dissolves" a wide frame into a long one (many columns -> many rows);
# pivot() goes the other way, turning a long frame back into a wide one.
college2 = pd.read_csv(
    'data/college.csv',
    usecols=usecol_func,
)
college2.head()
|
INSTNM |
UGDS_WHITE |
UGDS_BLACK |
UGDS_HISP |
UGDS_ASIAN |
UGDS_AIAN |
UGDS_NHPI |
UGDS_2MOR |
UGDS_NRA |
UGDS_UNKN |
0 |
Alabama A & M University |
0.0333 |
0.9353 |
0.0055 |
0.0019 |
0.0024 |
0.0019 |
0.0000 |
0.0059 |
0.0138 |
1 |
University of Alabama at Birmingham |
0.5922 |
0.2600 |
0.0283 |
0.0518 |
0.0022 |
0.0007 |
0.0368 |
0.0179 |
0.0100 |
2 |
Amridge University |
0.2990 |
0.4192 |
0.0069 |
0.0034 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.2715 |
3 |
University of Alabama in Huntsville |
0.6988 |
0.1255 |
0.0382 |
0.0376 |
0.0143 |
0.0002 |
0.0172 |
0.0332 |
0.0350 |
4 |
Alabama State University |
0.0158 |
0.9208 |
0.0121 |
0.0019 |
0.0010 |
0.0006 |
0.0098 |
0.0243 |
0.0137 |
# Use melt() to collapse all of the race columns into one: INSTNM stays as the
# identifier, the former column names go into 'Race' and their values into
# 'Percentage'.
college_melted = college2.melt(
    id_vars='INSTNM',
    var_name='Race',
    value_name='Percentage',
)
college_melted.head()
|
INSTNM |
Race |
Percentage |
0 |
Alabama A & M University |
UGDS_WHITE |
0.0333 |
1 |
University of Alabama at Birmingham |
UGDS_WHITE |
0.5922 |
2 |
Amridge University |
UGDS_WHITE |
0.2990 |
3 |
University of Alabama in Huntsville |
UGDS_WHITE |
0.6988 |
4 |
Alabama State University |
UGDS_WHITE |
0.0158 |
# Use pivot() to undo the melt: INSTNM becomes the row index, each distinct
# 'Race' value becomes a column, and 'Percentage' fills the cells.
# Note that, per the output below, rows and columns come back in sorted order
# rather than in the original file order.
melted_inv = college_melted.pivot(
    index='INSTNM',
    columns='Race',
    values='Percentage',
)
melted_inv.head()
Race |
UGDS_2MOR |
UGDS_AIAN |
UGDS_ASIAN |
UGDS_BLACK |
UGDS_HISP |
UGDS_NHPI |
UGDS_NRA |
UGDS_UNKN |
UGDS_WHITE |
INSTNM |
|
|
|
|
|
|
|
|
|
A & W Healthcare Educators |
0.0000 |
0.0 |
0.0000 |
0.9750 |
0.0250 |
0.0 |
0.0000 |
0.0000 |
0.0000 |
A T Still University of Health Sciences |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
ABC Beauty Academy |
0.0000 |
0.0 |
0.9333 |
0.0333 |
0.0333 |
0.0 |
0.0000 |
0.0000 |
0.0000 |
ABC Beauty College Inc |
0.0000 |
0.0 |
0.0000 |
0.6579 |
0.0526 |
0.0 |
0.0000 |
0.0000 |
0.2895 |
AI Miami International University of Art and Design |
0.0018 |
0.0 |
0.0018 |
0.0198 |
0.4773 |
0.0 |
0.0025 |
0.4644 |
0.0324 |
数据转置
# Transpose by stacking and then unstacking the outermost row-index level
# (level 0, the institution name), so institutions become the columns.
# NOTE(review): the result shown has 6874 columns, fewer than the 7535 of
# college.T further down — presumably stack() dropped the missing values, so
# all-NaN institutions disappear; confirm for the pandas version in use.
college.stack().unstack(0)
INSTNM |
Alabama A & M University |
University of Alabama at Birmingham |
Amridge University |
University of Alabama in Huntsville |
Alabama State University |
The University of Alabama |
Central Alabama Community College |
Athens State University |
Auburn University at Montgomery |
Auburn University |
... |
MCI Institute of Technology-Boca Raton |
West Coast University-Miami |
National American University-Houston |
Aparicio-Levy Technical College |
Fred D. Learey Technical College |
Hollywood Institute of Beauty Careers-West Palm Beach |
Hollywood Institute of Beauty Careers-Casselberry |
Coachella Valley Beauty College-Beaumont |
Dewey University-Mayaguez |
Coastal Pines Technical College |
UGDS_WHITE |
0.0333 |
0.5922 |
0.2990 |
0.6988 |
0.0158 |
0.7825 |
0.7255 |
0.7823 |
0.5328 |
0.8507 |
... |
0.0199 |
0.1522 |
0.1858 |
0.2431 |
0.3731 |
0.2182 |
0.1200 |
0.3284 |
0.0 |
0.6762 |
UGDS_BLACK |
0.9353 |
0.2600 |
0.4192 |
0.1255 |
0.9208 |
0.1119 |
0.2613 |
0.1200 |
0.3376 |
0.0704 |
... |
0.2815 |
0.1739 |
0.6443 |
0.1215 |
0.1388 |
0.4182 |
0.3333 |
0.1045 |
0.0 |
0.2508 |
UGDS_HISP |
0.0055 |
0.0283 |
0.0069 |
0.0382 |
0.0121 |
0.0348 |
0.0044 |
0.0191 |
0.0074 |
0.0248 |
... |
0.6854 |
0.6087 |
0.0672 |
0.6243 |
0.3080 |
0.2364 |
0.4400 |
0.4925 |
1.0 |
0.0359 |
UGDS_ASIAN |
0.0019 |
0.0518 |
0.0034 |
0.0376 |
0.0019 |
0.0106 |
0.0025 |
0.0053 |
0.0221 |
0.0227 |
... |
0.0132 |
0.0217 |
0.0079 |
0.0055 |
0.0000 |
0.0182 |
0.0000 |
0.0149 |
0.0 |
0.0045 |
UGDS_AIAN |
0.0024 |
0.0022 |
0.0000 |
0.0143 |
0.0010 |
0.0038 |
0.0044 |
0.0157 |
0.0044 |
0.0074 |
... |
0.0000 |
0.0000 |
0.0079 |
0.0055 |
0.0000 |
0.0000 |
0.0000 |
0.0299 |
0.0 |
0.0034 |
UGDS_NHPI |
0.0019 |
0.0007 |
0.0000 |
0.0002 |
0.0006 |
0.0009 |
0.0000 |
0.0010 |
0.0016 |
0.0000 |
... |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.0149 |
0.0 |
0.0017 |
UGDS_2MOR |
0.0000 |
0.0368 |
0.0000 |
0.0172 |
0.0098 |
0.0261 |
0.0000 |
0.0174 |
0.0297 |
0.0000 |
... |
0.0000 |
0.0435 |
0.0751 |
0.0000 |
0.0022 |
0.0000 |
0.0400 |
0.0149 |
0.0 |
0.0191 |
UGDS_NRA |
0.0059 |
0.0179 |
0.0000 |
0.0332 |
0.0243 |
0.0268 |
0.0000 |
0.0057 |
0.0397 |
0.0100 |
... |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.0000 |
0.0182 |
0.0000 |
0.0000 |
0.0 |
0.0028 |
UGDS_UNKN |
0.0138 |
0.0100 |
0.2715 |
0.0350 |
0.0137 |
0.0026 |
0.0019 |
0.0334 |
0.0246 |
0.0140 |
... |
0.0000 |
0.0000 |
0.0119 |
0.0000 |
0.1779 |
0.0909 |
0.0667 |
0.0000 |
0.0 |
0.0056 |
9 rows × 6874 columns
# A simpler way to transpose a DataFrame is transpose() or its shorthand .T.
# This is the matrix transpose from mathematics: rows and columns swap places.
college.T
INSTNM |
Alabama A & M University |
University of Alabama at Birmingham |
Amridge University |
University of Alabama in Huntsville |
Alabama State University |
The University of Alabama |
Central Alabama Community College |
Athens State University |
Auburn University at Montgomery |
Auburn University |
... |
Strayer University-North Dallas |
Strayer University-San Antonio |
Strayer University-Stafford |
WestMed College - Merced |
Vantage College |
SAE Institute of Technology San Francisco |
Rasmussen College - Overland Park |
National Personal Training Institute of Cleveland |
Bay Area Medical Academy - San Jose Satellite Location |
Excel Learning Center-San Antonio South |
UGDS_WHITE |
0.0333 |
0.5922 |
0.2990 |
0.6988 |
0.0158 |
0.7825 |
0.7255 |
0.7823 |
0.5328 |
0.8507 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_BLACK |
0.9353 |
0.2600 |
0.4192 |
0.1255 |
0.9208 |
0.1119 |
0.2613 |
0.1200 |
0.3376 |
0.0704 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_HISP |
0.0055 |
0.0283 |
0.0069 |
0.0382 |
0.0121 |
0.0348 |
0.0044 |
0.0191 |
0.0074 |
0.0248 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_ASIAN |
0.0019 |
0.0518 |
0.0034 |
0.0376 |
0.0019 |
0.0106 |
0.0025 |
0.0053 |
0.0221 |
0.0227 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_AIAN |
0.0024 |
0.0022 |
0.0000 |
0.0143 |
0.0010 |
0.0038 |
0.0044 |
0.0157 |
0.0044 |
0.0074 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_NHPI |
0.0019 |
0.0007 |
0.0000 |
0.0002 |
0.0006 |
0.0009 |
0.0000 |
0.0010 |
0.0016 |
0.0000 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_2MOR |
0.0000 |
0.0368 |
0.0000 |
0.0172 |
0.0098 |
0.0261 |
0.0000 |
0.0174 |
0.0297 |
0.0000 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_NRA |
0.0059 |
0.0179 |
0.0000 |
0.0332 |
0.0243 |
0.0268 |
0.0000 |
0.0057 |
0.0397 |
0.0100 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
UGDS_UNKN |
0.0138 |
0.0100 |
0.2715 |
0.0350 |
0.0137 |
0.0026 |
0.0019 |
0.0334 |
0.0246 |
0.0140 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
9 rows × 7535 columns
wide_to_long同时stack多列
:https://blog.csdn.net/weixin_48135624/article/details/114156665?spm=1001.2014.3001.5501