0. 准备
import pandas as pd
df = pd.read_excel('https://www.gairuo.com/file/data/dataset/team.xlsx').head()
df
    name team  Q1  Q2  Q3  Q4
0  Liver    E  89  21  24  64
1   Arry    C  36  37  37  57
2    Ack    A  57  60  18  84
3  Eorge    C  93  96  71  78
4    Oah    D  65  49  61  86
1. 逻辑运算
df['Q1'] > 30
0 True
1 True
2 True
3 True
4 True
Name: Q1, dtype: bool
df.index <= 5
array([ True, True, True, True, True])
df.loc[:, 'Q1':'Q4'] > 60
      Q1     Q2     Q3     Q4
0   True  False  False   True
1  False  False  False  False
2  False  False  False   True
3   True   True   True   True
4   True  False   True   True
(df['Q1'] > 60) & (df['Q2'] > 60) & (df['Q3'] > 60) & (df['Q4'] > 60)
0 False
1 False
2 False
3 True
4 False
dtype: bool
(df.loc[:, 'Q1':'Q4'] > 60).all(axis=1)
0 False
1 False
2 False
3 True
4 False
dtype: bool
~((df.loc[:, 'Q1':'Q4'] > 60).all(axis=1))
0 True
1 True
2 True
3 False
4 True
dtype: bool
df['name'].str.contains('A')
0 False
1 True
2 True
3 False
4 False
Name: name, dtype: bool
df['name'].isin(['Arry','Ack'])
0 False
1 True
2 True
3 False
4 False
Name: name, dtype: bool
2. 基于上述逻辑筛选数据
df[df['name'].str.contains('ge')]
    name team  Q1  Q2  Q3  Q4
3  Eorge    C  93  96  71  78
df[(df.loc[:, 'Q1':'Q4'] > 60).all(axis=1)]
    name team  Q1  Q2  Q3  Q4
3  Eorge    C  93  96  71  78
df.iloc[1+1]
name Ack
team A
Q1 57
Q2 60
Q3 18
Q4 84
Name: 2, dtype: object
df.iloc[1+1:3]
  name team  Q1  Q2  Q3  Q4
2  Ack    A  57  60  18  84
df[df.loc[:, 'Q1':'Q4'] > 60]
  name team    Q1    Q2    Q3    Q4
0  NaN  NaN  89.0   NaN   NaN  64.0
1  NaN  NaN   NaN   NaN   NaN   NaN
2  NaN  NaN   NaN   NaN   NaN  84.0
3  NaN  NaN  93.0  96.0  71.0  78.0
4  NaN  NaN  65.0   NaN  61.0  86.0
3. 函数筛选
df.loc[lambda x: x.index == max(df.index), 'Q1':'Q4']
   Q1  Q2  Q3  Q4
4  65  49  61  86
df.loc[:, lambda x: x.columns.str.contains('Q')]
   Q1  Q2  Q3  Q4
0  89  21  24  64
1  36  37  37  57
2  57  60  18  84
3  93  96  71  78
4  65  49  61  86
5. 数据查询 df.query()
df.query('name!="Eorge" and Q1>60')
    name team  Q1  Q2  Q3  Q4
0  Liver    E  89  21  24  64
4    Oah    D  65  49  61  86
a = df.Q1.mean()
df.query('Q1>@a')
    name team  Q1  Q2  Q3  Q4
0  Liver    E  89  21  24  64
3  Eorge    C  93  96  71  78
df.eval('Q1>Q2')
0 True
1 False
2 False
3 False
4 True
dtype: bool
6. 筛选 df.filter()，支持正则表达式、模糊匹配
df.filter(items=['name','Q4'])
    name  Q4
0  Liver  64
1   Arry  57
2    Ack  84
3  Eorge  78
4    Oah  86
df.filter(regex='Q', axis=1)
   Q1  Q2  Q3  Q4
0  89  21  24  64
1  36  37  37  57
2  57  60  18  84
3  93  96  71  78
4  65  49  61  86
df.filter(regex='2', axis=0)
  name team  Q1  Q2  Q3  Q4
2  Ack    A  57  60  18  84
df.set_index('name').filter(regex='ge', axis=0)
      team  Q1  Q2  Q3  Q4
name
Eorge    C  93  96  71  78
7. 按数据类型筛选
df.dtypes
name object
team object
Q1 int64
Q2 int64
Q3 int64
Q4 int64
dtype: object
df.select_dtypes(include='int64')
   Q1  Q2  Q3  Q4
0  89  21  24  64
1  36  37  37  57
2  57  60  18  84
3  93  96  71  78
4  65  49  61  86
df.select_dtypes(exclude='int64')
    name team
0  Liver    E
1   Arry    C
2    Ack    A
3  Eorge    C
4    Oah    D