12-Groupby扩展操作--数据分析

 
 

groupby扩展操作

In [3]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B':['one','one','two','three','two','two','one','three'],
                  'C':np.random.randn(8),
                  'D':np.random.randn(8)})
df
Out[3]:
  A B C D
0 foo one -0.363353 0.968036
1 bar one -0.713551 -0.540636
2 foo two -0.002899 -2.430552
3 bar three 0.452300 -0.280095
4 foo two -1.083832 1.556935
5 bar two -0.266669 1.286793
6 foo one 1.110641 -0.023415
7 foo three 1.337744 0.463671
In [5]:
grouped = df.groupby('A')
grouped
Out[5]:
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000000000530CEF0>
In [7]:
 
           
grouped.count()
Out[7]:
  B C D
A      
bar 3 3 3
foo 5 5 5
In [8]:
 
           
grouped = df.groupby(['A','B','C'])
grouped.count()
Out[8]:
      D
A B C  
bar one -0.713551 1
three 0.452300 1
two -0.266669 1
foo one -0.363353 1
1.110641 1
three 1.337744 1
two -1.083832 1
-0.002899 1
In [9]:
def get_letter_type(letter):
    if letter.lower() in 'aeiou':
        return 'a'
    else:
        return 'b'
grouped = df.groupby(get_letter_type,axis = 1)
grouped.count().iloc[0]
Out[9]:
a    1
b    3
Name: 0, dtype: int64
In [10]:
s = pd.Series([1,2,3,1,2,3],[8,7,5,8,7,5])
s
Out[10]:
8    1
7    2
5    3
8    1
7    2
5    3
dtype: int64

level=0 分组级别,其中0代表按照第一列分组。也可以直接写列的名字

In [13]:
grouped = s.groupby(level=0,sort=False)
In [14]:
grouped.first()
Out[14]:
8    1
7    2
5    3
dtype: int64
In [15]:
 
           
grouped.last()
Out[15]:
8    1
7    2
5    3
dtype: int64
In [16]:
 
           
grouped.sum()
Out[16]:
8    2
7    4
5    6
dtype: int64
In [17]:
 
           
df2 = pd.DataFrame({'X':['A','B','A','B'],
                   'Y':[1,2,3,4]})
df2
Out[17]:
  X Y
0 A 1
1 B 2
2 A 3
3 B 4
In [19]:
 
           
df2.groupby(['X']).get_group('A')
Out[19]:
  X Y
0 A 1
2 A 3
In [20]:
 
           
df2.groupby(['X']).get_group('B')
Out[20]:
  X Y
1 B 2
3 B 4
In [21]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
In [23]:
index = pd.MultiIndex.from_arrays(arrays,names=['first','second'])
In [24]:
s = pd.Series(np.random.randn(8),index=index)
s
Out[24]:
first  second
bar    one       1.032756
       two       1.155843
baz    one      -0.862770
       two       0.412540
foo    one       1.961790
       two       0.383244
qux    one       1.095939
       two       0.871055
dtype: float64
 
          
### groupby(level=0)  level属性分组级别,对应的值可以是数字,0代表第一列。或者直接写列的名字
In [25]:
grouped = s.groupby(level = 0)
grouped.sum()
Out[25]:
first
bar    2.188598
baz   -0.450230
foo    2.345034
qux    1.966994
dtype: float64
In [26]:
 
           
grouped = s.groupby(level = 1)
grouped.sum()
Out[26]:
second
one    3.227715
two    2.822682
dtype: float64
In [27]:
 
           
grouped = s.groupby(level = 'first')
grouped.sum()
Out[27]:
first
bar    2.188598
baz   -0.450230
foo    2.345034
qux    1.966994
dtype: float64
In [28]:
df
Out[28]:
  A B C D
0 foo one -0.363353 0.968036
1 bar one -0.713551 -0.540636
2 foo two -0.002899 -2.430552
3 bar three 0.452300 -0.280095
4 foo two -1.083832 1.556935
5 bar two -0.266669 1.286793
6 foo one 1.110641 -0.023415
7 foo three 1.337744 0.463671
In [29]:
 
           
grouped = df.groupby(['A','B'])
grouped.aggregate(np.sum)
Out[29]:
    C D
A B    
bar one -0.713551 -0.540636
three 0.452300 -0.280095
two -0.266669 1.286793
foo one 0.747288 0.944621
three 1.337744 0.463671
two -1.086731 -0.873618
In [30]:
grouped = df.groupby(['A','B'],as_index=False)
grouped.aggregate(np.sum)
Out[30]:
  A B C D
0 bar one -0.713551 -0.540636
1 bar three 0.452300 -0.280095
2 bar two -0.266669 1.286793
3 foo one 0.747288 0.944621
4 foo three 1.337744 0.463671
5 foo two -1.086731 -0.873618

分组求和后,进行重新构建索引

In [33]:
grouped = df.groupby(['A','B']).sum().reset_index()
grouped
Out[33]:
  A B C D
0 bar one -0.713551 -0.540636
1 bar three 0.452300 -0.280095
2 bar two -0.266669 1.286793
3 foo one 0.747288 0.944621
4 foo three 1.337744 0.463671
5 foo two -1.086731 -0.873618
In [34]:
 
           
grouped = df.groupby(['A','B'])
grouped.size()
Out[34]:
A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

describe() 数据统计

In [35]:
 
           
grouped.describe().head()
Out[35]:
    C D
    count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
A B                                
bar one 1.0 -0.713551 NaN -0.713551 -0.713551 -0.713551 -0.713551 -0.713551 1.0 -0.540636 NaN -0.540636 -0.540636 -0.540636 -0.540636 -0.540636
three 1.0 0.452300 NaN 0.452300 0.452300 0.452300 0.452300 0.452300 1.0 -0.280095 NaN -0.280095 -0.280095 -0.280095 -0.280095 -0.280095
two 1.0 -0.266669 NaN -0.266669 -0.266669 -0.266669 -0.266669 -0.266669 1.0 1.286793 NaN 1.286793 1.286793 1.286793 1.286793 1.286793
foo one 2.0 0.373644 1.042271 -0.363353 0.005145 0.373644 0.742142 1.110641 2.0 0.472310 0.701061 -0.023415 0.224448 0.472310 0.720173 0.968036
three 1.0 1.337744 NaN 1.337744 1.337744 1.337744 1.337744 1.337744 1.0 0.463671 NaN 0.463671 0.463671 0.463671 0.463671 0.463671

agg([np.sum,np.mean,np.std]) 自定义统计参数

In [38]:
 
           
grouped = df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])
Out[38]:
  sum mean std
A      
bar -0.527920 -0.175973 0.588194
foo 0.998301 0.199660 1.016180
In [39]:
grouped['C'].agg({'求和':np.sum,'平均':np.mean,'方差':np.std})
d:\pythons\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.
Out[39]:
  求和 平均 方差
A      
bar -0.527920 -0.175973 0.588194
foo 0.998301 0.199660 1.016180
In [ ]:

猜你喜欢

转载自blog.csdn.net/m0_38039437/article/details/80764534