数据规整化:合并
- 进行多个键的合并
# Two frames that share the composite key (key1, key2).
left = DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
right = DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4, 5, 6, 7],
})
# Passing a list to `on` merges on several keys at once; the outer join
# keeps key pairs that appear in only one of the two frames.
pd.merge(left, right, on=['key1', 'key2'], how='outer')
- 对重复列名的处理
# Merging on key1 only leaves key2 present in both inputs; pandas
# disambiguates the overlapping column with its default suffixes.
pd.merge(left, right, on='key1')
# Same merge, but with explicit suffixes for the overlapping column.
pd.merge(
    left,
    right,
    on='key1',
    suffixes=('_left', '_right'),
)
- 索引上的合并
# The frame is named left1 because every merge below refers to left1.
left1 = DataFrame({
    'key': ['a', 'b', 'a', 'a', 'b', 'c'],
    'value': range(6),
})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
# Match left1's 'key' column against right1's index (inner join by default).
pd.merge(left1, right1, left_on='key', right_index=True)
# The outer join also keeps the 'c' row, which has no match in right1.
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
# right3 has an entry for every key, so the inner join keeps all rows.
right3 = DataFrame({'group_val': [3.5, 7, 8]}, index=['a', 'b', 'c'])
pd.merge(left1, right3, left_on='key', right_index=True)
- 对于层次化索引的数据:
lefth = DataFrame({
    'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5.),
})
# righth is indexed by a two-level (state, year) index.
righth = DataFrame(
    np.arange(12).reshape((6, 2)),
    index=[
        ['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
        [2001, 2000, 2000, 2000, 2001, 2002],
    ],
    columns=['event1', 'event2'],
)
# With a hierarchical index on the right, left_on lists one column per
# index level.
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
错误示例:
# Intentionally wrong: righth has a two-level index, but only one left_on
# column is given, so pandas rejects this merge.
pd.merge(lefth, righth, left_on='key1', right_index=True)
最后,我们尝试一下同时合并双方的索引:
left2 = DataFrame(
    [[1., 2.], [3., 4.], [5., 6.]],
    index=['a', 'c', 'e'],
    columns=['Ohio', 'Nevada'],
)
right2 = DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [13, 14]],
    index=['b', 'c', 'd', 'e'],
    columns=['Missourri', 'Alabama'],
)
# Use the index of both frames as the merge key.
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
DataFrame还有一个join实例方法,可以更方便地实现按索引合并:
# Index-on-index outer join through the DataFrame.join instance method.
left2.join(right2, how='outer')
当然,join还支持参数DataFrame的索引跟调用者DataFrame的某个列之间的连接:
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1 = DataFrame({
    'key': ['a', 'b', 'a', 'a', 'b', 'c'],
    'value': range(6),
})
# Join right1's index against the caller's 'key' column.
left1.join(right1, on='key', how='inner')
同时,还可以向join传入一组DataFrame,进行多组连接:
left2 = DataFrame(
    [[1., 2.], [3., 4.], [5., 6.]],
    index=['a', 'c', 'e'],
    columns=['Ohio', 'Nevada'],
)
right2 = DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [13, 14]],
    index=['b', 'c', 'd', 'e'],
    columns=['Missourri', 'Alabama'],
)
another = DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
    index=['a', 'c', 'e', 'f'],
    columns=['New', 'Oregon'],
)
# join accepts a list of frames and joins each of them on the index.
left2.join([right2, another])