05数据转换学习笔记

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/fantacy10000/article/details/82056375
#一.合并数据集
#1.数据库风格的DataFrame合并(merge和join)
import pandas as pd
import numpy as np
# Two small frames sharing a 'key' column: tr1 has repeated keys, tr2 has one
# row per key, and each side has a key the other lacks ('c' vs 'd').
tr1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})
tr2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})
for frame in (tr1, tr2):
    print(frame)
# By default merge performs an 'inner' join, so the result keys are the
# intersection of both sides; 'outer' keeps the union of the keys instead.
# 'left' and 'right' joins are also available.
# For the full list of merge parameters see P191.
tr3 = tr1.merge(tr2, on='key', how='outer')
print(tr3)
# Output:
#    data1 key
# 0      0   b
# 1      1   b
# 2      2   a
# 3      3   c
# 4      4   a
# 5      5   a
# 6      6   b
#    data2 key
# 0      0   a
# 1      1   b
# 2      2   d
#    data1 key  data2
# 0    0.0   b    1.0
# 1    1.0   b    1.0
# 2    6.0   b    1.0
# 3    2.0   a    0.0
# 4    4.0   a    0.0
# 5    5.0   a    0.0
# 6    3.0   c    NaN
# 7    NaN   d    2.0
# 2. Merging on an index
# left1 carries the join key as a column; right1 carries it as its index.
left1 = pd.DataFrame({'key': list('abc'), 'value': range(3)})
right1 = pd.DataFrame({'group_val': [3, 7]}, index=list('ab'))
print(left1)
print(right1)
# Match the 'key' column on the left against the index on the right.
tr3 = left1.merge(right1, left_on='key', right_index=True)
print(tr3)
# Output:
#   key  value
# 0   a      0
# 1   b      1
# 2   c      2
#    group_val
# a          3
# b          7
#   key  value  group_val
# 0   a      0          3
# 1   b      1          7
# 3. Concatenating along an axis
# 3.1 numpy's own function for joining raw arrays (concatenate)
arr = np.arange(12).reshape(3, 4)
print(arr)
pair = [arr, arr]
print(np.concatenate(pair, axis=1))  # side by side: 3 rows x 8 columns
print(np.concatenate(pair, axis=0))  # stacked: 6 rows x 4 columns
# 3.2 pandas' concat function
arr1 = pd.Series([1, 2, 3, 4], index=list('abcd'))
arr2 = pd.Series([1, 4], index=list('bc'))
arr3 = pd.Series([3, 5, 6], index=list('def'))
# With axis=1 each Series becomes a column; labels missing from a Series
# show up as NaN in that column.
arr4 = pd.concat([arr1, arr2, arr3], axis=1)
print(arr4)
# For the parameters of concat see P198.
# Output:
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
# [[ 0  1  2  3  0  1  2  3]
#  [ 4  5  6  7  4  5  6  7]
#  [ 8  9 10 11  8  9 10 11]]
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]
#  [ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
#      0    1    2
# a  1.0  NaN  NaN
# b  2.0  1.0  NaN
# c  3.0  4.0  NaN
# d  4.0  NaN  3.0
# e  NaN  NaN  5.0
# f  NaN  NaN  6.0
# 4. Combining overlapping data
# 4.1 numpy's where function (the vectorized equivalent of if/else)
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=list('fedcba'))
b = pd.Series(np.arange(len(a), dtype=np.float64), index=a.index)
# Take the value from b wherever a is missing, otherwise keep a's value.
c = np.where(a.isnull(), b, a)
# 4.2 Series.combine_first: fill the gaps of one Series from another,
# aligning on the index labels.
b[:-2].combine_first(a[2:])
# Output:
# a    NaN
# b    4.5
# c    3.0
# d    2.0
# e    1.0
# f    0.0
# dtype: float64
#二.重塑和轴向旋转
#待补
# III. Data transformation
# 1. Removing duplicate rows
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
# duplicated() returns one boolean per row saying whether that row repeats
# an earlier one.
print(data.duplicated())
# drop_duplicates() returns the frame with those repeated rows removed.
print(data.drop_duplicates())
# Output:
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# 6     True
# dtype: bool
#     k1  k2
# 0  one   1
# 2  one   2
# 3  two   3
# 5  two   4
# 2. Transforming data with a function or a mapping
# 3. Replacing values (replace)
data = pd.Series([1, 2, 3, 4, -999, -1000, -999, 1])
# Replace a single sentinel value with NaN.
print(data.replace(to_replace=-999, value=np.nan))
# Replace several values at once, each with its own substitute.
print(data.replace(to_replace=[-999, -1000], value=[np.nan, 1]))
# Output:
# 0       1.0
# 1       2.0
# 2       3.0
# 3       4.0
# 4       NaN
# 5   -1000.0
# 6       NaN
# 7       1.0
# dtype: float64
# 0    1.0
# 1    2.0
# 2    3.0
# 3    4.0
# 4    NaN
# 5    1.0
# 6    NaN
# 7    1.0
# dtype: float64
# 4. Renaming axis indexes
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
# 4.1 rename returns a transformed copy and leaves the original untouched
print(data.rename(index=str.title, columns=str.upper))
# Mapping keys must match the existing labels exactly. The index labels are
# still lowercase here (the call above only returned a copy), so the key is
# 'ohio'; a key such as 'Ohio' matches nothing and is silently ignored.
renamed = data.rename(index={'ohio': 'invida'}, columns={'four': 'hello'})
print(renamed)
# 4.2 rename with inplace=True modifies the existing frame instead
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
# With inplace=True rename returns None; binding it to _ discards that value.
_ = data.rename(index={'ohio': 'invida'}, inplace=True)
print(data)
# Output:
#           one  two  three  four
# ohio        0    1      2     3
# colorado    4    5      6     7
# new york    8    9     10    11
#           ONE  TWO  THREE  FOUR
# Ohio        0    1      2     3
# Colorado    4    5      6     7
# New York    8    9     10    11
#           one  two  three  hello
# invida      0    1      2      3
# colorado    4    5      6      7
# new york    8    9     10     11
#           one  two  three  four
# invida      0    1      2     3
# colorado    4    5      6     7
# new york    8    9     10    11
# 5. Discretization and binning
# 6. Detecting and filtering outliers
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
col = data[3]
col[np.abs(col) > 3]  # values in column 3 whose absolute value exceeds 3
# Select every row that holds at least one value above 3 or below -3.
# axis is passed by keyword: it is keyword-only in pandas 2.x, where the
# positional form any(1) raises TypeError.
outliers = data[(np.abs(data) > 3).any(axis=1)]
outliers
0 1 2 3
18 0.609418 3.324428 -0.108789 0.171618
99 -1.066103 -0.353316 -0.365273 3.555021
329 -1.657900 -1.991298 0.901694 3.395590
438 -0.473641 1.164948 0.604448 3.638542
500 0.572468 -0.243033 0.293410 -3.010369
513 3.265122 -1.466344 0.467078 -0.578180
614 3.213056 0.014980 0.214329 -0.591175
764 1.928544 -0.904684 0.339387 -3.383201
858 3.284090 0.404298 0.687844 0.414698
# 7. Permutation and random sampling
# numpy.random.permutation is used to reorder a Series or the rows of a
# DataFrame: calling it with the length of the axis to be permuted yields an
# integer array describing the new ordering.
df = pd.DataFrame(np.arange(20).reshape((5, 4)))
sampler = np.random.permutation(len(df))
print(sampler)
# permutation produces an ordering for the given length; take then
# rearranges the rows according to that ordering.
df.take(sampler)
[2 3 0 4 1]
0 1 2 3
2 8 9 10 11
3 12 13 14 15
0 0 1 2 3
4 16 17 18 19
1 4 5 6 7
#四.字符串操作
#1.python自有的字符串处理方法:split,strip,+,join,in,index,find,count,replace

#2.正则表达式

#3.pandas中矢量化的字符串函数

猜你喜欢

转载自blog.csdn.net/fantacy10000/article/details/82056375