1、pandas删除方法
#Series利用drop方法删除数据:drop返回的是删除指定标签后的新对象(副本),原对象保持不变
In [100]: ser
Out[100]:
one 0
two 1
three 2
four 3
five 4
six 5
dtype: int32
In [101]: ser3=ser.drop('three')
In [102]: ser3
Out[102]:
one 0
two 1
four 3
five 4
six 5
dtype: int32
In [103]: ser4=ser.drop(['three','one'])
In [104]: ser4
Out[104]:
two 1
four 3
five 4
six 5
dtype: int32
#DataFrame利用drop方法删除数据:drop返回的是删除指定行/列后的新对象(副本),原对象保持不变
In [9]: frame = pd.DataFrame(np.arange(16).reshape((4, 4)), index = ['one', 'two', 'three', 'four'], columns = ['zhou', 'wu', 'zheng', 'wang'])
In [10]: frame
Out[10]:
zhou wu zheng wang
one 0 1 2 3
two 4 5 6 7
three 8 9 10 11
four 12 13 14 15
In [11]: frame2=frame.drop(['one', 'two']) # 删除行,默认axis = 0(0行1列)
In [12]: frame2
Out[12]:
zhou wu zheng wang
three 8 9 10 11
four 12 13 14 15
In [13]: frame2=frame.drop(['zheng', 'wang'], axis = 1) # 删除列,需显式指定axis = 1(0行1列)
In [14]: frame3=frame.drop(['zheng', 'wang'], axis = 1) # 删除列,需显式指定axis = 1(0行1列)
In [15]: frame3
Out[15]:
zhou wu
one 0 1
two 4 5
three 8 9
four 12 13
2、 axis的理解(axis=0跨行;axis=1跨列)
定义:轴(axis)是为超过一维的数组定义的属性。二维数据拥有两个轴:第0轴沿着行的方向垂直往下,第1轴沿着列的方向水平延伸。
简单理解:第一重(最外层)中括号对应 axis = 0 的操作;第二重中括号对应 axis = 1 的操作
….依次类推,最内层的括号代表最高维度,
也即 n维数组,最内层数据操作为 axis = n-1(轴从0开始编号)
In [16]: df = pd.DataFrame([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]],columns=["col1", "col2", "col3", "col4"])
In [17]: df
Out[17]:
col1 col2 col3 col4
0 1 1 1 1
1 2 2 2 2
2 3 3 3 3
In [18]: df.mean(axis=1)
Out[18]:
0 1.0
1 2.0
2 3.0
dtype: float64
In [19]: df2=df.drop("col4", axis=1)
In [20]: df2
Out[20]:
col1 col2 col3
0 1 1 1
1 2 2 2
2 3 3 3
3、等级索引
#单条轴可以有多级索引
In [23]: ser = pd.Series(np.random.random(5), index = [['one', 'two', 'three', 'four', 'five'], ['zhou', 'wu', 'zheng', 'wang', 'yang']])
In [24]: ser
Out[24]:
one zhou 0.236652
two wu 0.086834
three zheng 0.368482
four wang 0.004649
five yang 0.510545
dtype: float64
In [25]: ser['one']
Out[25]:
zhou 0.236652
dtype: float64
In [26]: ser[:,'zhou']
Out[26]:
one 0.236652
dtype: float64
In [27]: ser['one','zhou']
Out[27]: 0.23665181542809877
"""
# 把使用等级索引的Series对象转换为一个简单的DataFrame对象,把第二级索引转换为相应的columns
In [28]: ser.unstack()
Out[28]:
wang wu yang zheng zhou
five NaN NaN 0.510545 NaN NaN
four 0.004649 NaN NaN NaN NaN
one NaN NaN NaN NaN 0.236652
three NaN NaN NaN 0.368482 NaN
two NaN 0.086834 NaN NaN NaN
## 逆操作,把DataFrame对象转换为Series对象 ----只能用于操作dataframe
In [29]: frame
Out[29]:
zhou wu zheng wang
one 0 1 2 3
two 4 5 6 7
three 8 9 10 11
four 12 13 14 15
In [30]: frame.stack()
Out[30]:
one zhou 0
wu 1
zheng 2
wang 3
two zhou 4
wu 5
zheng 6
wang 7
three zhou 8
wu 9
zheng 10
wang 11
four zhou 12
wu 13
zheng 14
wang 15
dtype: int32
# 对DataFrame对象,可以为它的行和列都定义等级索引
In [33]: mframe = pd.DataFrame(np.random.randn(16).reshape(4, 4), index = [['one', 'two', 'three', 'four'], ['zhou', 'wu', 'zheng', 'wang']], columns = [['biji', 'pen', 'paper', 'notebook'], [1, 2, 1, 2]])
In [34]: mframe
Out[34]:
biji pen paper notebook
1 2 1 2
one zhou 0.192029 0.375193 -1.713453 0.667226
two wu 1.053937 -0.124462 -0.285356 0.786514
three zheng -0.506220 1.710322 0.490901 0.014406
four wang -0.241337 -0.180017 -1.503502 -2.420159
#转换为series
In [35]: mframe.stack().stack()
Out[35]:
one zhou 1 biji 0.192029
paper -1.713453
2 notebook 0.667226
pen 0.375193
two wu 1 biji 1.053937
paper -0.285356
2 notebook 0.786514
pen -0.124462
three zheng 1 biji -0.506220
paper 0.490901
2 notebook 0.014406
pen 1.710322
four wang 1 biji -0.241337
paper -1.503502
2 notebook -2.420159
pen -0.180017
dtype: float64
#为DataFrame的层级索引命名:通过mframe.columns.names为列索引的各层级命名(行索引的各层级对应mframe.index.names)
In [36]: mframe.columns.names = ['object', 'id']
In [37]: mframe
Out[37]:
object biji pen paper notebook
id 1 2 1 2
one zhou 0.192029 0.375193 -1.713453 0.667226
two wu 1.053937 -0.124462 -0.285356 0.786514
three zheng -0.506220 1.710322 0.490901 0.014406
four wang -0.241337 -0.180017 -1.503502 -2.420159
# 更改单个列名称或者多个
In [38]: mframe.rename(columns = {'pen':'pen_new'})
Out[38]:
object biji pen_new paper notebook
id 1 2 1 2
one zhou 0.192029 0.375193 -1.713453 0.667226
two wu 1.053937 -0.124462 -0.285356 0.786514
three zheng -0.506220 1.710322 0.490901 0.014406
four wang -0.241337 -0.180017 -1.503502 -2.420159
# 更改单个行名称或者多个
In [41]: mframe.rename(index = {'two':'hello'})
Out[41]:
object biji pen paper notebook
id 1 2 1 2
one zhou 0.192029 0.375193 -1.713453 0.667226
hello wu 1.053937 -0.124462 -0.285356 0.786514
three zheng -0.506220 1.710322 0.490901 0.014406
four wang -0.241337 -0.180017 -1.503502 -2.420159
# swaplevel()以要互换位置的两个层级的名称为参数,返回交换位置后的一个新对象,其中各元素的顺序保持不变
In [42]: mframe.swaplevel(axis=0) # 0行1列
Out[42]:
object biji pen paper notebook
id 1 2 1 2
zhou one 0.192029 0.375193 -1.713453 0.667226
wu two 1.053937 -0.124462 -0.285356 0.786514
zheng three -0.506220 1.710322 0.490901 0.014406
wang four -0.241337 -0.180017 -1.503502 -2.420159
"""
#层级:索引排序
In [43]: mframe.sortlevel()
__main__:1: FutureWarning: sortlevel is deprecated, use sort_index(level= ...)
Out[43]:
object biji pen paper notebook
id 1 2 1 2
four wang -0.241337 -0.180017 -1.503502 -2.420159
one zhou 0.192029 0.375193 -1.713453 0.667226
three zheng -0.506220 1.710322 0.490901 0.014406
two wu 1.053937 -0.124462 -0.285356 0.786514
In [44]: mframe.sortlevel(level=0, axis=0, ascending=True) #影响行的第0层级的索引,升序
__main__:1: FutureWarning: sortlevel is deprecated, use sort_index(level= ...)
Out[44]:
object biji pen paper notebook
id 1 2 1 2
four wang -0.241337 -0.180017 -1.503502 -2.420159
one zhou 0.192029 0.375193 -1.713453 0.667226
three zheng -0.506220 1.710322 0.490901 0.014406
two wu 1.053937 -0.124462 -0.285356 0.786514
In [45]: mframe.sortlevel(level=0, axis=1, ascending=False)
__main__:1: FutureWarning: sortlevel is deprecated, use sort_index(level= ...)
Out[45]:
object pen paper notebook biji
id 2 1 2 1
one zhou 0.375193 -1.713453 0.667226 0.192029
two wu -0.124462 -0.285356 0.786514 1.053937
three zheng 1.710322 0.490901 0.014406 -0.506220
four wang -0.180017 -1.503502 -2.420159 -0.241337
4、排序和排位次
In [52]: ser = pd.Series([5, 0, 3, 8, 4], index = ['red', 'blue', 'yellow', 'white', 'green'])
In [53]: ser
Out[53]:
red 5
blue 0
yellow 3
white 8
green 4
dtype: int64
In [54]: ser.sort_index()
Out[54]:
blue 0
green 4
red 5
white 8
yellow 3
dtype: int64
"""
#按索引的ascii降序排列
In [55]: ser.sort_index(ascending=False)
Out[55]:
yellow 3
white 8
red 5
green 4
blue 0
dtype: int64
In [56]: frame = pd.DataFrame(np.arange(16).reshape((4, 4)), index = ['red', 'blue',
...: 'yellow', 'white'], columns = ['ball', 'pen', 'pencil', 'paper'])
In [57]: frame.sort_index()
Out[57]:
ball pen pencil paper
blue 4 5 6 7
red 0 1 2 3
white 12 13 14 15
yellow 8 9 10 11
In [58]: frame.sort_index(axis=1)
Out[58]:
ball paper pen pencil
red 0 3 1 2
blue 4 7 5 6
yellow 8 11 9 10
white 12 15 13 14
In [60]: frame.sort_index(by = 'pen')
__main__:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
Out[60]:
ball pen pencil paper
red 0 1 2 3
blue 4 5 6 7
yellow 8 9 10 11
white 12 13 14 15
"""
#排位次操作
ser.rank(method='average')  # method可选值:'average'/'first'/'min'/'dense'/'max'
5、数据结构之间的运算
In [64]: frame
Out[64]:
ball pen pencil paper
red 0 1 2 3
blue 4 5 6 7
yellow 8 9 10 11
white 12 13 14 15
In [65]: ser = pd.Series(np.arange(4), index = ['ball', 'pen', 'pencil', 'paper'])
In [66]: ser
Out[66]:
ball 0
pen 1
pencil 2
paper 3
dtype: int32
In [67]: frame-ser
Out[67]:
ball pen pencil paper
red 0 0 0 0
blue 4 4 4 4
yellow 8 8 8 8
white 12 12 12 12
6、函数应用和映射
"""
#numpy可以对dataframe进行数学类的操作
In [69]: frame
Out[69]:
ball pen pencil paper
red 0 1 2 3
blue 4 5 6 7
yellow 8 9 10 11
white 12 13 14 15
#开方
In [70]: np.sqrt(frame)
Out[70]:
ball pen pencil paper
red 0.000000 1.000000 1.414214 1.732051
blue 2.000000 2.236068 2.449490 2.645751
yellow 2.828427 3.000000 3.162278 3.316625
white 3.464102 3.605551 3.741657 3.872983
#求绝对值
In [71]: np.abs(frame)
Out[71]:
ball pen pencil paper
red 0 1 2 3
blue 4 5 6 7
yellow 8 9 10 11
white 12 13 14 15
"""
7、apply()函数
用apply()函数可以在DataFrame对象上调用自定义的函数:默认axis=0,对每一列应用该函数;axis=1时对每一行应用。
In [74]: frame
Out[74]:
ball pen pencil paper
red 0 1 2 3
blue 4 5 6 7
yellow 8 9 10 11
white 12 13 14 15
In [75]: f = lambda x:x.max() - x.min()
In [76]: frame.apply(f)
Out[76]:
ball 12
pen 12
pencil 12
paper 12
dtype: int64
In [77]: frame.apply(f,axis=1)
Out[77]:
red 3
blue 3
yellow 3
white 3
dtype: int64
In [78]: frame.apply(lambda x:x.max() - x.min(),axis=1)
Out[78]:
red 3
blue 3
yellow 3
white 3
dtype: int64
8、统计学之相关性和协方差
describe()函数可计算多个统计量。针对单独某一列也是可以操作的
In [79]: frame
Out[79]:
ball pen pencil paper
red 0 1 2 3
blue 4 5 6 7
yellow 8 9 10 11
white 12 13 14 15
In [80]: frame.describe()
Out[80]:
ball pen pencil paper
count 4.000000 4.000000 4.000000 4.000000
mean 6.000000 7.000000 8.000000 9.000000
std 5.163978 5.163978 5.163978 5.163978
min 0.000000 1.000000 2.000000 3.000000
25% 3.000000 4.000000 5.000000 6.000000
50% 6.000000 7.000000 8.000000 9.000000
75% 9.000000 10.000000 11.000000 12.000000
max 12.000000 13.000000 14.000000 15.000000
#四分位数 df.quantile(axis=0)
#相关系数 范围 [-1, 1]:越接近-1,负相关越强;越接近0,线性相关越弱;越接近1,正相关越强。
#两个series之间的相关性
In [81]: seq2 = pd.Series([3, 4, 3, 4, 5, 4, 3, 2], ['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'])
In [82]: seq = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], ['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'])
In [83]: seq.corr(seq2)
Out[83]: -0.25197631533948484
#计算单个DataFrame的相关性
In [86]: frame2 = pd.DataFrame([[1, 4, 3, 6], [4, 5, 6, 1], [3, 3, 1, 5], [4, 1, 6, 4]], index = ['red', 'blue', 'yellow', 'white'], columns
...: = ['ball', 'pen', 'pencil', 'paper'])
In [87]: frame2
Out[87]:
ball pen pencil paper
red 1 4 3 6
blue 4 5 6 1
yellow 3 3 1 5
white 4 1 6 4
In [88]: frame2.corr()
Out[88]:
ball pen pencil paper
ball 1.000000 -0.276026 0.577350 -0.763763
pen -0.276026 1.000000 -0.079682 -0.361403
pencil 0.577350 -0.079682 1.000000 -0.692935
paper -0.763763 -0.361403 -0.692935 1.000000
#dataframe和series相关性计算,用corrwith(默认按行索引对齐;此处ser的索引为列名,与frame2的行索引无法对齐,因此结果全为NaN)
In [89]: frame2.corrwith(ser)
Out[89]:
ball NaN
pen NaN
pencil NaN
paper NaN
dtype: float64
#两个dataframe之间的相关性计算,用corrwith
In [92]: frame.corrwith(frame2)
Out[92]:
ball 0.730297
pen -0.831522
pencil 0.210819
paper -0.119523
dtype: float64
#协方差
#范围(-无穷, +无穷):=0,线性无关;>0,正相关;<0,负相关
# 协方差的绝对值大小受变量量纲影响,不能直接表示相关性强弱;比较相关性强弱应使用相关系数
In [93]: seq.cov(seq2)
Out[93]: -0.5714285714285714
In [94]: frame2.corr()
Out[94]:
ball pen pencil paper
ball 1.000000 -0.276026 0.577350 -0.763763
pen -0.276026 1.000000 -0.079682 -0.361403
pencil 0.577350 -0.079682 1.000000 -0.692935
paper -0.763763 -0.361403 -0.692935 1.000000
9、过滤NaN
"""
In [95]: ser = pd.Series([0, 1, None, np.NaN, 9], index = ['red', 'blue', 'yellow', 'white', 'green'])
In [96]: ser
Out[96]:
red 0.0
blue 1.0
yellow NaN
white NaN
green 9.0
dtype: float64
#drop去掉某指定的行或者列
In [97]: ser.drop('white')
Out[97]:
red 0.0
blue 1.0
yellow NaN
green 9.0
dtype: float64
#dropna去掉全部的空值(NaN)
In [98]: ser.dropna()
Out[98]:
red 0.0
blue 1.0
green 9.0
dtype: float64
In [101]: ser[ser.notnull()]
Out[101]:
red 0.0
blue 1.0
green 9.0
dtype: float64
In [102]: frame3 = pd.DataFrame([[6, np.nan, 6], [np.nan, np.nan, np.nan], [2, np.nan, 5]], index
...: = ['blue', 'green', 'red'], columns = ['ball', 'mug', 'pen'])
In [103]: frame3
Out[103]:
ball mug pen
blue 6.0 NaN 6.0
green NaN NaN NaN
red 2.0 NaN 5.0
In [104]: frame3.dropna()
Out[104]:
Empty DataFrame
Columns: [ball, mug, pen]
Index: []
In [105]: frame3.dropna(axis = 1)
Out[105]:
Empty DataFrame
Columns: []
Index: [blue, green, red]
"""
# 只删除所有元素均为NaN的行或列。
In [108]: frame3.dropna(how = 'all')
Out[108]:
ball mug pen
blue 6.0 NaN 6.0
red 2.0 NaN 5.0
"""