1. ndarray 对象内幕
import numpy as np
np.ones((3, 4, 5), dtype=np.float64).strides
(160, 40, 8)
strides 反映了在各个轴上前进一个元素所需跨过的字节数(步长),可见每个 np.float64 元素的长度是 8 Byte = 64 bit。沿跨度大的轴遍历的计算代价更高。
np.ones((3, 4, 5), dtype=np.float32).strides
(80, 20, 4)
np.ones((3, 4, 5), dtype=np.uint16).strides
(40, 10, 2)
np.ones((3, 4, 5), dtype=np.float32).strides
(80, 20, 4)
# Use a concrete integer dtype: passing the abstract ``np.integer`` as a dtype
# is deprecated and resolves to the platform default integer, so the strides
# would differ between platforms. ``np.int64`` always has 8-byte items.
np.ones((3, 4, 5), dtype=np.int64).strides
(160, 40, 8)
1.1 Numpy dtype 层次结构
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
ints.dtype
dtype('uint16')
np.issubdtype(ints.dtype, np.integer)
True
np.issubdtype(ints.dtype, np.floating)
False
np.issubdtype(floats.dtype, np.floating)
True
np.issubdtype('float', np.floating)
True
# 查看父类
np.float64.mro()
[numpy.float64,
numpy.floating,
numpy.inexact,
numpy.number,
numpy.generic,
float,
object]
np.uint16.mro()
[numpy.uint16,
numpy.unsignedinteger,
numpy.integer,
numpy.number,
numpy.generic,
object]
np.issubdtype(np.float64, float)
True
np.issubdtype(np.float32, float)
False
np.float32.mro()
[numpy.float32,
numpy.floating,
numpy.inexact,
numpy.number,
numpy.generic,
object]
2. 高阶数组操作
2.1 重塑数组
arr = np.arange(8)
arr
array([0, 1, 2, 3, 4, 5, 6, 7])
# C-order 按照行方向重塑
# F-order 按照列方向重塑
arr.reshape((4, 2))
array([[0, 1],
[2, 3],
[4, 5],
[6, 7]])
arr.reshape((4, 2), order='F')
array([[0, 4],
[1, 5],
[2, 6],
[3, 7]])
arr.reshape((-1, 4))
array([[0, 1, 2, 3],
[4, 5, 6, 7]])
# 扁平化:flatten(),总是返回数据的副本
# 分散化:ravel(),尽可能返回视图(无副本),此时修改结果会直接改变原数组的value;无法返回视图时才会复制
arr = np.arange(15).reshape((5, 3))
arr
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
arr.flatten()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
arr.flatten()[2]=99
arr
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
arr.ravel()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
arr.ravel()[2] = 99
arr
array([[ 0, 1, 99],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
arr.flatten(order='F')
array([ 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 99, 5, 8, 11, 14])
2.2 C顺序和Fortran顺序
# C-order 从数组shape属性中末尾的轴开始往前遍历,即,先遍历高索引位的轴
# F-order 从数组shape属性中开头的轴开始往后遍历,即,先遍历低索引位的轴
arr = np.arange(12).reshape((3, 4))
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
arr.ravel()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
arr.ravel(order='f')
array([ 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11])
2.3 连接和分割数组
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
# axis指定在哪个轴进行拼接
np.concatenate([arr1, arr2], axis=0)
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
np.concatenate([arr1, arr2], axis=1)
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
arr3 = np.array([1, 2, 3])
arr4 = np.array([4, 5, 6])
np.column_stack((arr3, arr4)) # 会自动将1D数组转为2D
array([[1, 4],
[2, 5],
[3, 6]])
np.column_stack((arr3[None, :], arr4[None, :]))
array([[1, 2, 3, 4, 5, 6]])
np.hstack((arr3, arr4)) # 不会将1D数组转为2D
array([1, 2, 3, 4, 5, 6])
np.row_stack((arr3, arr4))
array([[1, 2, 3],
[4, 5, 6]])
np.vstack((arr3, arr4))
array([[1, 2, 3],
[4, 5, 6]])
# split 切割/分割数组
arr = np.random.randn(5, 4)
print(arr)
first, second, third = np.split(arr, [1, 3], axis=1)
first, second, third
[[ 0.84435272 -1.41113975 0.15211615 -1.09964343]
[-0.05692753 1.54980138 0.84249968 -0.2415958 ]
[ 1.28897645 -0.00376694 -0.09085957 0.38944626]
[ 1.08790741 0.13027303 -0.82255024 -0.42911225]
[-1.70847678 -0.84645461 -0.2277294 -0.61552024]]
(array([[ 0.84435272],
[-0.05692753],
[ 1.28897645],
[ 1.08790741],
[-1.70847678]]),
array([[-1.41113975, 0.15211615],
[ 1.54980138, 0.84249968],
[-0.00376694, -0.09085957],
[ 0.13027303, -0.82255024],
[-0.84645461, -0.2277294 ]]),
array([[-1.09964343],
[-0.2415958 ],
[ 0.38944626],
[-0.42911225],
[-0.61552024]]))
first, second, third = np.hsplit(arr, [1, 3])
first, second, third
(array([[ 0.84435272],
[-0.05692753],
[ 1.28897645],
[ 1.08790741],
[-1.70847678]]),
array([[-1.41113975, 0.15211615],
[ 1.54980138, 0.84249968],
[-0.00376694, -0.09085957],
[ 0.13027303, -0.82255024],
[-0.84645461, -0.2277294 ]]),
array([[-1.09964343],
[-0.2415958 ],
[ 0.38944626],
[-0.42911225],
[-0.61552024]]))
first, second, third = np.vsplit(arr, [1, 3])
first, second, third
(array([[ 0.84435272, -1.41113975, 0.15211615, -1.09964343]]),
array([[-0.05692753, 1.54980138, 0.84249968, -0.2415958 ],
[ 1.28897645, -0.00376694, -0.09085957, 0.38944626]]),
array([[ 1.08790741, 0.13027303, -0.82255024, -0.42911225],
[-1.70847678, -0.84645461, -0.2277294 , -0.61552024]]))
2.3.1 堆叠助手:r_和c_
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.row_stack([arr1, arr2])
array([[ 0. , 1. ],
[ 2. , 3. ],
[ 4. , 5. ],
[ 0.70333255, 0.26275106],
[ 1.99202938, -1.46490714],
[ 0.73384258, 0.43515298]])
np.r_[arr1, arr2]
array([[ 0. , 1. ],
[ 2. , 3. ],
[ 4. , 5. ],
[ 0.70333255, 0.26275106],
[ 1.99202938, -1.46490714],
[ 0.73384258, 0.43515298]])
np.column_stack([np.vstack([arr1, arr2]), arr])
array([[ 0. , 1. , 0. ],
[ 2. , 3. , 1. ],
[ 4. , 5. , 2. ],
[ 0.70333255, 0.26275106, 3. ],
[ 1.99202938, -1.46490714, 4. ],
[ 0.73384258, 0.43515298, 5. ]])
np.c_[np.r_[arr1, arr2], arr]
array([[ 0. , 1. , 0. ],
[ 2. , 3. , 1. ],
[ 4. , 5. , 2. ],
[ 0.70333255, 0.26275106, 3. ],
[ 1.99202938, -1.46490714, 4. ],
[ 0.73384258, 0.43515298, 5. ]])
np.column_stack([1:6, -10:-5])
File "<ipython-input-81-d3ac66c3723b>", line 1
np.column_stack([1:6, -10:-5])
^
SyntaxError: invalid syntax
np.c_[1:6, -10:-5]
array([[ 1, -10],
[ 2, -9],
[ 3, -8],
[ 4, -7],
[ 5, -6]])
2.4 重复元素:tile 和 repeat
# repeat 元素重复
arr = np.arange(3)
arr
array([0, 1, 2])
arr.repeat(3)
array([0, 0, 0, 1, 1, 1, 2, 2, 2])
arr.repeat([2, 1, 4])
array([0, 0, 1, 2, 2, 2, 2])
arr = np.random.randn(2, 3)
arr
array([[ 0.16201186, -0.76919668, -0.71688664],
[-0.70057032, -0.96810267, -1.61212582]])
arr.repeat([2, 1, 4, 2, 2, 2]) # 会扁平化
array([ 0.16201186, 0.16201186, -0.76919668, -0.71688664, -0.71688664,
-0.71688664, -0.71688664, -0.70057032, -0.70057032, -0.96810267,
-0.96810267, -1.61212582, -1.61212582])
arr = np.random.randn(2, 2)
arr
array([[ 1.28555077, 0.33821435],
[-0.51810958, -1.69789573]])
arr.repeat(2) # 如果不指定轴,会造成扁平化
array([ 1.28555077, 1.28555077, 0.33821435, 0.33821435, -0.51810958,
-0.51810958, -1.69789573, -1.69789573])
arr.repeat(2, axis=0)
array([[ 1.28555077, 0.33821435],
[ 1.28555077, 0.33821435],
[-0.51810958, -1.69789573],
[-0.51810958, -1.69789573]])
arr.repeat(2, axis=1)
array([[ 1.28555077, 1.28555077, 0.33821435, 0.33821435],
[-0.51810958, -0.51810958, -1.69789573, -1.69789573]])
arr.repeat([2, 1], axis=0)
array([[ 1.28555077, 0.33821435],
[ 1.28555077, 0.33821435],
[-0.51810958, -1.69789573]])
arr.repeat([2, 3], axis=1)
array([[ 1.28555077, 1.28555077, 0.33821435, 0.33821435, 0.33821435],
[-0.51810958, -0.51810958, -1.69789573, -1.69789573, -1.69789573]])
# tile 是直接整体数组重复
arr
array([[ 1.28555077, 0.33821435],
[-0.51810958, -1.69789573]])
np.tile(arr, 2)
array([[ 1.28555077, 0.33821435, 1.28555077, 0.33821435],
[-0.51810958, -1.69789573, -0.51810958, -1.69789573]])
np.tile(arr, [2, 1])
array([[ 1.28555077, 0.33821435],
[-0.51810958, -1.69789573],
[ 1.28555077, 0.33821435],
[-0.51810958, -1.69789573]])
np.tile(arr, (3, 2))
array([[ 1.28555077, 0.33821435, 1.28555077, 0.33821435],
[-0.51810958, -1.69789573, -0.51810958, -1.69789573],
[ 1.28555077, 0.33821435, 1.28555077, 0.33821435],
[-0.51810958, -1.69789573, -0.51810958, -1.69789573],
[ 1.28555077, 0.33821435, 1.28555077, 0.33821435],
[-0.51810958, -1.69789573, -0.51810958, -1.69789573]])
2.5 神奇索引的等价方法: take 和 put
arr = np.arange(10) * 100
arr
array([ 0, 100, 200, 300, 400, 500, 600, 700, 800, 900])
inds = [7, 1, 2, 6]
arr[inds]
array([700, 100, 200, 600])
arr.take(inds)
array([700, 100, 200, 600])
arr.put(inds, 22)
arr
array([ 0, 22, 22, 300, 400, 500, 22, 22, 800, 900])
arr.put(inds, [1, 2, 3, 4])
arr
array([ 0, 2, 3, 300, 400, 500, 4, 1, 800, 900])
arr = np.random.randn(2, 4)
inds = [2, 0, 2, 1]
arr
array([[-0.49565355, 0.78522712, -0.06629777, -1.00791514],
[ 0.39132436, -0.52828662, -0.82480479, 0.49250005]])
arr.take(inds, axis=1)
array([[-0.06629777, -0.49565355, -0.06629777, 0.78522712],
[-0.82480479, 0.39132436, -0.82480479, -0.52828662]])
注意:put 不接受 axis 参数,它始终按扁平化(C 顺序)后的索引对数组进行原位赋值。
3. 广播 broadcast
首先,其实数组与标量间的运算其实是一种先广播,后element-wise的运算
import numpy as np
arr = np.arange(5)
arr
array([0, 1, 2, 3, 4])
arr * 4
array([ 0, 4, 8, 12, 16])
广播的规则是:从末尾维度开始逐一比较两个数组的轴长,每一对轴长要么相等,要么其中一个是1;缺失的轴视为长度1。比如(4, 3) + (3,),后面的数组符合末尾轴长相等,会在缺失的轴0上广播;或者,广播在轴长为1的轴上进行,比如(4, 3) + (1, 3),会将轴0从1广播为4。对于数组和标量的运算,其实也是利用了广播,比如(4, 3) + scalar,其中scalar的shape实际是(),可以当作(1,)来理解,则末尾长度为1,广播时,末尾的维度广播成3,缺失的轴广播成4。
广播可在两个数组中都进行,比如(4, 4) + (4, 1, 4)
,首先末尾轴长一致,其次不一致的轴长其中一个为1,那么1广播成4,另外缺失轴广播为4.
基于这种规则,有时候想计算(4, 3) + (4, 1)
,而实际上后者为(4,)
的时候,由于末尾轴长不是1,而且3与4也不匹配,因此不能够广播,必须通过reshape,或者[:, None]的方式增加坐标轴,或者利用np.newaxis
因此,其实抓住两个数组的末尾轴长是关键,一看轴长既不是1,也不一致,那么别想广播了,看看怎么写循环操作吧。
arr = np.random.randn(4, 3)
arr.mean(0)
array([ 0.27783846, 0.36009253, -0.1499029 ])
demeaned = arr - arr.mean(0)
demeaned
array([[-0.79969385, -1.6011334 , -0.00747013],
[-0.0381061 , 0.64865496, -0.97992594],
[ 1.13694786, 0.81091045, 0.73967573],
[-0.29914791, 0.14156799, 0.24772034]])
arr.shape, arr.mean(0).shape
((4, 3), (3,))
aaa = np.array([1])
aaa.shape
(1,)
ans = arr - aaa
arr - ans
array([[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.]])
arr.shape, aaa.shape
((4, 3), (1,))
arr.shape
(4, 3)
arr.mean(1).shape
(4,)
arr - arr.mean(1)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-8b8ada26fac0> in <module>
----> 1 arr - arr.mean(1)
ValueError: operands could not be broadcast together with shapes (4,3) (4,)
arr - arr.mean(1).reshape(-1, 1)
array([[ 0.11823437, -0.6009511 , 0.48271673],
[ 0.20018202, 0.96919716, -1.16937918],
[ 0.35626561, 0.11248227, -0.46874788],
[-0.21403229, 0.30893769, -0.0949054 ]])
arr - arr.mean(1)[:, None]
array([[ 0.11823437, -0.6009511 , 0.48271673],
[ 0.20018202, 0.96919716, -1.16937918],
[ 0.35626561, 0.11248227, -0.46874788],
[-0.21403229, 0.30893769, -0.0949054 ]])
arr - arr.mean(1)[:, np.newaxis]
array([[ 0.11823437, -0.6009511 , 0.48271673],
[ 0.20018202, 0.96919716, -1.16937918],
[ 0.35626561, 0.11248227, -0.46874788],
[-0.21403229, 0.30893769, -0.0949054 ]])
一个三维例子:
arr = np.ones((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr.shape, arr_3d.shape
((4, 4), (4, 1, 4))
arr + arr_3d
array([[[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]],
[[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]],
[[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]],
[[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]]])
一个常用的模式,比如,减掉/除掉某个轴的求和/方差/均值之类的:
arr = np.random.randn(3, 4, 5)
# 假如要减掉轴1的均值
means = arr.mean(1)
means
array([[-0.95808939, -0.59395877, 0.44605451, 0.06325242, 0.14369531],
[ 0.2600657 , -0.92595688, -0.75528343, -0.2486933 , -0.02936524],
[-0.22052564, 0.14549496, -0.67660057, -0.10151047, 0.26275483]])
arr.shape, means.shape
((3, 4, 5), (3, 5))
demeaned = arr - means[:, np.newaxis, :]
demeaned.mean(1) < 1e-16
array([[ True, True, True, True, True],
[ True, True, True, True, True],
[ True, True, True, True, True]])
# 可以将其写为一个函数
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr``.

    Args:
        arr: Input ndarray.
        axis: Axis along which the mean is computed and removed.

    Returns:
        An array with the same shape as ``arr`` whose mean along ``axis``
        is zero up to floating-point error.
    """
    means = arr.mean(axis)
    # Re-insert the reduced axis with length 1 so that ``means`` broadcasts
    # against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Index with a tuple: indexing with a list of slices is deprecated and is
    # interpreted as an array index by newer NumPy versions.
    return arr - means[tuple(indexer)]
arr = np.random.randn(3, 4, 5)
demeaned = demean_axis(arr, axis=1)
demeaned.mean(1) < 1e-16
<ipython-input-45-8051ed80feee>:6: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return arr - means[(indexer)]
array([[ True, True, True, True, True],
[ True, True, True, True, True],
[ True, True, True, True, True]])
数组赋值其实也用到了广播:
arr = np.zeros((4, 3))
col = np.array([1.28, 0, 33, 0.5])
arr[:] = col[:, np.newaxis]
arr
array([[ 1.28, 1.28, 1.28],
[ 0. , 0. , 0. ],
[33. , 33. , 33. ],
[ 0.5 , 0.5 , 0.5 ]])
arr[:2] = [[2], [3]]
arr
array([[ 2. , 2. , 2. ],
[ 3. , 3. , 3. ],
[33. , 33. , 33. ],
[ 0.5, 0.5, 0.5]])
4. 高阶 ufunc 用法
4.1 ufunc 实例方法
有点像pandas中,agg这样的函数
# reduce 沿指定轴对数组元素连续应用二元ufunc(比如add,multiply),把该轴聚合为一个值;即先确定数组的形状以及哪些元素参与运算,然后依次做ufunc运算
arr = np.arange(10)
np.add.reduce(arr)
45
arr.sum()
45
arr = np.arange(1, 5)
np.multiply.reduce(arr)
24
np.prod(arr)
24
arr.prod()
24
np.random.seed(12346)
arr = np.random.randn(5, 5)
arr[::2]
array([[-8.99822478e-02, 7.59372617e-01, 7.48336101e-01,
-9.81497953e-01, 3.65775545e-01],
[ 2.48256116e-01, -3.21536673e-01, -8.48730755e-01,
4.60468309e-04, -5.46459347e-01],
[-6.49092950e-01, -4.79535727e-01, -9.53521432e-01,
1.42253882e+00, 1.75403128e-01]])
arr[::2].sort(1)
arr[::2]
array([[-9.81497953e-01, -8.99822478e-02, 3.65775545e-01,
7.48336101e-01, 7.59372617e-01],
[-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
4.60468309e-04, 2.48256116e-01],
[-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
1.75403128e-01, 1.42253882e+00]])
arr
array([[-9.81497953e-01, -8.99822478e-02, 3.65775545e-01,
7.48336101e-01, 7.59372617e-01],
[-3.15442628e-01, -8.66135605e-01, 2.78568155e-02,
-4.55597723e-01, -1.60189223e+00],
[-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
4.60468309e-04, 2.48256116e-01],
[ 2.53915229e-01, 1.93684246e+00, -7.99504902e-01,
-5.69159281e-01, 4.89244731e-02],
[-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
1.75403128e-01, 1.42253882e+00]])
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)
array([ True, False, True, False, True])
np.logical_and.accumulate(arr[:, :-1] < arr[:, 1:], axis=1)
array([[ True, True, True, True],
[False, False, False, False],
[ True, True, True, True],
[ True, False, False, False],
[ True, True, True, True]])
arr[:, :-1] < arr[:, 1:]
array([[ True, True, True, True],
[False, True, False, False],
[ True, True, True, True],
[ True, False, True, True],
[ True, True, True, True]])
np.all(arr[:, :-1] < arr[:, 1:], axis=1)
array([ True, False, True, False, True])
np.logical_and.reduce(arr[:, [0]] < arr[:, [1]], axis=1)
array([ True, False, True, True, True])
arr[:, [0]] < arr[:, [1]]
array([[ True],
[False],
[ True],
[ True],
[ True]])
# 类似于sum与cumsum的关系,与reduce对应的是accumulate
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)
array([[ 0, 1, 3, 6, 10],
[ 5, 11, 18, 26, 35],
[10, 21, 33, 46, 60]])
# outer返回一个类似“外积”只不过不是计算积,而是进行ufunc运算
arr = np.arange(3).repeat([1, 2, 2])
arr
array([0, 1, 1, 2, 2])
np.multiply.outer(arr, np.arange(5))
array([[0, 0, 0, 0, 0],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 2, 4, 6, 8],
[0, 2, 4, 6, 8]])
np.outer(arr, np.arange(5))
array([[0, 0, 0, 0, 0],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 2, 4, 6, 8],
[0, 2, 4, 6, 8]])
np.logical_and.outer(arr, np.arange(5))
array([[False, False, False, False, False],
[False, True, True, True, True],
[False, True, True, True, True],
[False, True, True, True, True],
[False, True, True, True, True]])
# outer 的维度为两个数组shape直接拼接
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape
(3, 4, 5)
# reduceat 相当于groupby
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])
array([10, 18, 17])
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
array([[ 0, 0, 0, 0, 0],
[ 0, 1, 2, 3, 4],
[ 0, 2, 4, 6, 8],
[ 0, 3, 6, 9, 12]])
np.add.reduceat(arr, [0, 2, 4], axis=1)
array([[ 0, 0, 0],
[ 1, 5, 4],
[ 2, 10, 8],
[ 3, 15, 12]])
4.2 使用python编写新的ufunc方法
# 利用 numpy.vectorize
def add_elements(x, y):
    """Return ``x + y`` for a single pair of operands.

    This is the scalar kernel that ``np.vectorize`` applies element by
    element.
    """
    return x + y
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))
array([ 0., 2., 4., 6., 8., 10., 12., 14.])
add_them(np.random.randn(2, 2), np.random.randn(2, 2))
array([[-1.33080793, -1.43407981],
[ 0.15584993, 1.0519004 ]])
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
1.18 ms ± 15.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
arr = np.random.randn(10000)
%timeit np.add(arr, arr)
2.53 µs ± 43.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
5. 结构化和记录数组
import numpy as np
# 利用(field_name, field_data_type)作为dtype的元组列表
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr
array([(1.5 , 6), (3.14159265, -2)],
dtype=[('x', '<f8'), ('y', '<i4')])
sarr[0]
(1.5, 6)
sarr['x']
array([1.5 , 3.14159265])
sarr['y']
array([ 6, -2], dtype=int32)
sarr['x'].dtype.name
'float64'
sarr.dtype.name
'void96'
sarr.dtype.names
('x', 'y')
sarr.dtype
dtype([('x', '<f8'), ('y', '<i4')])
sarr[0]['y']
6
5.1 嵌套dtype和多维字段
# 可以向dtype多传递一个形状,用于指定作用次数
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr
array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
dtype=[('x', '<i8', (3,)), ('y', '<i4')])
np.array([1, 2, 3, 4], dtype=dtype)
array([([1, 1, 1], 1), ([2, 2, 2], 2), ([3, 3, 3], 3), ([4, 4, 4], 4)],
dtype=[('x', '<i8', (3,)), ('y', '<i4')])
arr[0]
([0, 0, 0], 0)
arr[0]['x']
array([0, 0, 0])
arr['x']
array([[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]])
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 3), ((3, 4), 5)], dtype=dtype)
data
array([((1., 2.), 3), ((3., 4.), 5)],
dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])
data['x']
array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])
data['y']
array([3, 5], dtype=int32)
data['x']['a']
array([1., 3.])
6. 更多关于排序的内容
import numpy as np
# ndarray 的 sort 实例方法 与 python 内建的列表排序类似,是一种原位排序,而不生成新数组
arr = np.random.randn(6)
arr.sort()
arr
array([-1.20245647, -1.06934741, 0.23694375, 0.46847888, 1.33116886,
2.07072179])
arr = np.random.randn(3, 5)
arr
array([[-0.1373051 , 0.8347231 , -2.13610283, 0.6911535 , 1.29073812],
[-0.89613231, 0.40151617, 1.19168597, -0.3273313 , 1.15674067],
[-0.59111152, -0.13488416, 0.13590381, 0.07592941, -0.92518222]])
arr[:, 0].sort()
arr
array([[-0.89613231, 0.8347231 , -2.13610283, 0.6911535 , 1.29073812],
[-0.59111152, 0.40151617, 1.19168597, -0.3273313 , 1.15674067],
[-0.1373051 , -0.13488416, 0.13590381, 0.07592941, -0.92518222]])
# 与实例方法不同的是,numpy.sort 产生的是新的副本
arr = np.random.randn(5)
arr
array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966, 0.53958451])
np.sort(arr)
array([-2.1607857 , -1.74567649, -0.98139601, -0.93574966, 0.53958451])
arr
array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966, 0.53958451])
# 可以指定轴
arr = np.random.randn(3, 5)
arr
array([[-2.9614586 , 0.43584634, -0.27190644, 0.03146461, 0.21746412],
[ 0.4819753 , -1.1517702 , 1.59403466, -0.51082439, 0.05183487],
[ 0.6184096 , -1.34489717, 0.05997099, 1.23059888, 1.84840695]])
arr.sort(axis=1)
arr
array([[-2.9614586 , -0.27190644, 0.03146461, 0.21746412, 0.43584634],
[-1.1517702 , -0.51082439, 0.05183487, 0.4819753 , 1.59403466],
[-1.34489717, 0.05997099, 0.6184096 , 1.23059888, 1.84840695]])
# 逆序的方法
arr[:, ::-1]
array([[ 0.43584634, 0.21746412, 0.03146461, -0.27190644, -2.9614586 ],
[ 1.59403466, 0.4819753 , 0.05183487, -0.51082439, -1.1517702 ],
[ 1.84840695, 1.23059888, 0.6184096 , 0.05997099, -1.34489717]])
6.1 间接排序: argsort 和 lexsort
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
array([1, 2, 4, 3, 0])
values[indexer]
array([0, 1, 2, 3, 5])
arr = np.random.randn(3, 5)
arr[0] = values
arr
array([[ 5. , 0. , 1. , 3. , 2. ],
[-1.27871978, 1.65385215, 1.04044587, 0.89253023, 0.12713788],
[ 1.58960486, 1.06406754, 0.06449551, 0.17571087, 1.35782749]])
arr[:, indexer]
array([[ 0. , 1. , 2. , 3. , 5. ],
[ 1.65385215, 1.04044587, 0.12713788, 0.89253023, -1.27871978],
[ 1.06406754, 0.06449551, 1.35782749, 0.17571087, 1.58960486]])
arr[:, arr[0].argsort()]
array([[ 0. , 1. , 2. , 3. , 5. ],
[ 1.65385215, 1.04044587, 0.12713788, 0.89253023, -1.27871978],
[ 1.06406754, 0.06449551, 1.35782749, 0.17571087, 1.58960486]])
lexsort 与 argsort 类似,不过其对多键数组进行间接字典排序
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name)) # 最后一个键(last_name)是主排序键,last_name相同时再按first_name排序
sorter
array([1, 2, 3, 0, 4])
zip(last_name[sorter], first_name[sorter])
<zip at 0x7fd37101f1c0>
[x for x in zip(last_name[sorter], first_name[sorter])]
[('Arnold', 'Jane'),
('Arnold', 'Steve'),
('Jones', 'Bill'),
('Jones', 'Bob'),
('Walters', 'Barbara')]
6.2 其他排序算法
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
array([2, 3, 4, 0, 1])
values.take(indexer)
array(['1:first', '1:second', '1:third', '2:first', '2:second'],
dtype='<U8')
indexer = key.argsort(kind='quicksort')
indexer
array([2, 3, 4, 0, 1])
indexer = key.argsort(kind='heapsort')
indexer
array([4, 2, 3, 1, 0])
values.take(indexer)
array(['1:third', '1:first', '1:second', '2:second', '2:first'],
dtype='<U8')
6.3 数组的部分排序
# numpy.partition(arr, kth, axis),其中kth表示排序后的位置索引(从0开始):该位置上的元素就位,比它小的数排在它之前,比它大的排在其后,两侧内部不保证有序
np.random.seed(12345)
arr = np.random.randn(20)
arr
array([-0.20470766, 0.47894334, -0.51943872, -0.5557303 , 1.96578057,
1.39340583, 0.09290788, 0.28174615, 0.76902257, 1.24643474,
1.00718936, -1.29622111, 0.27499163, 0.22891288, 1.35291684,
0.88642934, -2.00163731, -0.37184254, 1.66902531, -0.43856974])
np.partition(arr, 3)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
-0.43856974, -0.20470766, 0.28174615, 0.76902257, 0.47894334,
1.00718936, 0.09290788, 0.27499163, 0.22891288, 1.35291684,
0.88642934, 1.39340583, 1.96578057, 1.66902531, 1.24643474])
arr
array([-0.20470766, 0.47894334, -0.51943872, -0.5557303 , 1.96578057,
1.39340583, 0.09290788, 0.28174615, 0.76902257, 1.24643474,
1.00718936, -1.29622111, 0.27499163, 0.22891288, 1.35291684,
0.88642934, -2.00163731, -0.37184254, 1.66902531, -0.43856974])
np.sort(arr)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.43856974,
-0.37184254, -0.20470766, 0.09290788, 0.22891288, 0.27499163,
0.28174615, 0.47894334, 0.76902257, 0.88642934, 1.00718936,
1.24643474, 1.35291684, 1.39340583, 1.66902531, 1.96578057])
np.partition(arr, 10)
array([-1.29622111, -0.43856974, -0.51943872, -0.5557303 , -0.37184254,
-2.00163731, -0.20470766, 0.09290788, 0.22891288, 0.27499163,
0.28174615, 1.00718936, 0.47894334, 0.76902257, 1.35291684,
0.88642934, 1.39340583, 1.96578057, 1.66902531, 1.24643474])
arr
array([-0.20470766, 0.47894334, -0.51943872, -0.5557303 , 1.96578057,
1.39340583, 0.09290788, 0.28174615, 0.76902257, 1.24643474,
1.00718936, -1.29622111, 0.27499163, 0.22891288, 1.35291684,
0.88642934, -2.00163731, -0.37184254, 1.66902531, -0.43856974])
indices = np.argpartition(arr, 3) # 取索引
indices
array([16, 11, 3, 2, 17, 19, 0, 7, 8, 1, 10, 6, 12, 13, 14, 15, 5,
4, 18, 9])
arr.take(indices)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
-0.43856974, -0.20470766, 0.28174615, 0.76902257, 0.47894334,
1.00718936, 0.09290788, 0.27499163, 0.22891288, 1.35291684,
0.88642934, 1.39340583, 1.96578057, 1.66902531, 1.24643474])
arr.take(indices) == np.partition(arr, 3)
array([ True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True])
6.4 numpy.searchsorted:在已排序数组中寻找元素
arr = np.array([0, 8, 11, 8, 15])
arr.searchsorted(9)
2
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)
3
arr.searchsorted([0, 8, 11, 16])
array([0, 3, 3, 5])
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
array([0, 3])
arr.searchsorted([0, 1], side='right')
array([3, 7])
# 一个很典型的应用
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data
array([9940., 6768., 7908., 1709., 268., 8003., 9037., 246., 4917.,
5262., 5963., 519., 8950., 7282., 8183., 5002., 8101., 959.,
2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
4954., 3516., 7142., 5039., 2256.])
labels = bins.searchsorted(data)
labels
array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
4, 3, 3, 4, 4, 3])
# 配合pandas的groupby,可以实现按区间分组(分箱)聚合
import pandas as pd
pd.Series(data).groupby(labels).mean()
2 498.000000
3 3064.277778
4 7389.035714
dtype: float64
7. 使用Numba编写快速Numpy函数
import numpy as np
# 现考虑一个python本身的函数,看看有多慢
def mean_distance(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    The differences are signed (no absolute value is taken). The explicit
    index loop is kept on purpose: this function serves as the pure-Python
    baseline in the timings and is the function handed to ``nb.jit``.

    Args:
        x: Indexable sequence of numbers; its length drives the loop.
        y: Indexable sequence with at least ``len(x)`` numbers.

    Returns:
        The sum of ``x[i] - y[i]`` divided by ``len(x)``.

    Raises:
        ZeroDivisionError: If ``x`` is empty (pure-Python execution).
    """
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count
x = np.random.randn(10000000)
y = np.random.randn(10000000)
%timeit mean_distance(x, y)
2.68 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit (x - y).mean()
26.9 ms ± 549 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 用numba.jit函数将上面的函数编译成Numba函数:
import numba as nb
numba_mean_distance = nb.jit(mean_distance)
# 等价方法
@nb.jit
def numba_mean_distance2(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    Same explicit loop as the pure-Python version, compiled by Numba through
    the ``@nb.jit`` decorator instead of a separate ``nb.jit(...)`` call.
    """
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count
%timeit numba_mean_distance(x, y) # 甚至比numpy的函数还快
14.6 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
注意Numba不能将任意的python代码编译成机器代码,但它支持纯python代码中一个重要的子集。
%timeit numba_mean_distance2(x, y)
16.3 ms ± 911 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Numba将不知道如何编译的函数调用替换为对CPython API的调用。Numba的jit函数有一个选项,nopython=True,将允许的代码限制为无须调用任何Python的C语言API、可以直接编译为LLVM的Python代码。jit(nopython=True)有一个短的别名numba.njit。
from numba import float64, njit
@njit(float64(float64[:], float64[:]))
def njit_mean_distance(x, y):
    """Return the mean of ``x - y`` for two 1-D float64 arrays.

    The explicit signature in the decorator declares two 1-D float64 array
    arguments and a float64 return value.
    """
    return (x - y).mean()
%timeit njit_mean_distance(x, y)
36.9 ms ± 174 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
7.1 使用Numba创建自定义numpy.ufunc对象
from numba import vectorize
@vectorize
def nb_add(x, y):
    """Scalar addition kernel turned into a ufunc by Numba's ``vectorize``."""
    return x + y
x = np.arange(10)
nb_add(x, x)
array([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18])
nb_add.accumulate(x, axis=0) # 即,建立的一个编译好的Numpy ufunc,其行为与内建的numpy函数一样
array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45])
8. 高阶数组输入和输出
8.1 内存映射文件
是一种与磁盘上的二进制数据交互的方法,就像它是存储在内存数组中一样。允许对大型文件以小分段的方式进行读取和写入,而无须将整个数组载入内存。
# 创建内存映射,使用np.memmap传入文件路径、dtype、shape和文件模式:
mmap = np.memmap('mymap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap
memmap([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
# memmap的切片返回的是硬盘上数据的视图,如果将数据赋值给切片,那么其将在内存中缓冲(类似一个python文件对象),可以调用flush写入硬盘。
section = mmap[:5]
section.shape
(5, 10000)
section[:] = np.random.randn(5, 10000)
section
memmap([[-1.4264226 , 0.21729148, 1.60461715, ..., 0.3102347 ,
0.17720547, 1.69646377],
[-1.20953714, -2.7361618 , -0.23058431, ..., 0.33713541,
0.67793013, 0.60138858],
[ 0.71859367, -0.34768919, 1.33271115, ..., 0.32399778,
1.03741373, -0.65384645],
[-0.6104341 , 0.64413784, 0.42810329, ..., 1.25154416,
0.34818979, 0.80809682],
[ 0.21072709, 0.09675299, -0.10433349, ..., 1.22574256,
-0.20164288, 0.46595202]])
mmap
memmap([[-1.4264226 , 0.21729148, 1.60461715, ..., 0.3102347 ,
0.17720547, 1.69646377],
[-1.20953714, -2.7361618 , -0.23058431, ..., 0.33713541,
0.67793013, 0.60138858],
[ 0.71859367, -0.34768919, 1.33271115, ..., 0.32399778,
1.03741373, -0.65384645],
...,
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ]])
mmap.flush()
mmap
memmap([[-1.4264226 , 0.21729148, 1.60461715, ..., 0.3102347 ,
0.17720547, 1.69646377],
[-1.20953714, -2.7361618 , -0.23058431, ..., 0.33713541,
0.67793013, 0.60138858],
[ 0.71859367, -0.34768919, 1.33271115, ..., 0.32399778,
1.03741373, -0.65384645],
...,
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ]])
del mmap # 内存中删除
mmap
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-35-49f7bdb78b33> in <module>
----> 1 mmap
NameError: name 'mmap' is not defined
当内存映射超出范围且被当垃圾收集时,任何更改都将刷新到硬盘。由于硬盘中只有二进制数据,没有其他信息的元数据,打开时必须指定dtype和shape。其中dtype也适用结构化或者嵌套的dtype。
mmap = np.memmap('mymap', dtype='float64', shape=(10000, 10000))
mmap
memmap([[-1.4264226 , 0.21729148, 1.60461715, ..., 0.3102347 ,
0.17720547, 1.69646377],
[-1.20953714, -2.7361618 , -0.23058431, ..., 0.33713541,
0.67793013, 0.60138858],
[ 0.71859367, -0.34768919, 1.33271115, ..., 0.32399778,
1.03741373, -0.65384645],
...,
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ],
[ 0. , 0. , 0. , ..., 0. ,
0. , 0. ]])
8.2 HDF5 和其他数组存储选择
HDF(Hierarchical Data Format)分层数据格式,是一种可压缩的数组格式。
9. 性能技巧
- 将python循环和条件逻辑转换为数组操作和布尔数组操作
- 尽可能使用广播
- 使用数组视图(切片)避免复制数据
- 使用ufunc和ufunc方法
9.1 连续内存的重要性
根据程序访问的局部性原理,当按照数组原来的顺序(C或者F)访问,可以避免跨步,同时Cache的命中率更高。可以用ndarray的flags属性检查顺序。
arr_c = np.ones((5000, 5000), order='C')
arr_f = np.ones((5000, 5000), order='F')
arr_c.flags
C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
arr_f.flags
C_CONTIGUOUS : False
F_CONTIGUOUS : True
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
arr_f.flags.f_contiguous
True
arr_f.flags.c_contiguous
False
arr_c.flags.c_contiguous
True
%timeit arr_c.sum(axis=1)
13.1 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit arr_c.sum(axis=0)
13.7 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit arr_f.sum(axis=0)
11.6 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit arr_f.sum(axis=1)
15.2 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 转换到所需的顺序,可以用转置,或者copy
arr_f.copy('C').flags
C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
# 数组上构建视图,结果并不一定是连续的
arr_c[:50].flags.contiguous
True
arr_c[:50].flags.c_contiguous
True
arr_c[:50].flags.f_contiguous
False
arr_c[:, :50].flags
C_CONTIGUOUS : False
F_CONTIGUOUS : False
OWNDATA : False
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
UPDATEIFCOPY : False
arr_c[:, :50].flags.contiguous
False