Numpy cheatsheet

1. ndarray 对象内幕

import numpy as np
np.ones((3, 4, 5), dtype=np.float64).strides
(160, 40, 8)

strides(步长)表示沿各个轴前进一个元素时需要跨越的字节数,可见每个np.float64的长度是8 Byte = 64 bit。通常,沿步长较大的轴进行计算的代价更高。

np.ones((3, 4, 5), dtype=np.float32).strides
(80, 20, 4)
np.ones((3, 4, 5), dtype=np.uint16).strides
(40, 10, 2)
np.ones((3, 4, 5), dtype=np.float32).strides
(80, 20, 4)
np.ones((3, 4, 5), dtype=np.integer).strides
(160, 40, 8)

1.1 Numpy dtype 层次结构

ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
ints.dtype
dtype('uint16')
np.issubdtype(ints.dtype, np.integer)
True
np.issubdtype(ints.dtype, np.floating)
False
np.issubdtype(floats.dtype, np.floating)
True
np.issubdtype('float', np.floating)
True
# 查看父类
np.float64.mro()
[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]
np.uint16.mro()
[numpy.uint16,
 numpy.unsignedinteger,
 numpy.integer,
 numpy.number,
 numpy.generic,
 object]
np.issubdtype(np.float64, float)
True
np.issubdtype(np.float32, float)
False
np.float32.mro()
[numpy.float32,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 object]

2. 高阶数组操作

2.1 重塑数组

arr = np.arange(8)
arr
array([0, 1, 2, 3, 4, 5, 6, 7])
# C-order 按照行方向重塑
# F-order 按照列方向重塑
arr.reshape((4, 2))
array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])
arr.reshape((4, 2), order='F')
array([[0, 4],
       [1, 5],
       [2, 6],
       [3, 7]])
arr.reshape((-1, 4))
array([[0, 1, 2, 3],
       [4, 5, 6, 7]])
# 扁平化:flatten(),返回数据的副本
# 分散化:ravel(),尽可能返回视图而不复制数据,因此修改其结果可能会直接改变原数组的value;只有在必要时才返回副本
arr = np.arange(15).reshape((5, 3))
arr
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])
arr.flatten()
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])
arr.flatten()[2]=99
arr
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])
arr.ravel()
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])
arr.ravel()[2] = 99
arr
array([[ 0,  1, 99],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])
arr.flatten(order='F')
array([ 0,  3,  6,  9, 12,  1,  4,  7, 10, 13, 99,  5,  8, 11, 14])

2.2 C顺序和Fortran顺序

# C-order 从数组shape属性中末尾的轴开始往前遍历,即,先遍历高索引位的轴
# F-order 从数组shape属性中开头的轴开始往后遍历,即,先遍历低索引位的轴
arr = np.arange(12).reshape((3, 4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
arr.ravel()
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
arr.ravel(order='f')
array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

2.3 连接和分割数组

arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
# axis指定在哪个轴进行拼接
np.concatenate([arr1, arr2], axis=0)
array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])
np.concatenate([arr1, arr2], axis=1)
array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])
arr3 = np.array([1, 2, 3])
arr4 = np.array([4, 5, 6])
np.column_stack((arr3, arr4)) # 会自动将1D数组转为2D
array([[1, 4],
       [2, 5],
       [3, 6]])
np.column_stack((arr3[None, :], arr4[None, :]))
array([[1, 2, 3, 4, 5, 6]])
np.hstack((arr3, arr4)) # 不会将1D数组转为2D
array([1, 2, 3, 4, 5, 6])
np.row_stack((arr3, arr4))
array([[1, 2, 3],
       [4, 5, 6]])
np.vstack((arr3, arr4))
array([[1, 2, 3],
       [4, 5, 6]])
# split 切割/分割数组
arr = np.random.randn(5, 4)
print(arr)
first, second, third = np.split(arr, [1, 3], axis=1)
first, second, third
[[ 0.84435272 -1.41113975  0.15211615 -1.09964343]
 [-0.05692753  1.54980138  0.84249968 -0.2415958 ]
 [ 1.28897645 -0.00376694 -0.09085957  0.38944626]
 [ 1.08790741  0.13027303 -0.82255024 -0.42911225]
 [-1.70847678 -0.84645461 -0.2277294  -0.61552024]]





(array([[ 0.84435272],
        [-0.05692753],
        [ 1.28897645],
        [ 1.08790741],
        [-1.70847678]]),
 array([[-1.41113975,  0.15211615],
        [ 1.54980138,  0.84249968],
        [-0.00376694, -0.09085957],
        [ 0.13027303, -0.82255024],
        [-0.84645461, -0.2277294 ]]),
 array([[-1.09964343],
        [-0.2415958 ],
        [ 0.38944626],
        [-0.42911225],
        [-0.61552024]]))
first, second, third = np.hsplit(arr, [1, 3])
first, second, third
(array([[ 0.84435272],
        [-0.05692753],
        [ 1.28897645],
        [ 1.08790741],
        [-1.70847678]]),
 array([[-1.41113975,  0.15211615],
        [ 1.54980138,  0.84249968],
        [-0.00376694, -0.09085957],
        [ 0.13027303, -0.82255024],
        [-0.84645461, -0.2277294 ]]),
 array([[-1.09964343],
        [-0.2415958 ],
        [ 0.38944626],
        [-0.42911225],
        [-0.61552024]]))
first, second, third = np.vsplit(arr, [1, 3])
first, second, third
(array([[ 0.84435272, -1.41113975,  0.15211615, -1.09964343]]),
 array([[-0.05692753,  1.54980138,  0.84249968, -0.2415958 ],
        [ 1.28897645, -0.00376694, -0.09085957,  0.38944626]]),
 array([[ 1.08790741,  0.13027303, -0.82255024, -0.42911225],
        [-1.70847678, -0.84645461, -0.2277294 , -0.61552024]]))

2.3.1 堆叠助手:r_和c_

arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.row_stack([arr1, arr2])
array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.70333255,  0.26275106],
       [ 1.99202938, -1.46490714],
       [ 0.73384258,  0.43515298]])
np.r_[arr1, arr2]
array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.70333255,  0.26275106],
       [ 1.99202938, -1.46490714],
       [ 0.73384258,  0.43515298]])
np.column_stack([np.vstack([arr1, arr2]), arr])
array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.70333255,  0.26275106,  3.        ],
       [ 1.99202938, -1.46490714,  4.        ],
       [ 0.73384258,  0.43515298,  5.        ]])
np.c_[np.r_[arr1, arr2], arr]
array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.70333255,  0.26275106,  3.        ],
       [ 1.99202938, -1.46490714,  4.        ],
       [ 0.73384258,  0.43515298,  5.        ]])
np.column_stack([1:6, -10:-5])
  File "<ipython-input-81-d3ac66c3723b>", line 1
    np.column_stack([1:6, -10:-5])
                      ^
SyntaxError: invalid syntax
np.c_[1:6, -10:-5]
array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

2.4 重复元素:tile 和 repeat

# repeat 元素重复
arr = np.arange(3)
arr
array([0, 1, 2])
arr.repeat(3)
array([0, 0, 0, 1, 1, 1, 2, 2, 2])
arr.repeat([2, 1, 4])
array([0, 0, 1, 2, 2, 2, 2])
arr = np.random.randn(2, 3)
arr
array([[ 0.16201186, -0.76919668, -0.71688664],
       [-0.70057032, -0.96810267, -1.61212582]])
arr.repeat([2, 1, 4, 2, 2, 2]) # 会扁平化
array([ 0.16201186,  0.16201186, -0.76919668, -0.71688664, -0.71688664,
       -0.71688664, -0.71688664, -0.70057032, -0.70057032, -0.96810267,
       -0.96810267, -1.61212582, -1.61212582])
arr = np.random.randn(2, 2)
arr
array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])
arr.repeat(2) # 如果不指定轴,会造成扁平化
array([ 1.28555077,  1.28555077,  0.33821435,  0.33821435, -0.51810958,
       -0.51810958, -1.69789573, -1.69789573])
arr.repeat(2, axis=0)
array([[ 1.28555077,  0.33821435],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573],
       [-0.51810958, -1.69789573]])
arr.repeat(2, axis=1)
array([[ 1.28555077,  1.28555077,  0.33821435,  0.33821435],
       [-0.51810958, -0.51810958, -1.69789573, -1.69789573]])
arr.repeat([2, 1], axis=0)
array([[ 1.28555077,  0.33821435],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])
arr.repeat([2, 3], axis=1)
array([[ 1.28555077,  1.28555077,  0.33821435,  0.33821435,  0.33821435],
       [-0.51810958, -0.51810958, -1.69789573, -1.69789573, -1.69789573]])
# tile 是直接整体数组重复
arr
array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])
np.tile(arr, 2)
array([[ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573]])
np.tile(arr, [2, 1])
array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])
np.tile(arr, (3, 2))
array([[ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573]])

2.5 神奇索引的等价方法: take 和 put

arr = np.arange(10) * 100
arr
array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])
inds = [7, 1, 2, 6]
arr[inds]
array([700, 100, 200, 600])
arr.take(inds)
array([700, 100, 200, 600])
arr.put(inds, 22)
arr
array([  0,  22,  22, 300, 400, 500,  22,  22, 800, 900])
arr.put(inds, [1, 2, 3, 4])
arr
array([  0,   2,   3, 300, 400, 500,   4,   1, 800, 900])
arr = np.random.randn(2, 4)
inds = [2, 0, 2, 1]
arr
array([[-0.49565355,  0.78522712, -0.06629777, -1.00791514],
       [ 0.39132436, -0.52828662, -0.82480479,  0.49250005]])
arr.take(inds, axis=1)
array([[-0.06629777, -0.49565355, -0.06629777,  0.78522712],
       [-0.82480479,  0.39132436, -0.82480479, -0.52828662]])

注意:put不接受axis参数

3. 广播 broadcast

首先,数组与标量间的运算其实是一种先广播、后逐元素(element-wise)的运算

import numpy as np
arr = np.arange(5)
arr
array([0, 1, 2, 3, 4])
arr * 4
array([ 0,  4,  8, 12, 16])

广播的规则是:从末尾维度开始逐一比较两个数组的轴长,每一对轴长要么相等,要么其中一个为1;缺失的轴按长度1处理。广播会在缺失的轴上进行,比如(4, 3) + (3,),后面的数组符合末尾轴长相等的条件,于是会广播出缺失的轴0;广播也会在轴长为1的轴上进行,比如(4, 3) + (1, 3),会将轴0从1广播为4。数组和标量的运算其实也利用了广播,比如(4, 3) + scalar,其中scalar(标量)的shape为(),也可以当作(1,)来理解:末尾轴长为1,广播成3,缺失的轴广播成4。

广播可在两个数组中同时进行,比如(4, 4) + (4, 1, 4):首先末尾轴长一致(都是4);其次中间的轴长不一致,但其中一个为1,于是1广播成4;最后,前者缺失的最前面的轴广播为4,结果的shape为(4, 4, 4)。

基于这种规则,有时候想计算(4, 3) + (4, 1),而实际上后者为(4,)的时候,由于末尾轴长不是1,而且3与4也不匹配,因此不能够广播,必须通过reshape,或者[:, None]的方式增加坐标轴,或者利用np.newaxis

因此,其实抓住两个数组的末尾轴长是关键,一看轴长既不是1,也不一致,那么别想广播了,看看怎么写循环操作吧。

arr = np.random.randn(4, 3)
arr.mean(0)
array([ 0.27783846,  0.36009253, -0.1499029 ])
demeaned = arr - arr.mean(0)
demeaned
array([[-0.79969385, -1.6011334 , -0.00747013],
       [-0.0381061 ,  0.64865496, -0.97992594],
       [ 1.13694786,  0.81091045,  0.73967573],
       [-0.29914791,  0.14156799,  0.24772034]])
arr.shape, arr.mean(0).shape
((4, 3), (3,))
aaa = np.array([1])
aaa.shape
(1,)
ans = arr - aaa
arr - ans
array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])
arr.shape, aaa.shape
((4, 3), (1,))
arr.shape
(4, 3)
arr.mean(1).shape
(4,)
arr - arr.mean(1)
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-18-8b8ada26fac0> in <module>
----> 1 arr - arr.mean(1)


ValueError: operands could not be broadcast together with shapes (4,3) (4,) 
arr - arr.mean(1).reshape(-1, 1)
array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])
arr - arr.mean(1)[:, None]
array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])
arr - arr.mean(1)[:, np.newaxis]
array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

一个三维例子:

arr = np.ones((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr.shape, arr_3d.shape
((4, 4), (4, 1, 4))
arr + arr_3d
array([[[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]]])

一个常用的模式,比如,减掉/除掉某个轴的求和/方差/均值之类的:

arr = np.random.randn(3, 4, 5)
# 假如要减掉轴1的均值
means = arr.mean(1)
means
array([[-0.95808939, -0.59395877,  0.44605451,  0.06325242,  0.14369531],
       [ 0.2600657 , -0.92595688, -0.75528343, -0.2486933 , -0.02936524],
       [-0.22052564,  0.14549496, -0.67660057, -0.10151047,  0.26275483]])
arr.shape, means.shape
((3, 4, 5), (3, 5))
demeaned = arr - means[:, np.newaxis, :]
demeaned.mean(1) < 1e-16
array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])
# 可以将其写为一个函数
def demean_axis(arr, axis=0):
    """Return ``arr`` with its mean along ``axis`` subtracted.

    Parameters
    ----------
    arr : ndarray
        Input array. It is not modified; a new array is returned.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr``.
    """
    means = arr.mean(axis)
    # Re-insert the reduced axis as a length-1 axis so that ``means``
    # broadcasts against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Index with a tuple, not a list: indexing with a non-tuple sequence is
    # deprecated (FutureWarning) and is no longer treated as a
    # multidimensional index by newer NumPy releases.
    return arr - means[tuple(indexer)]
arr = np.random.randn(3, 4, 5)
demeaned = demean_axis(arr, axis=1)
demeaned.mean(1) < 1e-16
<ipython-input-45-8051ed80feee>:6: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return arr - means[(indexer)]





array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

数组赋值其实也用到了广播:

arr = np.zeros((4, 3))
col = np.array([1.28, 0, 33, 0.5])
arr[:] = col[:, np.newaxis]
arr
array([[ 1.28,  1.28,  1.28],
       [ 0.  ,  0.  ,  0.  ],
       [33.  , 33.  , 33.  ],
       [ 0.5 ,  0.5 ,  0.5 ]])
arr[:2] = [[2], [3]]
arr
array([[ 2. ,  2. ,  2. ],
       [ 3. ,  3. ,  3. ],
       [33. , 33. , 33. ],
       [ 0.5,  0.5,  0.5]])

4. 高阶 ufunc 用法

4.1 ufunc 实例方法

有点像pandas中,agg这样的函数

# reduce 对数组连续应用二元ufunc(比如add、multiply)进行聚合(可通过axis指定聚合的轴),例如np.add.reduce(arr)等价于arr.sum()

arr = np.arange(10)
np.add.reduce(arr)
45
arr.sum()
45
arr = np.arange(1, 5)
np.multiply.reduce(arr)
24
np.prod(arr)
24
arr.prod()
24
np.random.seed(12346)
arr = np.random.randn(5, 5)
arr[::2]
array([[-8.99822478e-02,  7.59372617e-01,  7.48336101e-01,
        -9.81497953e-01,  3.65775545e-01],
       [ 2.48256116e-01, -3.21536673e-01, -8.48730755e-01,
         4.60468309e-04, -5.46459347e-01],
       [-6.49092950e-01, -4.79535727e-01, -9.53521432e-01,
         1.42253882e+00,  1.75403128e-01]])
arr[::2].sort(1)
arr[::2]
array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])
arr
array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-3.15442628e-01, -8.66135605e-01,  2.78568155e-02,
        -4.55597723e-01, -1.60189223e+00],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [ 2.53915229e-01,  1.93684246e+00, -7.99504902e-01,
        -5.69159281e-01,  4.89244731e-02],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)
array([ True, False,  True, False,  True])
np.logical_and.accumulate(arr[:, :-1] < arr[:, 1:], axis=1)
array([[ True,  True,  True,  True],
       [False, False, False, False],
       [ True,  True,  True,  True],
       [ True, False, False, False],
       [ True,  True,  True,  True]])
arr[:, :-1] < arr[:, 1:]
array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])
np.all(arr[:, :-1] < arr[:, 1:], axis=1)
array([ True, False,  True, False,  True])
np.logical_and.reduce(arr[:, [0]] < arr[:, [1]], axis=1)
array([ True, False,  True,  True,  True])
arr[:, [0]] < arr[:, [1]]
array([[ True],
       [False],
       [ True],
       [ True],
       [ True]])
# 类似于sum与cumsum的关系,与reduce对应的是accumulate
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)
array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])
# outer返回一个类似“外积”只不过不是计算积,而是进行ufunc运算
arr = np.arange(3).repeat([1, 2, 2])
arr
array([0, 1, 1, 2, 2])
np.multiply.outer(arr, np.arange(5))
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])
np.outer(arr, np.arange(5))
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])
np.logical_and.outer(arr, np.arange(5))
array([[False, False, False, False, False],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True]])
# outer 的维度为两个数组shape直接拼接
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape
(3, 4, 5)
# reduceat 相当于groupby
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])
array([10, 18, 17])
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])
np.add.reduceat(arr, [0, 2, 4], axis=1)
array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

4.2 使用python编写新的ufunc方法

# 利用 numpy.vectorize
def add_elements(x, y):
    """Return ``x + y``; scalar kernel meant to be wrapped by ``np.vectorize``."""
    total = x + y
    return total

add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))
array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])
add_them(np.random.randn(2, 2), np.random.randn(2, 2))
array([[-1.33080793, -1.43407981],
       [ 0.15584993,  1.0519004 ]])
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
1.18 ms ± 15.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
arr = np.random.randn(10000)
%timeit np.add(arr, arr)
2.53 µs ± 43.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

5. 结构化和记录数组

import numpy as np
# 利用(field_name, field_data_type)作为dtype的元组列表
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr
array([(1.5       ,  6), (3.14159265, -2)],
      dtype=[('x', '<f8'), ('y', '<i4')])
sarr[0]
(1.5, 6)
sarr['x']
array([1.5       , 3.14159265])
sarr['y']
array([ 6, -2], dtype=int32)
sarr['x'].dtype.name
'float64'
sarr.dtype.name
'void96'
sarr.dtype.names
('x', 'y')
sarr.dtype
dtype([('x', '<f8'), ('y', '<i4')])
sarr[0]['y']
6

5.1 嵌套dtype和多维字段

# 定义字段时可以额外传入一个形状,使该字段成为指定形状的子数组
dtype = [('x', np.int64, 3), ('y', np.int32)]

arr = np.zeros(4, dtype=dtype)
arr
array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])
np.array([1, 2, 3, 4], dtype=dtype)
array([([1, 1, 1], 1), ([2, 2, 2], 2), ([3, 3, 3], 3), ([4, 4, 4], 4)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])
arr[0]
([0, 0, 0], 0)
arr[0]['x']
array([0, 0, 0])
arr['x']
array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 3), ((3, 4), 5)], dtype=dtype)
data
array([((1., 2.), 3), ((3., 4.), 5)],
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])
data['x']
array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])
data['y']
array([3, 5], dtype=int32)
data['x']['a']
array([1., 3.])

6. 更多关于排序的内容

import numpy as np
# ndarray 的 sort 实例方法 与 python 内建的列表排序类似,是一种原位排序,而不生成新数组
arr = np.random.randn(6)
arr.sort()
arr
array([-1.20245647, -1.06934741,  0.23694375,  0.46847888,  1.33116886,
        2.07072179])
arr = np.random.randn(3, 5)
arr
array([[-0.1373051 ,  0.8347231 , -2.13610283,  0.6911535 ,  1.29073812],
       [-0.89613231,  0.40151617,  1.19168597, -0.3273313 ,  1.15674067],
       [-0.59111152, -0.13488416,  0.13590381,  0.07592941, -0.92518222]])
arr[:, 0].sort()
arr
array([[-0.89613231,  0.8347231 , -2.13610283,  0.6911535 ,  1.29073812],
       [-0.59111152,  0.40151617,  1.19168597, -0.3273313 ,  1.15674067],
       [-0.1373051 , -0.13488416,  0.13590381,  0.07592941, -0.92518222]])
# 与实例方法不同的是,numpy.sort 产生的是新的副本
arr = np.random.randn(5)
arr
array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966,  0.53958451])
np.sort(arr)
array([-2.1607857 , -1.74567649, -0.98139601, -0.93574966,  0.53958451])
arr
array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966,  0.53958451])
# 可以指定轴
arr = np.random.randn(3, 5)
arr
array([[-2.9614586 ,  0.43584634, -0.27190644,  0.03146461,  0.21746412],
       [ 0.4819753 , -1.1517702 ,  1.59403466, -0.51082439,  0.05183487],
       [ 0.6184096 , -1.34489717,  0.05997099,  1.23059888,  1.84840695]])
arr.sort(axis=1)
arr
array([[-2.9614586 , -0.27190644,  0.03146461,  0.21746412,  0.43584634],
       [-1.1517702 , -0.51082439,  0.05183487,  0.4819753 ,  1.59403466],
       [-1.34489717,  0.05997099,  0.6184096 ,  1.23059888,  1.84840695]])
# 逆序的方法
arr[:, ::-1]
array([[ 0.43584634,  0.21746412,  0.03146461, -0.27190644, -2.9614586 ],
       [ 1.59403466,  0.4819753 ,  0.05183487, -0.51082439, -1.1517702 ],
       [ 1.84840695,  1.23059888,  0.6184096 ,  0.05997099, -1.34489717]])

6.1 间接排序: argsort 和 lexsort

values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
array([1, 2, 4, 3, 0])
values[indexer]
array([0, 1, 2, 3, 5])
arr = np.random.randn(3, 5)
arr[0] = values
arr
array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [-1.27871978,  1.65385215,  1.04044587,  0.89253023,  0.12713788],
       [ 1.58960486,  1.06406754,  0.06449551,  0.17571087,  1.35782749]])
arr[:, indexer]
array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 1.65385215,  1.04044587,  0.12713788,  0.89253023, -1.27871978],
       [ 1.06406754,  0.06449551,  1.35782749,  0.17571087,  1.58960486]])
arr[:, arr[0].argsort()]
array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 1.65385215,  1.04044587,  0.12713788,  0.89253023, -1.27871978],
       [ 1.06406754,  0.06449551,  1.35782749,  0.17571087,  1.58960486]])

lexsort 与 argsort 类似,不过其对多键数组进行间接字典排序

first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name)) # 先执行first_name排序,再在其基础上执行last_name排序
sorter
array([1, 2, 3, 0, 4])
zip(last_name[sorter], first_name[sorter])
<zip at 0x7fd37101f1c0>
[x for x in zip(last_name[sorter], first_name[sorter])]
[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

6.2 其他排序算法

values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
array([2, 3, 4, 0, 1])
values.take(indexer)
array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')
indexer = key.argsort(kind='quicksort')
indexer
array([2, 3, 4, 0, 1])
indexer = key.argsort(kind='heapsort')
indexer
array([4, 2, 3, 1, 0])
values.take(indexer)
array(['1:third', '1:first', '1:second', '2:second', '2:first'],
      dtype='<U8')

6.3 数组的部分排序

# numpy.partition(arr, kth, axis),其中kth表示数组中第几小的数值(从0开始计数),然后让比这个数值小的数排在它之前,比它大的排在其后
np.random.seed(12345)
arr = np.random.randn(20)
arr
array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])
np.partition(arr, 3)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])
arr
array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])
np.sort(arr)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.43856974,
       -0.37184254, -0.20470766,  0.09290788,  0.22891288,  0.27499163,
        0.28174615,  0.47894334,  0.76902257,  0.88642934,  1.00718936,
        1.24643474,  1.35291684,  1.39340583,  1.66902531,  1.96578057])
np.partition(arr, 10)
array([-1.29622111, -0.43856974, -0.51943872, -0.5557303 , -0.37184254,
       -2.00163731, -0.20470766,  0.09290788,  0.22891288,  0.27499163,
        0.28174615,  1.00718936,  0.47894334,  0.76902257,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])
arr
array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])
indices = np.argpartition(arr, 3) # 取索引
indices
array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])
arr.take(indices)
array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])
arr.take(indices) == np.partition(arr, 3)
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

6.4 numpy.searchsorted:在已排序数组中寻找元素

arr = np.array([0, 8, 11, 8, 15])
arr.searchsorted(9)
2
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)
3
arr.searchsorted([0, 8, 11, 16])
array([0, 3, 3, 5])
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
array([0, 3])
arr.searchsorted([0, 1], side='right')
array([3, 7])
# 一个很典型的应用
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data
array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])
labels = bins.searchsorted(data)
labels
array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])
# 配合pandas的groupby,可以按区间(分箱)对数据进行分组聚合
import pandas as pd

pd.Series(data).groupby(labels).mean()
2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

7. 使用Numba编写快速Numpy函数

import numpy as np
# 现考虑一个python本身的函数,看看有多慢
def mean_distance(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    Written as an explicit index loop on purpose: it serves as the plain
    Python baseline whose speed is compared with the vectorized and
    Numba-compiled variants.
    """
    n = len(x)
    total = 0.0
    for idx in range(n):
        total += x[idx] - y[idx]
    # Exactly one term is accumulated per index, so the divisor is n.
    return total / n
x = np.random.randn(10000000)
y = np.random.randn(10000000)
%timeit mean_distance(x, y)
2.68 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit (x - y).mean()
26.9 ms ± 549 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 用numba.jit函数将上面的函数编译成Numba函数:
import numba as nb
numba_mean_distance = nb.jit(mean_distance)
# 等价方法
@nb.jit
def numba_mean_distance2(x, y):
    """Return the mean of ``x[i] - y[i]``; the same loop, compiled via ``nb.jit``."""
    n = len(x)
    total = 0.0
    for idx in range(n):
        total += x[idx] - y[idx]
    # Exactly one term is accumulated per index, so the divisor is n.
    return total / n
%timeit numba_mean_distance(x, y) # 甚至比numpy的函数还快
14.6 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

注意:Numba不能将任意的python代码编译成机器代码,但它支持纯python代码中一个重要的子集。

%timeit numba_mean_distance2(x, y)
16.3 ms ± 911 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Numba会将它不知道如何编译的函数调用替换为对CPython API的调用。Numba的jit函数有一个选项nopython=True,它将允许的代码限制为可以编译为LLVM的Python代码,而无须调用任何Python的C语言API。jit(nopython=True)有一个更短的别名numba.njit

from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def njit_mean_distance(x, y):
    """Return the mean of ``x - y`` using array operations inside ``njit``.

    The decorator declares the signature explicitly: two 1-D float64 arrays
    in, one float64 out.
    """
    diff = x - y
    return diff.mean()
%timeit njit_mean_distance(x, y)
36.9 ms ± 174 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

7.1 使用Numba创建自定义numpy.ufunc对象

from numba import vectorize

@vectorize
def nb_add(x, y):
    """Scalar addition kernel wrapped by ``numba.vectorize``."""
    result = x + y
    return result
x = np.arange(10)
nb_add(x, x)
array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])
nb_add.accumulate(x, axis=0) # 即,建立的一个编译好的Numpy ufunc,其行为与内建的numpy函数一样
array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

8. 高阶数组输入和输出

8.1 内存映射文件

内存映射文件是一种与磁盘上的二进制数据交互的方法,就像这些数据存储在内存数组中一样。它允许以小分段的方式对大型文件进行读取和写入,而无须将整个数组读入内存。

# 创建内存映射,使用np.memmap传入文件路径、dtype、shape和文件模式:
mmap = np.memmap('mymap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap
memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])
# memmap的切片返回的是硬盘上数据的视图,如果将数据赋值给切片,那么其将在内存中缓冲(类似一个python文件对象),可以调用flush写入硬盘。
section = mmap[:5]
section.shape
(5, 10000)
section[:] = np.random.randn(5, 10000)
section
memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        [-0.6104341 ,  0.64413784,  0.42810329, ...,  1.25154416,
          0.34818979,  0.80809682],
        [ 0.21072709,  0.09675299, -0.10433349, ...,  1.22574256,
         -0.20164288,  0.46595202]])
mmap
memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])
mmap.flush()
mmap
memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])
del mmap # 内存中删除
mmap
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-35-49f7bdb78b33> in <module>
----> 1 mmap


NameError: name 'mmap' is not defined

当内存映射超出作用域并被垃圾回收时,任何更改都将刷新到硬盘。由于硬盘上的文件只包含二进制数据,没有描述数据的元数据,打开时必须指定dtype和shape。其中dtype也可以是结构化或者嵌套的dtype。

mmap = np.memmap('mymap', dtype='float64', shape=(10000, 10000))
mmap
memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

8.2 HDF5 和其他数组存储选择

HDF(Hierarchical Data Format)分层数据格式,是一种可压缩的数组格式。

9. 性能技巧

  • 将python循环和条件逻辑转换为数组操作和布尔数组操作
  • 尽可能使用广播
  • 使用数组视图(切片)避免复制数据
  • 使用ufunc和ufunc方法

9.1 连续内存的重要性

根据程序访问的局部性原理,当按照数组原来的顺序(C或者F)访问,可以避免跨步,同时Cache的命中率更高。可以用ndarray的flags属性检查顺序。

arr_c = np.ones((5000, 5000), order='C')
arr_f = np.ones((5000, 5000), order='F')
arr_c.flags
  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False
arr_f.flags
  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False
arr_f.flags.f_contiguous
True
arr_f.flags.c_contiguous
False
arr_c.flags.c_contiguous
True
%timeit arr_c.sum(axis=1)
13.1 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit arr_c.sum(axis=0)
13.7 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit arr_f.sum(axis=0)
11.6 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit arr_f.sum(axis=1)
15.2 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 转换到所需的顺序,可以用转置,或者copy
arr_f.copy('C').flags
  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False
# 数组上构建视图,结果并不一定是连续的
arr_c[:50].flags.contiguous
True
arr_c[:50].flags.c_contiguous
True
arr_c[:50].flags.f_contiguous
False
arr_c[:, :50].flags
  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False
arr_c[:, :50].flags.contiguous
False

猜你喜欢

转载自blog.csdn.net/qq_26928055/article/details/125001522