1. ndarray 对象内幕

import numpy as np

np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

strides 反映了在不同轴上遍历的步长，即沿该轴前进一个元素需要跨过的字节数，可见每个np.float64的长度是8Byte=64bit。跨度大的轴的计算代价更高。

np.ones((3, 4, 5), dtype=np.float32).strides

(80, 20, 4)

np.ones((3, 4, 5), dtype=np.uint16).strides

(40, 10, 2)

np.ones((3, 4, 5), dtype=np.float32).strides

(80, 20, 4)

np.ones((3, 4, 5), dtype=np.integer).strides

(160, 40, 8)

1.1 Numpy dtype 层次结构

ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)

ints.dtype

dtype('uint16')

np.issubdtype(ints.dtype, np.integer)

True

np.issubdtype(ints.dtype, np.floating)

False

np.issubdtype(floats.dtype, np.floating)

True

np.issubdtype('float', np.floating)

True

# 查看父类
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

np.uint16.mro()

[numpy.uint16,
 numpy.unsignedinteger,
 numpy.integer,
 numpy.number,
 numpy.generic,
 object]

np.issubdtype(np.float64, float)

True

np.issubdtype(np.float32, float)

False

np.float32.mro()

[numpy.float32,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 object]

2. 高阶数组操作

2.1 重塑数组

arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

# C-order 按照行方向重塑
# F-order 按照列方向重塑
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

arr.reshape((4, 2), order='F')

array([[0, 4],
       [1, 5],
       [2, 6],
       [3, 7]])

arr.reshape((-1, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

# 扁平化：flatten()，总是返回数据的副本，修改结果不会影响原数组
# 分散化：ravel()，尽可能返回视图而不复制数据（无法返回视图时才会复制），因此修改结果可能直接改变原数组的value
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

arr.flatten()[2]=99
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

arr.ravel()[2] = 99

arr

array([[ 0,  1, 99],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

arr.flatten(order='F')

array([ 0,  3,  6,  9, 12,  1,  4,  7, 10, 13, 99,  5,  8, 11, 14])

2.2 C顺序和Fortran顺序

# C-order 从数组shape属性中末尾的轴开始往前遍历，即，先遍历高索引位的轴
# F-order 从数组shape属性中开头的轴开始往后遍历，即，先遍历低索引位的轴
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

arr.ravel(order='f')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

2.3 连接和分割数组

arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])

# axis指定在哪个轴进行拼接
np.concatenate([arr1, arr2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

arr3 = np.array([1, 2, 3])
arr4 = np.array([4, 5, 6])

np.column_stack((arr3, arr4)) # 会自动将1D数组转为2D

array([[1, 4],
       [2, 5],
       [3, 6]])

np.column_stack((arr3[None, :], arr4[None, :]))

array([[1, 2, 3, 4, 5, 6]])

np.hstack((arr3, arr4)) # 不会将1D数组转为2D

array([1, 2, 3, 4, 5, 6])

np.row_stack((arr3, arr4))

array([[1, 2, 3],
       [4, 5, 6]])

np.vstack((arr3, arr4))

array([[1, 2, 3],
       [4, 5, 6]])

# split 切割/分割数组
arr = np.random.randn(5, 4)
print(arr)
first, second, third = np.split(arr, [1, 3], axis=1)
first, second, third

[[ 0.84435272 -1.41113975  0.15211615 -1.09964343]
 [-0.05692753  1.54980138  0.84249968 -0.2415958 ]
 [ 1.28897645 -0.00376694 -0.09085957  0.38944626]
 [ 1.08790741  0.13027303 -0.82255024 -0.42911225]
 [-1.70847678 -0.84645461 -0.2277294  -0.61552024]]





(array([[ 0.84435272],
        [-0.05692753],
        [ 1.28897645],
        [ 1.08790741],
        [-1.70847678]]),
 array([[-1.41113975,  0.15211615],
        [ 1.54980138,  0.84249968],
        [-0.00376694, -0.09085957],
        [ 0.13027303, -0.82255024],
        [-0.84645461, -0.2277294 ]]),
 array([[-1.09964343],
        [-0.2415958 ],
        [ 0.38944626],
        [-0.42911225],
        [-0.61552024]]))

first, second, third = np.hsplit(arr, [1, 3])
first, second, third

(array([[ 0.84435272],
        [-0.05692753],
        [ 1.28897645],
        [ 1.08790741],
        [-1.70847678]]),
 array([[-1.41113975,  0.15211615],
        [ 1.54980138,  0.84249968],
        [-0.00376694, -0.09085957],
        [ 0.13027303, -0.82255024],
        [-0.84645461, -0.2277294 ]]),
 array([[-1.09964343],
        [-0.2415958 ],
        [ 0.38944626],
        [-0.42911225],
        [-0.61552024]]))

first, second, third = np.vsplit(arr, [1, 3])
first, second, third

(array([[ 0.84435272, -1.41113975,  0.15211615, -1.09964343]]),
 array([[-0.05692753,  1.54980138,  0.84249968, -0.2415958 ],
        [ 1.28897645, -0.00376694, -0.09085957,  0.38944626]]),
 array([[ 1.08790741,  0.13027303, -0.82255024, -0.42911225],
        [-1.70847678, -0.84645461, -0.2277294 , -0.61552024]]))

2.3.1 堆叠助手：r_和c_

arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)

np.row_stack([arr1, arr2])

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.70333255,  0.26275106],
       [ 1.99202938, -1.46490714],
       [ 0.73384258,  0.43515298]])

np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.70333255,  0.26275106],
       [ 1.99202938, -1.46490714],
       [ 0.73384258,  0.43515298]])

np.column_stack([np.vstack([arr1, arr2]), arr])

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.70333255,  0.26275106,  3.        ],
       [ 1.99202938, -1.46490714,  4.        ],
       [ 0.73384258,  0.43515298,  5.        ]])

np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.70333255,  0.26275106,  3.        ],
       [ 1.99202938, -1.46490714,  4.        ],
       [ 0.73384258,  0.43515298,  5.        ]])

np.column_stack([1:6, -10:-5])

  File "<ipython-input-81-d3ac66c3723b>", line 1
    np.column_stack([1:6, -10:-5])
                      ^
SyntaxError: invalid syntax

np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

2.4 重复元素：tile 和 repeat

# repeat 元素重复
arr = np.arange(3)
arr

array([0, 1, 2])

arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

arr.repeat([2, 1, 4])

array([0, 0, 1, 2, 2, 2, 2])

arr = np.random.randn(2, 3)
arr

array([[ 0.16201186, -0.76919668, -0.71688664],
       [-0.70057032, -0.96810267, -1.61212582]])

arr.repeat([2, 1, 4, 2, 2, 2]) # 会扁平化

array([ 0.16201186,  0.16201186, -0.76919668, -0.71688664, -0.71688664,
       -0.71688664, -0.71688664, -0.70057032, -0.70057032, -0.96810267,
       -0.96810267, -1.61212582, -1.61212582])

arr = np.random.randn(2, 2)
arr

array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

arr.repeat(2) # 如果不指定轴，会造成扁平化

array([ 1.28555077,  1.28555077,  0.33821435,  0.33821435, -0.51810958,
       -0.51810958, -1.69789573, -1.69789573])

arr.repeat(2, axis=0)

array([[ 1.28555077,  0.33821435],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573],
       [-0.51810958, -1.69789573]])

arr.repeat(2, axis=1)

array([[ 1.28555077,  1.28555077,  0.33821435,  0.33821435],
       [-0.51810958, -0.51810958, -1.69789573, -1.69789573]])

arr.repeat([2, 1], axis=0)

array([[ 1.28555077,  0.33821435],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

arr.repeat([2, 3], axis=1)

array([[ 1.28555077,  1.28555077,  0.33821435,  0.33821435,  0.33821435],
       [-0.51810958, -0.51810958, -1.69789573, -1.69789573, -1.69789573]])

# tile 是直接整体数组重复
arr

array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

np.tile(arr, 2)

array([[ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573]])

np.tile(arr, [2, 1])

array([[ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435],
       [-0.51810958, -1.69789573]])

np.tile(arr, (3, 2))

array([[ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573],
       [ 1.28555077,  0.33821435,  1.28555077,  0.33821435],
       [-0.51810958, -1.69789573, -0.51810958, -1.69789573]])

2.5 神奇索引的等价方法： take 和 put

arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

arr.take(inds)

array([700, 100, 200, 600])

arr.put(inds, 22)

arr

array([  0,  22,  22, 300, 400, 500,  22,  22, 800, 900])

arr.put(inds, [1, 2, 3, 4])
arr

array([  0,   2,   3, 300, 400, 500,   4,   1, 800, 900])

arr = np.random.randn(2, 4)
inds = [2, 0, 2, 1]
arr

array([[-0.49565355,  0.78522712, -0.06629777, -1.00791514],
       [ 0.39132436, -0.52828662, -0.82480479,  0.49250005]])

arr.take(inds, axis=1)

array([[-0.06629777, -0.49565355, -0.06629777,  0.78522712],
       [-0.82480479,  0.39132436, -0.82480479, -0.52828662]])

注意：put不接受axis参数

3. 广播 broadcast

首先，数组与标量间的运算其实就是一种先广播、后逐元素（element-wise）的运算。

import numpy as np

arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

arr * 4

array([ 0,  4,  8, 12, 16])

广播的规则是：从末尾维度开始逐轴比较，每一对轴长要么相等，要么其中一个是1（或者该轴缺失），广播会在缺失的轴或轴长为1的轴上进行。比如(4, 3) + (3,)，后面的数组就符合末尾轴长相等，会在缺失的轴0上进行广播；或者，广播在轴长为1的轴上进行，比如(4, 3) + (1, 3)，会将轴0从1广播为4。对于数组和标量的运算，其实也是利用了广播，比如(4, 3) + scalar，其中scalar的shape严格来说是()，但效果上可以认为是(1,)，则末尾长度为1，广播时，末尾的维度广播成3，缺失的轴广播成4。

广播可在两个数组中都进行，比如(4, 4) + (4, 1, 4)，首先末尾轴长一致，其次不一致的轴长其中一个为1，那么1广播成4，另外缺失轴广播为4.

基于这种规则，有时候想计算(4, 3) + (4, 1),而实际上后者为(4,)的时候，由于末尾轴长不是1，而且3与4也不匹配，因此不能够广播，必须通过reshape，或者[:, None]的方式增加坐标轴，或者利用np.newaxis

因此，其实抓住两个数组的末尾轴长是关键，一看轴长既不是1，也不一致，那么别想广播了，看看怎么写循环操作吧。

arr = np.random.randn(4, 3)
arr.mean(0)

array([ 0.27783846,  0.36009253, -0.1499029 ])

demeaned = arr - arr.mean(0)
demeaned

array([[-0.79969385, -1.6011334 , -0.00747013],
       [-0.0381061 ,  0.64865496, -0.97992594],
       [ 1.13694786,  0.81091045,  0.73967573],
       [-0.29914791,  0.14156799,  0.24772034]])

arr.shape, arr.mean(0).shape

((4, 3), (3,))

aaa = np.array([1])
aaa.shape

(1,)

ans = arr - aaa
arr - ans

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

arr.shape, aaa.shape

((4, 3), (1,))

arr.shape

(4, 3)

arr.mean(1).shape

(4,)

arr - arr.mean(1)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-18-8b8ada26fac0> in <module>
----> 1 arr - arr.mean(1)


ValueError: operands could not be broadcast together with shapes (4,3) (4,)

arr - arr.mean(1).reshape(-1, 1)

array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

arr - arr.mean(1)[:, None]

array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

arr - arr.mean(1)[:, np.newaxis]

array([[ 0.11823437, -0.6009511 ,  0.48271673],
       [ 0.20018202,  0.96919716, -1.16937918],
       [ 0.35626561,  0.11248227, -0.46874788],
       [-0.21403229,  0.30893769, -0.0949054 ]])

一个三维例子：

arr = np.ones((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr.shape, arr_3d.shape

((4, 4), (4, 1, 4))

arr + arr_3d

array([[[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]],

       [[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]]])

一个常用的模式，比如，减掉/除掉某个轴的求和/方差/均值之类的：

arr = np.random.randn(3, 4, 5)
# 假如要减掉轴1上的均值
means = arr.mean(1)
means

array([[-0.95808939, -0.59395877,  0.44605451,  0.06325242,  0.14369531],
       [ 0.2600657 , -0.92595688, -0.75528343, -0.2486933 , -0.02936524],
       [-0.22052564,  0.14549496, -0.67660057, -0.10151047,  0.26275483]])

arr.shape, means.shape

((3, 4, 5), (3, 5))

demeaned = arr - means[:, np.newaxis, :]

demeaned.mean(1) < 1e-16

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

# 可以将其写为一个函数
def demean_axis(arr, axis=0):
    """Subtract from ``arr`` its mean taken along ``axis``.

    Parameters
    ----------
    arr : ndarray
        Input array.
    axis : int, optional
        Axis along which the mean is computed and removed (default 0).

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis``
        is (numerically) zero.
    """
    means = arr.mean(axis)
    # Full slices everywhere except at ``axis``, where a length-1 axis is
    # re-inserted so that ``means`` broadcasts against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Index with a tuple: passing the list itself is the deprecated
    # "non-tuple sequence" form, which newer NumPy treats as an array index.
    return arr - means[tuple(indexer)]

arr = np.random.randn(3, 4, 5)
demeaned = demean_axis(arr, axis=1)
demeaned.mean(1) < 1e-16

<ipython-input-45-8051ed80feee>:6: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return arr - means[(indexer)]





array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])

数组赋值其实也用到了广播：

arr = np.zeros((4, 3))
col = np.array([1.28, 0, 33, 0.5])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [ 0.  ,  0.  ,  0.  ],
       [33.  , 33.  , 33.  ],
       [ 0.5 ,  0.5 ,  0.5 ]])

arr[:2] = [[2], [3]]
arr

array([[ 2. ,  2. ,  2. ],
       [ 3. ,  3. ,  3. ],
       [33. , 33. , 33. ],
       [ 0.5,  0.5,  0.5]])

4. 高阶 ufunc 用法

4.1 ufunc 实例方法

有点像pandas中，agg这样的函数

# reduce 沿某个轴连续使用二元ufunc（比如add，multiply）把数组聚合起来：reduce先确定好其函数内数组的形状和哪些元素将要进行ufunc运算，然后做ufunc运算

arr = np.arange(10)
np.add.reduce(arr)

arr.sum()

arr = np.arange(1, 5)
np.multiply.reduce(arr)

np.prod(arr)

arr.prod()

np.random.seed(12346)
arr = np.random.randn(5, 5)
arr[::2]

array([[-8.99822478e-02,  7.59372617e-01,  7.48336101e-01,
        -9.81497953e-01,  3.65775545e-01],
       [ 2.48256116e-01, -3.21536673e-01, -8.48730755e-01,
         4.60468309e-04, -5.46459347e-01],
       [-6.49092950e-01, -4.79535727e-01, -9.53521432e-01,
         1.42253882e+00,  1.75403128e-01]])

arr[::2].sort(1)

arr[::2]

array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])

arr

array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-3.15442628e-01, -8.66135605e-01,  2.78568155e-02,
        -4.55597723e-01, -1.60189223e+00],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [ 2.53915229e-01,  1.93684246e+00, -7.99504902e-01,
        -5.69159281e-01,  4.89244731e-02],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])

np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

np.logical_and.accumulate(arr[:, :-1] < arr[:, 1:], axis=1)

array([[ True,  True,  True,  True],
       [False, False, False, False],
       [ True,  True,  True,  True],
       [ True, False, False, False],
       [ True,  True,  True,  True]])

arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])

np.all(arr[:, :-1] < arr[:, 1:], axis=1)

array([ True, False,  True, False,  True])

np.logical_and.reduce(arr[:, [0]] < arr[:, [1]], axis=1)

array([ True, False,  True,  True,  True])

arr[:, [0]] < arr[:, [1]]

array([[ True],
       [False],
       [ True],
       [ True],
       [ True]])

# 类似于sum与cumsum的关系，与reduce对应的是accumulate（保留所有中间结果）
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

# outer返回一个类似“外积”只不过不是计算积，而是进行ufunc运算
arr = np.arange(3).repeat([1, 2, 2])
arr

array([0, 1, 1, 2, 2])

np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

np.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

np.logical_and.outer(arr, np.arange(5))

array([[False, False, False, False, False],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True],
       [False,  True,  True,  True,  True]])

# outer 的维度为两个数组shape直接拼接
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)

result.shape

(3, 4, 5)

# reduceat 相当于groupby
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

4.2 使用python编写新的ufunc方法

# 利用 numpy.vectorize
def add_elements(x, y):
    """Return the sum of ``x`` and ``y`` (the scalar kernel wrapped by ``np.vectorize``)."""
    total = x + y
    return total

add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

add_them(np.random.randn(2, 2), np.random.randn(2, 2))

array([[-1.33080793, -1.43407981],
       [ 0.15584993,  1.0519004 ]])

arr = np.random.randn(10000)
%timeit add_them(arr, arr)

1.18 ms ± 15.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

arr = np.random.randn(10000)
%timeit np.add(arr, arr)

2.53 µs ± 43.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

5. 结构化和记录数组

import numpy as np

# 利用(field_name, field_data_type)作为dtype的元组列表
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5       ,  6), (3.14159265, -2)],
      dtype=[('x', '<f8'), ('y', '<i4')])

sarr[0]

(1.5, 6)

sarr['x']

array([1.5       , 3.14159265])

sarr['y']

array([ 6, -2], dtype=int32)

sarr['x'].dtype.name

'float64'

sarr.dtype.name

'void96'

sarr.dtype.names

('x', 'y')

sarr.dtype

dtype([('x', '<f8'), ('y', '<i4')])

sarr[0]['y']

5.1 嵌套dtype和多维字段

# 可以向dtype多传递一个形状，用于指定作用次数
dtype = [('x', np.int64, 3), ('y', np.int32)]

arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

np.array([1, 2, 3, 4], dtype=dtype)

array([([1, 1, 1], 1), ([2, 2, 2], 2), ([3, 3, 3], 3), ([4, 4, 4], 4)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

arr[0]

([0, 0, 0], 0)

arr[0]['x']

array([0, 0, 0])

arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 3), ((3, 4), 5)], dtype=dtype)
data

array([((1., 2.), 3), ((3., 4.), 5)],
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

data['y']

array([3, 5], dtype=int32)

data['x']['a']

array([1., 3.])

6. 更多关于排序的内容

import numpy as np

# ndarray 的 sort 实例方法 与 python 内建的列表排序类似，是一种原位排序，而不生成新数组
arr = np.random.randn(6)
arr.sort()
arr

array([-1.20245647, -1.06934741,  0.23694375,  0.46847888,  1.33116886,
        2.07072179])

arr = np.random.randn(3, 5)
arr

array([[-0.1373051 ,  0.8347231 , -2.13610283,  0.6911535 ,  1.29073812],
       [-0.89613231,  0.40151617,  1.19168597, -0.3273313 ,  1.15674067],
       [-0.59111152, -0.13488416,  0.13590381,  0.07592941, -0.92518222]])

arr[:, 0].sort()

arr

array([[-0.89613231,  0.8347231 , -2.13610283,  0.6911535 ,  1.29073812],
       [-0.59111152,  0.40151617,  1.19168597, -0.3273313 ,  1.15674067],
       [-0.1373051 , -0.13488416,  0.13590381,  0.07592941, -0.92518222]])

# 与实例方法不同的是，numpy.sort 产生的是新的副本
arr = np.random.randn(5)
arr

array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966,  0.53958451])

np.sort(arr)

array([-2.1607857 , -1.74567649, -0.98139601, -0.93574966,  0.53958451])

arr

array([-2.1607857 , -0.98139601, -1.74567649, -0.93574966,  0.53958451])

# 可以指定轴
arr = np.random.randn(3, 5)
arr

array([[-2.9614586 ,  0.43584634, -0.27190644,  0.03146461,  0.21746412],
       [ 0.4819753 , -1.1517702 ,  1.59403466, -0.51082439,  0.05183487],
       [ 0.6184096 , -1.34489717,  0.05997099,  1.23059888,  1.84840695]])

arr.sort(axis=1)

arr

array([[-2.9614586 , -0.27190644,  0.03146461,  0.21746412,  0.43584634],
       [-1.1517702 , -0.51082439,  0.05183487,  0.4819753 ,  1.59403466],
       [-1.34489717,  0.05997099,  0.6184096 ,  1.23059888,  1.84840695]])

# 逆序的方法
arr[:, ::-1]

array([[ 0.43584634,  0.21746412,  0.03146461, -0.27190644, -2.9614586 ],
       [ 1.59403466,  0.4819753 ,  0.05183487, -0.51082439, -1.1517702 ],
       [ 1.84840695,  1.23059888,  0.6184096 ,  0.05997099, -1.34489717]])

6.1 间接排序： argsort 和 lexsort

values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0])

values[indexer]

array([0, 1, 2, 3, 5])

arr = np.random.randn(3, 5)
arr[0] = values

arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [-1.27871978,  1.65385215,  1.04044587,  0.89253023,  0.12713788],
       [ 1.58960486,  1.06406754,  0.06449551,  0.17571087,  1.35782749]])

arr[:, indexer]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 1.65385215,  1.04044587,  0.12713788,  0.89253023, -1.27871978],
       [ 1.06406754,  0.06449551,  1.35782749,  0.17571087,  1.58960486]])

arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 1.65385215,  1.04044587,  0.12713788,  0.89253023, -1.27871978],
       [ 1.06406754,  0.06449551,  1.35782749,  0.17571087,  1.58960486]])

lexsort 与 argsort 类似，不过其对多键数组进行间接字典排序

first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name)) # 最后一个键(last_name)是主排序键，first_name是次排序键；等价于先执行first_name排序，再在其基础上执行last_name的稳定排序
sorter

array([1, 2, 3, 0, 4])

zip(last_name[sorter], first_name[sorter])

<zip at 0x7fd37101f1c0>

[x for x in zip(last_name[sorter], first_name[sorter])]

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

6.2 其他排序算法

values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer

array([2, 3, 4, 0, 1])

values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

indexer = key.argsort(kind='quicksort')
indexer

array([2, 3, 4, 0, 1])

indexer = key.argsort(kind='heapsort')
indexer

array([4, 2, 3, 1, 0])

values.take(indexer)

array(['1:third', '1:first', '1:second', '2:second', '2:first'],
      dtype='<U8')

6.3 数组的部分排序

# numpy.partition(arr, kth, axis)，其中kth表示排序后的位置索引（从0开始），即数组中第kth+1小的数值会被放到该位置，然后让比这个数值小的数排在它之前，比它大的排在其后（两侧内部的顺序不保证）
np.random.seed(12345)
arr = np.random.randn(20)
arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

np.partition(arr, 3)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

np.sort(arr)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.43856974,
       -0.37184254, -0.20470766,  0.09290788,  0.22891288,  0.27499163,
        0.28174615,  0.47894334,  0.76902257,  0.88642934,  1.00718936,
        1.24643474,  1.35291684,  1.39340583,  1.66902531,  1.96578057])

np.partition(arr, 10)

array([-1.29622111, -0.43856974, -0.51943872, -0.5557303 , -0.37184254,
       -2.00163731, -0.20470766,  0.09290788,  0.22891288,  0.27499163,
        0.28174615,  1.00718936,  0.47894334,  0.76902257,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

indices = np.argpartition(arr, 3) # 取索引
indices

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])

arr.take(indices)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

arr.take(indices) == np.partition(arr, 3)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

6.4 numpy.searchsorted：在已排序数组中寻找元素

arr = np.array([0, 8, 11, 8, 15])
arr.searchsorted(9)

arr = np.array([0, 1, 7, 12, 15])

arr.searchsorted(9)

arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])

array([0, 3])

arr.searchsorted([0, 1], side='right')

array([3, 7])

# 一个很典型的应用
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

labels = bins.searchsorted(data)

labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])

# 配合pandas的groupby，可以实现按区间分组（分箱）统计
import pandas as pd

pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

7. 使用Numba编写快速Numpy函数

import numpy as np

# 先考虑一个纯python实现的函数，看看有多慢
def mean_distance(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    Deliberately written as an explicit index loop (no vectorisation) so it
    can serve as the slow pure-Python baseline in the timings.

    Parameters
    ----------
    x, y : indexable sequences
        ``y`` is indexed at every position ``0 .. len(x) - 1``.

    Returns
    -------
    float
        Sum of the differences divided by ``len(x)``.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty.
    """
    n = len(x)
    total = 0.0
    i = 0
    while i < n:
        total += x[i] - y[i]
        i += 1
    # The number of accumulated terms is exactly ``n``.
    return total / n

x = np.random.randn(10000000)
y = np.random.randn(10000000)
%timeit mean_distance(x, y)

2.68 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%timeit (x - y).mean()

26.9 ms ± 549 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# 用numba.jit函数将上面的函数编译成Numba函数：
import numba as nb
numba_mean_distance = nb.jit(mean_distance)

# 等价方法
@nb.jit
def numba_mean_distance2(x, y):
    """Numba-compiled mean of the element-wise differences ``x[i] - y[i]``.

    Same algorithm as the pure-Python baseline, but declared with the
    ``@nb.jit`` decorator instead of calling ``nb.jit`` on an existing
    function.
    """
    n = len(x)
    total = 0.0
    i = 0
    while i < n:
        total += x[i] - y[i]
        i += 1
    # The number of accumulated terms is exactly ``n``.
    return total / n

%timeit numba_mean_distance(x, y) # 甚至比numpy的函数还快

14.6 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

注意：Numba不能将任意的python代码编译成机器代码，但它支持纯python代码中一个重要的子集。

%timeit numba_mean_distance2(x, y)

16.3 ms ± 911 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Numba将不知道如何编译的函数的调用替换为CPython API调用。Numba的jit函数有一个选项，nopython=True，将允许的代码限制为可以编译为LLVM的Python代码，而无须调用任何Python的C语言API。jit(nopython=True)有一个简短的别名numba.njit。

from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def njit_mean_distance(x, y):
    """Return the mean of ``x - y``, compiled with ``njit`` for the declared signature."""
    diff = x - y
    return diff.mean()

%timeit njit_mean_distance(x, y)

36.9 ms ± 174 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

7.1 使用Numba创建自定义numpy.ufunc对象

from numba import vectorize

@vectorize
def nb_add(x, y):
    """Add ``x`` and ``y``; ``@vectorize`` wraps this scalar kernel as a ufunc."""
    total = x + y
    return total

x = np.arange(10)
nb_add(x, x)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

nb_add.accumulate(x, axis=0) # 即，建立的一个编译好的Numpy ufunc，其行为与内建的numpy函数一样

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

8. 高阶数组输入和输出

8.1 内存映射文件

内存映射文件是一种与磁盘上的二进制数据交互的方法，就像它是存储在内存数组中一样。它允许把大型文件分成小段进行读取和写入，而无须将整个数组载入内存。

# 创建内存映射，使用np.memmap传入文件路径、dtype、shape和文件模式：
mmap = np.memmap('mymap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

# memmap的切片返回的是硬盘上数据的视图，如果将数据赋值给切片，那么其将在内存中缓冲（类似一个python文件对象），可以调用flush写入硬盘。
section = mmap[:5]

section.shape

(5, 10000)

section[:] = np.random.randn(5, 10000)
section

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        [-0.6104341 ,  0.64413784,  0.42810329, ...,  1.25154416,
          0.34818979,  0.80809682],
        [ 0.21072709,  0.09675299, -0.10433349, ...,  1.22574256,
         -0.20164288,  0.46595202]])

mmap

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

mmap.flush()

mmap

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

del mmap # 内存中删除

mmap

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-35-49f7bdb78b33> in <module>
----> 1 mmap


NameError: name 'mmap' is not defined

当内存映射超出作用域并被垃圾回收时，任何更改都将刷新到硬盘。由于硬盘中只有二进制数据，没有描述dtype和shape的元数据，打开时必须指定dtype和shape。其中dtype也适用于结构化或者嵌套的dtype。

mmap = np.memmap('mymap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[-1.4264226 ,  0.21729148,  1.60461715, ...,  0.3102347 ,
          0.17720547,  1.69646377],
        [-1.20953714, -2.7361618 , -0.23058431, ...,  0.33713541,
          0.67793013,  0.60138858],
        [ 0.71859367, -0.34768919,  1.33271115, ...,  0.32399778,
          1.03741373, -0.65384645],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

8.2 HDF5 和其他数组存储选择

HDF（Hierarchical Data Format）分层数据格式，是一种可压缩的数组格式。

9. 性能技巧

将python循环和条件逻辑转换为数组操作和布尔数组操作
尽可能使用广播
使用数组视图（切片）避免复制数据
使用ufunc和ufunc方法

9.1 连续内存的重要性

根据程序访问的局部性原理，当按照数组原来的顺序（C或者F）访问，可以避免跨步，同时Cache的命中率更高。可以用ndarray的flags属性检查顺序。

arr_c = np.ones((5000, 5000), order='C')
arr_f = np.ones((5000, 5000), order='F')
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

arr_f.flags.f_contiguous

True

arr_f.flags.c_contiguous

False

arr_c.flags.c_contiguous

True

%timeit arr_c.sum(axis=1)

13.1 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit arr_c.sum(axis=0)

13.7 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit arr_f.sum(axis=0)

11.6 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit arr_f.sum(axis=1)

15.2 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

# 转换到所需的顺序，可以用转置，或者copy
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

# 数组上构建视图，结果并不一定是连续的
arr_c[:50].flags.contiguous

True

arr_c[:50].flags.c_contiguous

True

arr_c[:50].flags.f_contiguous

False

arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

arr_c[:, :50].flags.contiguous

False

Numpy cheatsheet