A Simple ONNX Deployment Tutorial

ONNX

ONNX (Open Neural Network Exchange) is an open file format designed for machine learning and used to store trained models. It allows different AI frameworks (such as PyTorch and MXNet) to store model data in a common format and exchange it. The ONNX specification and code are developed jointly by Microsoft, Amazon, Facebook, IBM, and other companies, and are hosted as open source on GitHub. Deep learning frameworks that officially support loading and running ONNX models currently include Caffe2, PyTorch, MXNet, ML.NET, TensorRT, and Microsoft CNTK; TensorFlow also has unofficial ONNX support.

Environment Setup

  • PyTorch
  • onnxruntime / onnxruntime-gpu
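
For reference, the runtime packages can be installed with pip; install either onnxruntime (CPU) or onnxruntime-gpu (CUDA), since having both in the same environment tends to cause conflicts. This is only a minimal sketch of a typical setup:

pip install torch            # PyTorch
pip install onnxruntime      # CPU-only runtime
# or, on a CUDA-capable machine:
pip install onnxruntime-gpu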

Converting a PyTorch Model to ONNX

In PyTorch, a model is exported either by tracing or by scripting. This tutorial uses a model exported via tracing as its example. To export the model, we call the torch.onnx.export() function. This executes the model and records a trace of the operators used to compute the outputs. Because export runs the model, we need to provide an input tensor x; its values can be random as long as the type and shape are correct. Note that the input shape is fixed in the exported ONNX graph for all dimensions unless a dimension is declared as a dynamic axis. In this example we export the model with an input of batch_size 1 and then mark the first dimension as dynamic in the dynamic_axes argument of torch.onnx.export(). The exported model therefore accepts inputs of shape [batch_size, 1, 224, 224], where batch_size can vary.

import torch

# Input to the model; torch_model is the trained PyTorch model being exported
batch_size = 1
x = torch.randn(batch_size, 1, 224, 224, requires_grad=True)
torch_out = torch_model(x)

# Export the model
torch.onnx.export(torch_model,               # model being run
                  x,                         # model input (or a tuple for multiple inputs)
                  "super_resolution.onnx",   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=10,          # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names = ['input'],   # the model's input names
                  output_names = ['output'], # the model's output names
                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                'output' : {0 : 'batch_size'}})

We also computed torch_out, the model's output, which we will use to verify that the exported model computes the same values when run in ONNX Runtime.

But before verifying the model's output with ONNX Runtime, we check the ONNX model with ONNX's own API. First, onnx.load(onnx_path) loads the saved model and returns an onnx.ModelProto structure (the top-level file/container format for bundling an ML model; see the onnx.proto documentation for more information). Then onnx.checker.check_model(onnx_model) verifies the model's structure and confirms that the model has a valid schema. The validity of the ONNX graph is verified by checking the model's version, the graph's structure, and the nodes together with their inputs and outputs.

import onnx

onnx_model = onnx.load("super_resolution.onnx")
onnx.checker.check_model(onnx_model)

Now let us compute the output using ONNX Runtime's Python API. This part can normally be done in a separate process or on another machine, but we continue in the same process here so that we can verify that ONNX Runtime and PyTorch compute the same values for the network.

To run the model with ONNX Runtime, we need to create an inference session for the model with the chosen configuration parameters (here we use the default configuration). Once the session is created, we evaluate the model with the run() API. The output of this call is a list containing the model outputs computed by ONNX Runtime.

import numpy as np
import onnxruntime

ort_session = onnxruntime.InferenceSession("super_resolution.onnx")

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
ort_outs = ort_session.run(None, ort_inputs)

# compare ONNX Runtime and PyTorch results
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

print("Exported model has been tested with ONNXRuntime, and the result looks good!")

Here we take YOLOP as an example and convert the model:

import argparse

import onnx
import onnxsim
import onnxruntime as ort
import torch

# MCnet (the network definition) and YOLOP (the model config) come from the YOLOP repository.

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--height', type=int, default=640)  # height
    parser.add_argument('--width', type=int, default=640)  # width
    args = parser.parse_args()

    do_simplify = True

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = MCnet(YOLOP)
    checkpoint = torch.load('./weights/End-to-end.pth', map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    height = args.height
    width = args.width
    print("Load ./weights/End-to-end.pth done!")
    onnx_path = f'./weights/yolop-{height}-{width}.onnx'
    inputs = torch.randn(1, 3, height, width)

    print(f"Converting to {
      
      onnx_path}")
    torch.onnx.export(model, inputs, onnx_path,
                      verbose=False, opset_version=12, input_names=['images'],
                      output_names=['det_out', 'drive_area_seg', 'lane_line_seg'])
    print('convert', onnx_path, 'to onnx finish!!!')
    # Checks
    model_onnx = onnx.load(onnx_path)  # load onnx model
    onnx.checker.check_model(model_onnx)  # check onnx model
    print(onnx.helper.printable_graph(model_onnx.graph))  # print

    if do_simplify:
        print(f'simplifying with onnx-simplifier {onnxsim.__version__}...')
        model_onnx, check = onnxsim.simplify(model_onnx, check_n=3)
        assert check, 'assert check failed'
        onnx.save(model_onnx, onnx_path)

    x = inputs.cpu().numpy()
    try:
        sess = ort.InferenceSession(onnx_path)

        for ii in sess.get_inputs():
            print("Input: ", ii)
        for oo in sess.get_outputs():
            print("Output: ", oo)

        print('read onnx using onnxruntime success')
    except Exception as e:
        print('read failed')
        raise e

Model Inference with onnxruntime

import cv2
import numpy as np
import torch
import onnxruntime as ort

# resize_unscale (letterbox-style resize) and non_max_suppression are helper
# functions from the YOLOP repository.

def infer_yolop(weight="yolop-640-640.onnx",
                img_path="./inference/images/7dd9ef45-f197db95.jpg"):

    ort.set_default_logger_severity(4)
    onnx_path = f"./weights/{
      
      weight}"
    ort_session = ort.InferenceSession(onnx_path)
    print(f"Load {
      
      onnx_path} done!")

    outputs_info = ort_session.get_outputs()
    inputs_info = ort_session.get_inputs()

    for ii in inputs_info:
        print("Input: ", ii)
    for oo in outputs_info:
        print("Output: ", oo)

    print("num outputs: ", len(outputs_info))

    save_det_path = f"./pictures/detect_onnx.jpg"
    save_da_path = f"./pictures/da_onnx.jpg"
    save_ll_path = f"./pictures/ll_onnx.jpg"
    save_merge_path = f"./pictures/output_onnx.jpg"

    img_bgr = cv2.imread(img_path)
    height, width, _ = img_bgr.shape

    # convert to RGB
    img_rgb = img_bgr[:, :, ::-1].copy()

    # resize & normalize
    canvas, r, dw, dh, new_unpad_w, new_unpad_h = resize_unscale(img_rgb, (640, 640))

    img = canvas.copy().astype(np.float32)  # (640,640,3) RGB
    img /= 255.0
    img[:, :, 0] -= 0.485
    img[:, :, 1] -= 0.456
    img[:, :, 2] -= 0.406
    img[:, :, 0] /= 0.229
    img[:, :, 1] /= 0.224
    img[:, :, 2] /= 0.225

    img = img.transpose(2, 0, 1)

    img = np.expand_dims(img, 0)  # (1, 3,640,640)

    # inference: (1,n,6) (1,2,640,640) (1,2,640,640)
    det_out, da_seg_out, ll_seg_out = ort_session.run(
        ['det_out', 'drive_area_seg', 'lane_line_seg'],
        input_feed={"images": img}
    )

    det_out = torch.from_numpy(det_out).float()
    boxes = non_max_suppression(det_out)[0]  # [n,6] [x1,y1,x2,y2,conf,cls]
    boxes = boxes.cpu().numpy().astype(np.float32)

    if boxes.shape[0] == 0:
        print("no bounding boxes detected.")
        return

    # scale coords to original size.
    boxes[:, 0] -= dw
    boxes[:, 1] -= dh
    boxes[:, 2] -= dw
    boxes[:, 3] -= dh
    boxes[:, :4] /= r

    print(f"detect {
      
      boxes.shape[0]} bounding boxes.")

    img_det = img_rgb[:, :, ::-1].copy()
    for i in range(boxes.shape[0]):
        x1, y1, x2, y2, conf, label = boxes[i]
        x1, y1, x2, y2, label = int(x1), int(y1), int(x2), int(y2), int(label)
        img_det = cv2.rectangle(img_det, (x1, y1), (x2, y2), (0, 255, 0), 2, 2)

    cv2.imwrite(save_det_path, img_det)

    # select da & ll segment area.
    da_seg_out = da_seg_out[:, :, dh:dh + new_unpad_h, dw:dw + new_unpad_w]
    ll_seg_out = ll_seg_out[:, :, dh:dh + new_unpad_h, dw:dw + new_unpad_w]

    da_seg_mask = np.argmax(da_seg_out, axis=1)[0]  # (?,?) (0|1)
    ll_seg_mask = np.argmax(ll_seg_out, axis=1)[0]  # (?,?) (0|1)
    print(da_seg_mask.shape)
    print(ll_seg_mask.shape)

    color_area = np.zeros((new_unpad_h, new_unpad_w, 3), dtype=np.uint8)
    color_area[da_seg_mask == 1] = [0, 255, 0]
    color_area[ll_seg_mask == 1] = [255, 0, 0]
    color_seg = color_area

    # convert to BGR
    color_seg = color_seg[..., ::-1]
    color_mask = np.mean(color_seg, 2)
    img_merge = canvas[dh:dh + new_unpad_h, dw:dw + new_unpad_w, :]
    img_merge = img_merge[:, :, ::-1]

    # merge: resize to original size
    img_merge[color_mask != 0] = \
        img_merge[color_mask != 0] * 0.5 + color_seg[color_mask != 0] * 0.5
    img_merge = img_merge.astype(np.uint8)
    img_merge = cv2.resize(img_merge, (width, height),
                           interpolation=cv2.INTER_LINEAR)
    for i in range(boxes.shape[0]):
        x1, y1, x2, y2, conf, label = boxes[i]
        x1, y1, x2, y2, label = int(x1), int(y1), int(x2), int(y2), int(label)
        img_merge = cv2.rectangle(img_merge, (x1, y1), (x2, y2), (0, 255, 0), 2, 2)

    # da: resize to original size
    da_seg_mask = da_seg_mask * 255
    da_seg_mask = da_seg_mask.astype(np.uint8)
    da_seg_mask = cv2.resize(da_seg_mask, (width, height),
                             interpolation=cv2.INTER_LINEAR)

    # ll: resize to original size
    ll_seg_mask = ll_seg_mask * 255
    ll_seg_mask = ll_seg_mask.astype(np.uint8)
    ll_seg_mask = cv2.resize(ll_seg_mask, (width, height),
                             interpolation=cv2.INTER_LINEAR)

    cv2.imwrite(save_merge_path, img_merge)
    cv2.imwrite(save_da_path, da_seg_mask)
    cv2.imwrite(save_ll_path, ll_seg_mask)

    print("detect done.")

Accelerating Inference with onnxruntime-gpu

Replace the InferenceSession creation in the code above with the following:

ort_session = ort.InferenceSession(onnx_path, providers=ort.get_available_providers())
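
If onnxruntime-gpu is installed, you can also request the CUDA provider explicitly and then check which provider the session actually selected; this is a short sketch that falls back to the CPU provider when CUDA is unavailable:

import onnxruntime as ort

# Prefer CUDA and fall back to CPU when the GPU provider is not available.
ort_session = ort.InferenceSession(
    onnx_path,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

print(ort.get_available_providers())  # providers available in this build
print(ort_session.get_providers())    # providers the session actually selected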

References

https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html

https://blog.csdn.net/weixin_42990464/article/details/126384425