Sophgo BM1684X: deploying a handwritten digit recognition model

General process ---------------------------------------------------------------

Refer to the "sophon-sail_zh" documentation.

Porting steps ---------------------------------------------------------------

1. First build your own network model and export it to ONNX format. For details, see:

GitHub - warren-wzw/MNIST-pytorch

2. Use the tpu-mlir tool to convert the ONNX model into bmodel format. For details, see https://kdocs.cn/l/cdwzqT3Hbyje; a typical invocation is sketched below.
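A typical conversion is sketched here (the exact flags and file names are assumptions based on this MNIST model; model_transform.py and model_deploy.py are the tpu-mlir tools):

model_transform.py --model_name mnist --model_def ./mnist.onnx --input_shapes [[1,1,28,28]] --mlir mnist.mlir
model_deploy.py --mlir mnist.mlir --quantize F32 --chip bm1684x --model test_output_fp32_1b.bmodel
model_deploy.py --mlir mnist.mlir --quantize F16 --chip bm1684x --model test_output_fp16_1b.bmodel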

Copy the converted models to the board: scp test_output_fp* [email protected]:/data

Then connect to the board over SSH: ssh -p 22 [email protected]

3. Set up the sophon-sail environment on the board: https://kdocs.cn/l/ce7T9GNtS3D3. A quick import check is sketched below.
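Once installed, a one-line check (a minimal sketch; it only verifies that the sophon-sail Python module is importable on the board) is:

python3 -c "import sophon.sail as sail; print('sail imported OK')"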

Python version ---------------------------------------------------------------

Create a new MNIST folder on the board. The directory layout is as follows: datasets/ holds the test data set train-images-idx3-ubyte; test_output_fp16_1b.bmodel and test_output_fp32_1b.bmodel are the bmodel models converted from ONNX; test.py is the test code.
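For reference, train-images-idx3-ubyte uses the standard MNIST idx3 layout: a 16-byte header (magic number, image count, row count, column count, each a big-endian uint32), followed by one 28x28 = 784-byte block per image. That is where the 16 + 784*num offset in the test code comes from. A minimal sketch to inspect the header:

import struct

with open("./datasets/train-images-idx3-ubyte", "rb") as f:
    magic, count, rows, cols = struct.unpack(">IIII", f.read(16))  # four big-endian uint32 fields
print(magic, count, rows, cols)  # expect 2051 60000 28 28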

  1. The main principle is to use the API provided by sophon-sail to load a bmodel adapted to the BM1684X and then run inference through that same API. You can refer to the official sail API reference:

3. API reference — sophon-sail v23.03.01 documentation

  2. Let's walk through the test code below:
#import cv2
import numpy as np
import sophon.sail as sail
import time

num = -1
inference_time = []   # per-inference latency in seconds
print("--0-5 1-0 2-4 3-1 4-9 5-2 6-1 7-3 8-1 9-4 for example: if num = 9 the pic's digit is 4")
 
engine = sail.Engine("./test_output_fp32_1b.bmodel",0,sail.IOMode.SYSIO)  #load the FP32 model on tpu-0, with IO in system memory
#engine = sail.Engine("./test_output_fp16_1b.bmodel",0,sail.IOMode.SYSIO) #or load the FP16 model instead

graph_name =engine.get_graph_names()[0]                      #get_graph_names-test_output
input_tensor_name = engine.get_input_names(graph_name)[0]    #get_input_names-input.1
output_tensor_name = engine.get_output_names(graph_name)[0]  #get_output_names-25_LogSoftmax

batchsize,channel,height,width = engine.get_input_shape(graph_name,input_tensor_name) #get batchsize-1,channel-1,input image's height-28 & width-28

#read image
with open("./datasets/train-images-idx3-ubyte","rb") as f:
    file = f.read()
for _ in range(8000):
    num = num + 1
    offset = 16 + 784 * num   # idx3 file: 16-byte header, then 28x28 = 784 bytes per image
    image1 = np.frombuffer(file[offset:offset + 784], dtype=np.uint8)  # raw pixel values 0-255

    #reshape input data
    input_data = np.array(image1,dtype=np.float32).reshape(1,1,28,28)  #reshape the image to 1x1x28x28
    input_data_final = {input_tensor_name:input_data}     #process() expects a dict mapping input names to arrays
    start_time = time.time()
    outputs = engine.process(graph_name,input_data_final) #model inference
    end_time = time.time()
    inference_time.append(end_time - start_time)  
 
    result = outputs[output_tensor_name]  #use output_tensor_name to get the tensor
    max_value=np.argmax(result)           #get the index of the best score
    print("----------------------------------the result is ",max_value,"the time is ",inference_time[num]*1000,"ms")

mean = (sum(inference_time) / len(inference_time))*1000
print("-----FP32--","loop ",num+1,"times","average time",mean,"ms")

  3. Test results

(FP32 and FP16 result screenshots omitted.)

TPU utilization is basically stable at 4%; peaks can reach 8%.

C++ version ---------------------------------------------------------------

First install the C++ cross-compilation environment: https://kdocs.cn/l/cbe77SdEwLKm
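If your setup uses the common aarch64 GNU toolchain (an assumption; the guide above is authoritative), you can sanity-check it with: aarch64-linux-gnu-g++ --version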

1. Build with cross-compilation: create a new folder MNIST with the following file structure:

CMakeLists.txt

main.cpp

#define USE_FFMPEG 1
#define USE_OPENCV 1
#define USE_BMCV 1
#include <stdio.h>
#include <sail/cvwrapper.h>
#include <iostream>
#include <string>
#include <numeric>
#include <sys/time.h>

#include "spdlog/spdlog.h"
#include "spdlog/fmt/fmt.h"
#include "engine.h"

using namespace std;
using namespace sail;
const std::string bmodel_path_fp32="./test_output_fp32_1b.bmodel";
const std::string bmodel_path_fp16="./test_output_fp16_1b.bmodel";
const int MODEL_IN_WIDTH = 28;
const int MODEL_IN_HEIGHT = 28;
const int MODEL_CHANNEL = 1;
const int loop_count = 1000;
int num = -1;

static inline int64_t getCurrentTimeUs()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000000 + tv.tv_usec;
}
void Load_data(int num,unsigned char * input_image)
{
    int j=16+784*num;    // skip the 16-byte idx header; each image is 28x28 = 784 bytes
    FILE *file = fopen("./datasets/train-images-idx3-ubyte", "rb");
    if (file == NULL) {
        printf("can't open the file!\n");
        return;
    }
    fseek(file,j,SEEK_SET);
    fread(input_image,sizeof(char),784,file);
/*     for(int i=0;i<MODEL_IN_WIDTH;i++){
        for(int j=0;j<MODEL_IN_WIDTH;j++){
            printf("%4d",input_image[i*28+j]);
        }
        printf("\n");
    } */ 
    fclose(file);
}

void Array_change(float input_aray[][MODEL_CHANNEL][MODEL_IN_HEIGHT][MODEL_IN_WIDTH],unsigned char *input_image)
{
   int index=0;
    for (int i = 0; i < 1; i++) {
        for (int j = 0; j < MODEL_CHANNEL; j++) {
            for (int k = 0; k < MODEL_IN_HEIGHT; k++) {
                for (int l = 0; l < MODEL_IN_WIDTH; l++) {
                    input_aray[i][j][k][l] = (float)input_image[index++];
                    //cout<<input_aray[i][j][k][l]<<" ";
                }
                //cout<<endl;
            }
        }
        //cout<<endl;
    } 
}

void Bubble_sort(float *buffer,int num)
{
    float temp;
    for(int i=0; i<num;i++){
        for(int j=0; j<num-i-1;j++){
            if(buffer[j]>buffer[j+1]){
                temp = buffer[j];
                buffer[j]=buffer[j+1];
                buffer[j+1]=temp;
            }
        }
    }
}

void dump_shape(const std::vector<int>& shape)
{
    cout<<"[  ";
    for (const int& value : shape) {
        std::cout << value << " ";
    }
     cout<<"]"<<endl;
}
bool inference(int device_id)
{
    int64_t time[loop_count] = {};
    unsigned char input_image[784]={};
    float input_aray[1][MODEL_CHANNEL][MODEL_IN_HEIGHT][MODEL_IN_WIDTH]={};
    int64_t sum=0;
    float buffer_copy[10]={};   // unsorted copy of the 10 class scores
    // init Engine
    sail::Engine engine(device_id);    
    // load bmodel without builtin input and output tensors
    engine.load(bmodel_path_fp32);
    // get model info
    auto graph_name = engine.get_graph_names().front();
    auto input_name = engine.get_input_names(graph_name).front();
    auto output_name = engine.get_output_names(graph_name).front();
    std::vector<int> input_shape = {1, 1, 28, 28};
    std::map<std::string, std::vector<int>> input_shapes;
    input_shapes[input_name] = input_shape;
    auto output_shape = engine.get_output_shape(graph_name, output_name);
    auto input_dtype = engine.get_input_dtype (graph_name, input_name);
    auto output_dtype = engine.get_output_dtype(graph_name, output_name);
    cout<<"----------------graph_name is "<<graph_name<<endl;
    cout<<"----------------input_name is "<<input_name<<endl;
    cout<<"----------------output_name is "<<output_name<<endl;
    cout<<"----------------input_dtype is "<<input_dtype<<endl;
    cout<<"----------------output_dtype is "<<output_dtype<<endl;
    cout<<"output shape is ";
    dump_shape(output_shape);
    cout<<"input shape is ";
    dump_shape(input_shape);
    // get handle to create input and output tensors
    sail::Handle handle = engine.get_handle();

    // allocate input and output tensors with both system and device memory
    sail::Tensor in(handle, input_shape, input_dtype, true, true);
    sail::Tensor out(handle, output_shape, output_dtype, true, true);
    std::map<std::string, sail::Tensor*> input_tensors = {{input_name, &in}};
    std::map<std::string, sail::Tensor*> output_tensors = {{output_name, &out}};
    // prepare input and output data in system memory with data type of float32
    float* input = nullptr;
    float* output = nullptr;
    int in_size = std::accumulate(input_shape.begin(), input_shape.end(),
                                    1, std::multiplies<int>());
    int out_size = std::accumulate(output_shape.begin(), output_shape.end(),
                                    1, std::multiplies<int>());
    if (input_dtype == BM_FLOAT32) {
        input = reinterpret_cast<float*>(in.sys_data());
    } 
    else {
        input = new float[in_size];
    }
    if (output_dtype == BM_FLOAT32) {
        output = reinterpret_cast<float*>(out.sys_data());
    } 
    else {
        output = new float[out_size];
    }
    //loop 
    for(int times=0;times<loop_count;times++) {
        num++;
        Load_data(num,input_image);
        Array_change(input_aray,input_image);
        bool status=in.own_dev_data();
        cout<<"own_dev_data "<<status<<endl;
        status=in.own_sys_data();
        cout<<"own_sys_data "<<status<<endl;
        in.reset_sys_data(input_aray,input_shape);
         
        // set io_mode SYSIO: both input and output tensors are in system memory
        engine.set_io_mode(graph_name, sail::SYSIO);

        bm_data_type_t ret =in.dtype();
        printf("in.dtype is %d\n", ret);

        //inference
        int64_t start_time = getCurrentTimeUs();
        engine.process(graph_name, input_tensors, input_shapes, output_tensors);
        int64_t end_time = getCurrentTimeUs();
        time[times]=end_time-start_time;
        sum = sum+time[times];
        //post process
        auto real_output_shape = engine.get_output_shape(graph_name, output_name);
        float* output_data = reinterpret_cast<float*>(out.sys_data());
        for(int i = 0; i < 10;i++){
            buffer_copy[i]=output_data[i];
            //printf("output_data is %f \n",output_data[i]);
        }
        Bubble_sort(output_data,10);              // sort scores ascending: output_data[9] is now the max
        for(int i =0;i<10;i++){
            if(buffer_copy[i]==output_data[9]){   // index of the max score = predicted digit
                printf("------------------------------------------the pic value is %d \n",i);
            }
        }

        /* cout<<"real_output_shape is "<<"[  ";
        dump_shape(real_output_shape);*/
        printf(": Elapse Time = %.3f ms \n", time[times] / 1000.f);
    }
    printf("--------loop %d times sum is %.4f ms average time is %.3f ms\n", loop_count,sum / 1000.f,(sum / 1000.f)/loop_count);
    return true;
}
int main()
{
    int device_id = 0;
    int tpu_num=get_available_tpu_num();
    printf("the tpu number is %d\n", tpu_num);
    
    bool status = inference(device_id);
    return 0;    
}
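A typical out-of-tree build is sketched below (assuming CMakeLists.txt selects the cross toolchain from the guide above and links against sail; the target name MNIST is an assumption):

mkdir build && cd build
cmake ..
make

Copy the resulting binary to the board alongside the bmodel files and the datasets folder, then run it there.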

Print results

(FP32 and FP16 output screenshots omitted.)

bm-smi shows TPU utilization basically stable at 2%, with a peak of 4%.


Originally published at blog.csdn.net/warren103098/article/details/132238599