TensorRT -- Construction

TensorRT Construction

给出一个最简单的创建并运行的过程。

Procedure

  1. Create a global ILogger object

ILogger 是个抽象类,需要派生后使用。官方 API 中给了例子。具体使用可以从 Code Sample 中拷贝。

  2. Create an object of type IBuilder
// 'Logger' is the global ILogger-derived instance from step 1
IBuilder* builder = createInferBuilder(Logger);
  3. Create an object of type IBuilderConfig
// Build-time configuration; consumed later by buildEngineWithConfig()
IBuilderConfig* config = builder->createBuilderConfig();
  4. Create an object of type INetworkDefinition
// default creation (flags = 0 -> implicit batch dimension)
INetworkDefinition* network = builder->createNetworkV2(0U);

// explicit batch -- alternative to the line above; use one or the other
INetworkDefinition* network = builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
  5. Network Definition

  6. Create Engine

// Build the optimized engine from the network definition and the config
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
  7. Create Execution Context
// The context is what actually runs inference (used in the next step)
IExecutionContext* context = engine->createExecutionContext();
  8. Do Inference in the context

下面给出的例子是异步的。(Asynchronous execution)

  // Pointers to the input and output device buffers, indexed by binding slot
  void* buffers[2];

  // In order to bind the buffers, we need to know the names of the input and output tensors.
  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
  const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Create GPU buffers on device
  CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

  // Create stream
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
  CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
  // enqueue() launches implicit-batch inference asynchronously on the stream
  context->enqueue(batchSize, buffers, stream, nullptr);
  CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
  // Block until all queued work has finished before touching 'output' on the host.
  // Wrapped in CUDA_CHECK for error handling consistent with the calls above
  // (the original left this and cudaStreamDestroy unchecked).
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Release stream and buffers
  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaFree(buffers[inputIndex]));
  CUDA_CHECK(cudaFree(buffers[outputIndex]));

简单来说三个步骤:拷贝数据到 Device、context->enqueue() 执行、拷贝数据回 Host

Network Definition

// set the input.
// NOTE(fix): Dims3 holds exactly three values; the original passed four
// ({-1, 1, INPUT_H, INPUT_W}), which is invalid for Dims3. With the
// implicit-batch network described in this article, the input is given as
// per-sample CHW dims only -- the batch size is supplied at enqueue() time.
auto data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});

// add layers
// 5x5 convolution with 20 output feature maps, stride 1
auto conv1 = network->addConvolution(*data->getOutput(0), 20, DimsHW{5, 5}, weightMap["conv1filter"], weightMap["conv1bias"]);
conv1->setStride(DimsHW{1, 1});

// 2x2 max pooling, stride 2
auto pool1 = network->addPooling(*conv1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2});
pool1->setStride(DimsHW{2, 2});

// fully connected layer (500 outputs) followed by ReLU
auto ip1 = network->addFullyConnected(*pool1->getOutput(0), 500, weightMap["ip1filter"], weightMap["ip1bias"]);
auto relu1 = network->addActivation(*ip1->getOutput(0), ActivationType::kRELU);

// softmax produces the final probabilities; name the tensor so it can be
// looked up with getBindingIndex() at inference time
auto prob = network->addSoftMax(*relu1->getOutput(0));
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);

// set output: marking the tensor exposes it as an engine binding
network->markOutput(*prob->getOutput(0));
  1. 输入的维数都是 Dims3 ,可以隐式指定 batchsize 。对于全连接层,若输入为 {C, H, W},则会变形成 {1, C*H*W} 后输入网络。
  2. 网络的输入和输出都是 ITensor 类。将 ITensor 和字符串绑定,用于之后索引。

文中代码参考此处

猜你喜欢

转载自blog.csdn.net/lib0000/article/details/113406329