Using TensorRT involves two phases: build and runtime (deployment).

build: import and optimize a trained model to generate an inference engine.

The build phase performs the model conversion (from Caffe or TensorFlow to TensorRT). During the conversion, the optimizations described earlier, such as layer fusion and precision calibration, are applied. The output of this step is a TensorRT engine optimized for a specific GPU platform and network model; it can be serialized and stored to disk or kept in memory. The file stored on disk is called a plan file.
The build phase instantiates the following objects in order:
- ILogger
- IBuilder
- INetworkDefiniton
- IParser
- ICudaEngine
- IHostMemory (obtained by serializing the engine)
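The example code in both phases passes a gLogger object to createInferBuilder / createInferRuntime but never defines it. Below is a minimal sketch of such an ILogger implementation, written against the pre-TensorRT-8 log() signature; the class name ConsoleLogger and the severity threshold are illustrative choices, not part of the original code.

```cpp
#include <iostream>
#include "NvInfer.h"

// Minimal logger: print messages of WARNING severity or higher.
// (Illustrative sketch; the original code only assumes that some gLogger exists.)
class ConsoleLogger : public nvinfer1::ILogger
{
    void log(Severity severity, const char *msg) override
    {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

static ConsoleLogger gLogger;
```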
Build example code:
```cpp
// brief: Builds an engine from a network definition.
// Create an IBuilder object
IBuilder *builder = createInferBuilder(gLogger);

// brief: A network definition for input to the builder.
// Create a network definition object
INetworkDefinition *network = builder->createNetwork();

// brief: Class used for parsing Caffe models. Allows users to export models trained using Caffe to TRT.
// Create an ICaffeParser object, used to parse the Caffe model
ICaffeParser *parser = createCaffeParser();
// brief: Set the IPluginFactory used to create the user defined plugins.
parser->setPluginFactory(&pluginFactory);

engine_file = enginefile;
// Bail out if the requested precision is not supported by this GPU
if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8()) ||
    (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
    return false;

// Parse the Caffe model; for INT8 the weights are still parsed as FP32
const IBlobNameToTensor *blobNameToTensor = parser->parse(
    deployFile.c_str(),
    modelFile.c_str(),
    *network,
    dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);

// Specify which tensors are outputs
// find brief: Given a blob name, returns a pointer to an ITensor object.
// markOutput brief: Mark a tensor as a network output.
for (auto &s : OUTPUT_BLOB_NAMES)
    network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
// Set the maximum batch size and the workspace size
builder->setMaxBatchSize(maxBatchSize);
// workspaceSize brief: The maximum GPU temporary memory which the engine can use at execution time.
builder->setMaxWorkspaceSize(1 << 30);  // 1 GiB

// brief: Build a CUDA engine from a network definition.
// Build the ICudaEngine
engine = builder->buildCudaEngine(*network);
assert(engine);

// The network and parser are no longer needed once the engine has been built
network->destroy();
parser->destroy();

// brief: Serialize the network to a stream.
// Serialize the engine into an IHostMemory object
TRTModelStream = engine->serialize();
engine->destroy();
builder->destroy();
pluginFactory.destroyPlugin();
```
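As described above, the serialized engine can be written to disk as a plan file. A minimal sketch, assuming TRTModelStream is the IHostMemory* returned by engine->serialize() and engine_file is a std::string holding the target path:

```cpp
#include <fstream>

// Write the serialized engine (the plan file) to disk.
std::ofstream planFile(engine_file, std::ios::binary);
planFile.write(static_cast<const char *>(TRTModelStream->data()), TRTModelStream->size());
planFile.close();
// Destroy TRTModelStream only once it is no longer needed;
// the runtime example below still deserializes it directly from memory.
```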
runtime (deploy): generate a runtime inference engine and run inference.

The runtime (deploy) phase performs the actual inference; Kernel Auto-Tuning and Dynamic Tensor Memory should take effect in this phase. The plan file produced in the previous step is first deserialized to create a runtime engine; input data (for example, images from a test set or from outside the dataset) can then be fed in, and the engine outputs classification vectors or detection results.

A benefit of TensorRT is that deployment and inference do not require any other deep learning framework to be installed.
The runtime phase instantiates the following objects:
- IRuntime
- ICudaEngine
- IExecutionContext
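The runtime example below deserializes the engine directly from the in-memory TRTModelStream produced during the build. If the engine was instead saved to disk as a plan file, it would first be read back into a host buffer, roughly as in this sketch (plan_data then takes the place of TRTModelStream->data()/size() in the deserializeCudaEngine call):

```cpp
#include <fstream>
#include <vector>

// Read the plan file back into host memory for deserialization.
std::ifstream planFile(engine_file, std::ios::binary | std::ios::ate);
size_t planSize = planFile.tellg();   // size of the serialized engine
planFile.seekg(0, std::ios::beg);
std::vector<char> plan_data(planSize);
planFile.read(plan_data.data(), planSize);
// Then: engine = runtime->deserializeCudaEngine(plan_data.data(), planSize, &pluginFactory);
```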
The following is the runtime example code:
```cpp
// Create an IRuntime object
runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);

// Deserialize the plan to create an ICudaEngine object
engine = runtime->deserializeCudaEngine(TRTModelStream->data(), TRTModelStream->size(), &pluginFactory);
assert(engine != nullptr);

std::cout << "createinference" << std::endl;
// List the engine bindings (input and output tensors)
for (int bi = 0; bi < engine->getNbBindings(); bi++)
{
    if (engine->bindingIsInput(bi))
        printf("Binding %d (%s): Input.\n", bi, engine->getBindingName(bi));
    else
        printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
}

std::cout << "****TensorRT Phase:********doInference" << std::endl;
// Create an IExecutionContext, the context used to launch the compute kernels
context = engine->createExecutionContext();
//context->setProfiler(&gProfiler);

assert(engine->getNbBindings() == 2);
// In order to bind the buffers, we need to know the names of the
// input and output tensors. Get the binding indices of the input and output tensors.
inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME.c_str());        // inputIndex = 0
outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAMES[0].c_str());  // outputIndex = 1
//std::cout << "outputIndex = " << outputIndex << std::endl;

// Allocate GPU memory for input / output data
CUDA_CHECK(cudaMalloc(&buffers[inputIndex],
                      batchSize * inputDim.c() * inputDim.h() * inputDim.w() * sizeof(float)));
CUDA_CHECK(cudaMalloc(&buffers[outputIndex],
                      batchSize * outputDim.c() * outputDim.h() * outputDim.w() * sizeof(float)));

// Use a CUDA stream to manage the concurrency of copying and executing
CUDA_CHECK(cudaStreamCreate(&stream));

// Host to device: `input` is the data in host memory; buffers[inputIndex] is the device buffer for the input
// Copy input data to the GPU
CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input,
                           batchSize * size_of_single_input,
                           cudaMemcpyHostToDevice, stream));

// Launch an instance of the GIE (TensorRT) compute kernels
context->enqueue(batchSize, buffers, stream, nullptr);

// Device to host: buffers[outputIndex] is the device buffer holding the model output; `output` is the host buffer
// Copy output data back to the host (on the same stream)
CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex],
                           batchSize * outputDim.c() * outputDim.h() * outputDim.w() * sizeof(float),
                           cudaMemcpyDeviceToHost, stream));

// It is possible to have multiple instances of the code above
// in flight on the GPU in different streams.
// The host can then sync on a given stream and use the results.
CUDA_CHECK(cudaStreamSynchronize(stream));
```
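The snippet above never releases its resources. Once inference is finished, the stream, device buffers, and TensorRT objects would typically be torn down along these lines (a sketch reusing the variable names from the example):

```cpp
// Release the CUDA stream and the device buffers allocated above.
CUDA_CHECK(cudaStreamDestroy(stream));
CUDA_CHECK(cudaFree(buffers[inputIndex]));
CUDA_CHECK(cudaFree(buffers[outputIndex]));

// Destroy the TensorRT objects in reverse order of creation.
context->destroy();
engine->destroy();
runtime->destroy();
```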
References
- TensorRT(1)-介绍-使用-安装: https://arleyzhang.github.io/articles/7f4b25ce/