Mastering pytorch-to-onnx-to-tensorRT in One Series: (C) Testing onnx to tensorRT

Overview

The previous post covered how to register a custom layer with onnx so that the corresponding layer can be found during onnx parsing.
Before getting to the demo, here are the problems I ran into:

  • Problem: ONNX and tensorRT: ERROR: Network must have at least one output
    • At first I treated it as an ordinary bug and found a few references suggesting: 1) since no output is detected, mark one explicitly; 2) the input itself may be the problem. (A sketch of workaround 1 appears after the code block below.)
    • Neither of those helped in my case. After a huge amount of trial-and-error debugging, I found the real cause: when exporting to onnx I had forcibly patched in the code below, and the modified upsample symbolic produced incorrect results (ps: when doing research, don't blindly copy other people's tricks; every step needs a justification, otherwise...). In fact, this override is not valid for every pytorch release, because the onnx exporter bundled with pytorch has gone through several versions.
import torch.onnx.symbolic

# Override Upsample's ONNX export from old opset if required (not needed for TRT 5.1+)
@torch.onnx.symbolic.parse_args('v', 'is')
def upsample_nearest2d(g, input, output_size):
    height_scale = float(output_size[-2]) / input.type().sizes()[-2]
    width_scale = float(output_size[-1]) / input.type().sizes()[-1]
    return g.op("Upsample", input,
                scales_f=(1, 1, height_scale, width_scale),
                mode_s="nearest")


@torch.onnx.symbolic.parse_args('v', 'is', 'i')
def upsample_bilinear2d(g, input, output_size, align_corners):
    height_scale = float(output_size[-2]) / input.type().sizes()[-2]
    width_scale = float(output_size[-1]) / input.type().sizes()[-1]
    return g.op("Upsample", input,
                scales_f=(1, 1, height_scale, width_scale),
                mode_s="linear")


torch.onnx.symbolic.upsample_bilinear2d = upsample_bilinear2d
torch.onnx.symbolic.upsample_nearest2d = upsample_nearest2d
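For reference, workaround 1) above (explicitly marking an output when the parser does not register one) looks roughly like the sketch below. This is a minimal example against the TensorRT 5/6-era Python API and is only my assumption about how you would wire it up; the C++ equivalent is network->markOutput(...). It did not fix my particular problem, which turned out to be the upsample override itself.

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network()
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            return None
    # Workaround 1): if the parser registered no output, mark the last
    # layer's output so engine building can proceed.
    if network.num_outputs == 0:
        last_layer = network.get_layer(network.num_layers - 1)
        network.mark_output(last_layer.get_output(0))
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 20
    return builder.build_cuda_engine(network)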

Addendum

A few minor bugs hit in later experiments, recorded here:

  • If an averagePool layer has stride=1 (a convenient pattern when writing blocks), the layer has no real effect, but when the exported onnx is converted to tensorRT it trips a value check. The simplest fix is to drop such layers when generating the network. In slightly trickier cases, removing them outright breaks some model-loading logic, so a pad layer can be substituted as a placeholder (this matters when parameters are loaded the simplest way, by module index). A sketch of the substitution follows.
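A minimal sketch of the placeholder idea, using a hypothetical block builder (make_block and its arguments are illustrative, not from the original code); nn.AvgPool2d(kernel_size=1, stride=1) is effectively an identity, so a zero-size pad can stand in for it without disturbing load-by-index parameter loading:

import torch.nn as nn

def make_block(in_ch, out_ch, for_tensorrt=False):
    # Hypothetical block: the trailing AvgPool2d(kernel_size=1, stride=1)
    # is an identity, but its exported ONNX node can trip TensorRT's value checks.
    pool = nn.ZeroPad2d(0) if for_tensorrt else nn.AvgPool2d(kernel_size=1, stride=1)
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, 3, padding=1),
        nn.ReLU(inplace=True),
        pool,  # placeholder keeps the module count unchanged
    )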

Testing onnx to tensorRT

  • Python code (a quick sanity check of the exported gn.onnx follows the listing)
import torch
import torch.nn as nn
import torch.nn.functional as F


class gnSOLE(nn.Module):
    def __init__(self):
        super().__init__()
        out_channels = 2
        self.lr = nn.LeakyReLU()
        self.mp = nn.MaxPool2d((2, 2), stride=(2, 2))
        self.gn = nn.GroupNorm(num_groups=2, num_channels=out_channels, eps=1e-5)
        self.bn = nn.BatchNorm2d(out_channels)
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x1 = self.mp(x)
        x1 = self.gn(self.lr(x1))
        # upsample back to the input resolution so the residual add is valid
        # (stands in for the undefined upsample_to helper in the original)
        x1 = F.interpolate(x1, scale_factor=2, mode='nearest')
        p4 = x1 + x
        return p4

def GN_ONNX():
    model = gnSOLE()
    model.to(torch.device('cuda'))
    zero_input = torch.rand(1, 2, 8, 8).cuda()
    # model(zero_input)
    torch.onnx.export(model, zero_input, 'gn.onnx', verbose=True)
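Before handing the exported file to the C++ parser below, it can be worth a quick sanity check of gn.onnx (assuming the onnx Python package is installed and GN_ONNX() has been run):

import onnx

model = onnx.load('gn.onnx')
onnx.checker.check_model(model)                   # raises if the graph is malformed
print(onnx.helper.printable_graph(model.graph))   # human-readable node dump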
  • C++ code, including the build command (in the comment at the top of the listing); see the earlier post "How to test adding a custom layer to tensorRT" for details. Note that common.h (the helper header used by the tensorRT samples) is commented out here, and the other commented-out code is kept as a reference.
//nvcc -o gn test_onnx.cpp ../cuda/groupnorm.cu /usr/src/tensorrt/samples/common/logger.cpp
// -I/home/user/package/cub-1.8.0 -I/usr/src/tensorrt/samples/common/ -I./../cuda/ -L/usr/local/cuda/lib64
// -lcudart -lcuda -L/usr/local/lib/ -lnvonnxparser -L/usr/lib/x86_64-linux-gnu/ -lnvinfer
// -lnvparsers -lnvinfer_plugin
#include <algorithm>
#include <assert.h>
#include <cmath>
//#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <time.h>

#include "NvInfer.h"
#include <NvOnnxParser.h>
#include "logger.h"
//#include "common.h"
#include "GN.h"

#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cout << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)
using namespace nvinfer1;

// NOTE: these sizes are left over from the MNIST sample; for gn.onnx the input
// is 1x2x8x8 and the output has the same shape, so adjust them before running doInference.
static const int INPUT_H = 28;
static const int INPUT_W = 28;
static const int OUTPUT_SIZE = 10;

const std::string gSampleName = "TensorRT.sample_onnx_mnist";

bool onnxToTRTModel(const std::string &modelFile, // name of the onnx model
                    unsigned int maxBatchSize,    // batch size - NB must be at least as large as the batch we want to run with
                    IHostMemory *&trtModelStream) // output buffer for the TensorRT model
{
    // create the builder
    IBuilder *builder = createInferBuilder(gLogger.getTRTLogger());
    assert(builder != nullptr);
    nvinfer1::INetworkDefinition *network = builder->createNetwork();

    auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
    //Optional - uncomment below lines to view network layer information
    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();
    std::cout << modelFile << std::endl;
    if (!parser->parseFromFile(modelFile.c_str(),
                               static_cast<int>(gLogger.getReportableSeverity()))) {
        gLogError << "Failure while parsing ONNX file" << std::endl;
        return false;
    }

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);
//    builder->setFp16Mode(gArgs.runInFp16);
//    builder->setInt8Mode(gArgs.runInInt8);
//
//    if (gArgs.runInInt8) {
//        samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
//    }
//
//    samplesCommon::enableDLA(builder, gArgs.useDLACore);

    ICudaEngine *engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we can destroy the parser
    parser->destroy();

    // serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    engine->destroy();
    network->destroy();
    builder->destroy();

    return true;
}

void doInference(IExecutionContext &context, float *input, float *output, int batchSize) {
    const ICudaEngine &engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void *buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex{}, outputIndex{};
    for (int b = 0; b < engine.getNbBindings(); ++b) {
        if (engine.bindingIsInput(b))
            inputIndex = b;
        else
            outputIndex = b;
    }

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
                          stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo() {
    std::cout
            << "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]\n";
    std::cout << "--help          Display help information\n";
    std::cout
            << "--datadir       Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)"
            << std::endl;
    std::cout
            << "--useDLACore=N  Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform."
            << std::endl;
    std::cout << "--int8          Run in Int8 mode.\n";
    std::cout << "--fp16          Run in FP16 mode." << std::endl;
}

int main(int argc, char **argv) {
    auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char **>(argv));
    gLogger.reportTestStart(sampleTest);
    // create a TensorRT model from the onnx model and serialize it to a stream
    IHostMemory *trtModelStream{nullptr};
    if (!onnxToTRTModel("/home/user/weight/gn.onnx", 1, trtModelStream))
        std::cerr << "can not read onnx!";

    assert(trtModelStream != nullptr);

    uint8_t fileData[INPUT_H * INPUT_W]; // image buffer from the original sample; the code that fills it is commented out below
    // print an ascii representation
    gLogInfo << "Input:" << std::endl;
//    float data[INPUT_H * INPUT_W];
//    for (int i = 0; i < INPUT_H * INPUT_W; i++)
//        data[i] = 1.0 - float(fileData[i] / 255.0);
//
//    // deserialize the engine
//    IRuntime *runtime = createInferRuntime(gLogger);
//    assert(runtime != nullptr);
//    if (gArgs.useDLACore >= 0) {
//        runtime->setDLACore(gArgs.useDLACore);
//    }
//
//    ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
//    assert(engine != nullptr);
//    trtModelStream->destroy();
//    IExecutionContext *context = engine->createExecutionContext();
//    assert(context != nullptr);
//    // run inference
//    float prob[OUTPUT_SIZE];
//    doInference(*context, data, prob, 1);
//
//    // destroy the engine
//    context->destroy();
//    engine->destroy();
//    runtime->destroy();
//*********************************
//    float val{0.0f};
//    int idx{0};
//
//    //Calculate Softmax
//    float sum{0.0f};
//    for (int i = 0; i < OUTPUT_SIZE; i++)
//    {
//        prob[i] = exp(prob[i]);
//        sum += prob[i];
//    }
//
//    gLogInfo << "Output:\n";
//    for (int i = 0; i < OUTPUT_SIZE; i++)
//    {
//        prob[i] /= sum;
//        val = std::max(val, prob[i]);
//        if (val == prob[i])
//            idx = i;
//
//        gLogInfo << " Prob " << i << "  " << std::fixed << std::setw(5) << std::setprecision(4) << prob[i] << " "
//                 << "Class " << i << ": " << std::string(int(std::floor(prob[i] * 10 + 0.5f)), '*') << "\n";
//    }
//    gLogInfo << std::endl;
//
//    bool pass{idx == num && val > 0.9f};
//
//    return gLogger.reportTest(sampleTest, pass);
}
