前言

TensorRT可以通過INT8量化處理網絡，然後大幅加速網絡推理速度，本文旨在詳細分析MNIST INT8 Sample 的代碼，解釋如何使用TensorRT 對網絡做INT8 量化處理。

關於INT8 量化的背景知識可以參考博文TensorRT INT8校準與量化原理

代碼分析

sampleINT8的github 代碼參考link: https://github.com/NVIDIA/TensorRT/tree/release/6.0/samples/opensource/sampleINT8

程序的主要流程分爲 main與程序輸入參數初始化 -> 網絡構建 -> 網絡推理 -> 釋放資源結束這幾個階段，下面逐個階段分析代碼

Main入口

//!
//! \brief Initializes members of the params struct using the command line args
//!
SampleINT8Params initializeSampleParams(const samplesCommon::Args& args, int batchSize)
{
    SampleINT8Params params;
    // Use directories provided by the user, in addition to default directories.
    params.dataDirs = args.dataDirs;
    params.dataDirs.emplace_back("data/mnist/");
    params.dataDirs.emplace_back("int8/mnist/");
    params.dataDirs.emplace_back("samples/mnist/");
    params.dataDirs.emplace_back("data/samples/mnist/");
    params.dataDirs.emplace_back("data/int8/mnist/");
    params.dataDirs.emplace_back("data/int8_samples/mnist/");

    params.batchSize = batchSize;
    params.dlaCore = args.useDLACore;
    params.nbCalBatches = 10;
    params.calBatchSize = 50;
    params.inputTensorNames.push_back("data");
    params.outputTensorNames.push_back("prob");
    params.prototxtFileName = "deploy.prototxt";
    params.weightsFileName = "mnist_lenet.caffemodel";
    params.networkName = "mnist";
    return params;
}

//!
//! \brief Prints the help information for running this sample
//!
void printHelpInfo()
{
    std::cout << "Usage: ./sample_int8 [-h or --help] [-d or --datadir=<path to data directory>] "
                 "[--useDLACore=<int>]"
              << std::endl;
    std::cout << "--help          Display help information" << std::endl;
    std::cout << "--datadir       Specify path to a data directory, overriding the default. This option can be used "
                 "multiple times to add multiple directories."
              << std::endl;
    std::cout << "--useDLACore=N  Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, "
                 "where n is the number of DLA engines on the platform."
              << std::endl;
    std::cout << "batch=N         Set batch size (default = 32)." << std::endl;
    std::cout << "start=N         Set the first batch to be scored (default = 100). All batches before this batch will "
                 "be used for calibration."
              << std::endl;
    std::cout << "score=N         Set the number of batches to be scored (default = 400)." << std::endl;
}

int main(int argc, char** argv)
{
    if (argc >= 2 && (!strncmp(argv[1], "help", 4) || !strncmp(argv[1], "--help", 6) || !strncmp(argv[1], "--h", 3)))
    {
        printHelpInfo();
        return EXIT_FAILURE;
    }

    // By default we score over 40K images starting at 3200, so we don't score those used to search calibration
    int batchSize = 32;
    int firstScoreBatch = 100;
    int nbScoreBatches = 400;

    // Parse extra arguments
    for (int i = 1; i < argc; ++i)
    {
        if (!strncmp(argv[i], "batch=", 6))
        {
            batchSize = atoi(argv[i] + 6);
        }
        else if (!strncmp(argv[i], "start=", 6))
        {
            firstScoreBatch = atoi(argv[i] + 6);
        }
        else if (!strncmp(argv[i], "score=", 6))
        {
            nbScoreBatches = atoi(argv[i] + 6);
        }
    }

    if (batchSize > 128)
    {
        gLogError << "Please provide batch size <= 128" << std::endl;
        return EXIT_FAILURE;
    }

    if ((firstScoreBatch + nbScoreBatches) * batchSize > 500000)
    {
        gLogError << "Only 50000 images available" << std::endl;
        return EXIT_FAILURE;
    }

    samplesCommon::Args args;
    samplesCommon::parseArgs(args, argc, argv);

    SampleINT8 sample(initializeSampleParams(args, batchSize));

......

檢查程序輸入參數，如果不符合要求，則print help的提示信息
通過initializeSampleParams函數設置默認參數的值

int main(int argc, char** argv)
{
......

    std::vector<std::string> dataTypeNames = {"FP32", "FP16", "INT8"};
    std::vector<DataType> dataTypes = {DataType::kFLOAT, DataType::kHALF, DataType::kINT8};
    std::vector<std::pair<float, float>> scores(3, std::make_pair(0.0f, 0.0f));
    for (size_t i = 0; i < dataTypes.size(); i++)
    {
        gLogInfo << dataTypeNames[i] << " run:" << nbScoreBatches << " batches of size " << batchSize << " starting at "
                 << firstScoreBatch << std::endl;

        if (!sample.build(dataTypes[i]))
        {
            if (!sample.isSupported(dataTypes[i]))
            {
                gLogWarning << "Skipping " << dataTypeNames[i] << " since the platform does not support this data type."
                            << std::endl;
                continue;
            }
            return gLogger.reportFail(sampleTest);
        }
        if (!sample.infer(scores[i], firstScoreBatch, nbScoreBatches))
        {
            return gLogger.reportFail(sampleTest);
        }
    }

......

根據FP32, FP16, INT8 三種數據類型來構建網絡和推理網絡

構建(Build)網絡

bool SampleINT8::build(DataType dataType)
{

    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(gLogger.getTRTLogger()));
    if (!builder)
    {
        return false;
    }

    auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetwork());
    if (!network)
    {
        return false;
    }

    auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config)
    {
        return false;
    }

    auto parser = SampleUniquePtr<nvcaffeparser1::ICaffeParser>(nvcaffeparser1::createCaffeParser());
    if (!parser)
    {
        return false;
    }

    if ((dataType == DataType::kINT8 && !builder->platformHasFastInt8())
        || (dataType == DataType::kHALF && !builder->platformHasFastFp16()))
    {
        return false;
    }

    auto constructed = constructNetwork(builder, network, config, parser, dataType);

......

TensorRT的標準流程，創建IBuilder -> 創建INetworkDefinition -> 創建IBuilderConfig -> 創建caffe模型的分析器ICaffeParser，判斷硬件平臺是否支持native FP16或INT8
通過consstructNetwork函數分析caffe模型，構建網絡

bool SampleINT8::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
    SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
    SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser, DataType dataType)
{
    mEngine = nullptr;
    const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor
        = parser->parse(locateFile(mParams.prototxtFileName, mParams.dataDirs).c_str(),
            locateFile(mParams.weightsFileName, mParams.dataDirs).c_str(), *network,
            dataType == DataType::kINT8 ? DataType::kFLOAT : dataType);

    for (auto& s : mParams.outputTensorNames)
    {
        network->markOutput(*blobNameToTensor->find(s.c_str()));
    }

    // Calibrator life time needs to last until after the engine is built.
    std::unique_ptr<IInt8Calibrator> calibrator;

    config->setAvgTimingIterations(1);
    config->setMinTimingIterations(1);
    config->setMaxWorkspaceSize(1_GiB);
    config->setFlag(BuilderFlag::kDEBUG);
    if (dataType == DataType::kHALF)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    if (dataType == DataType::kINT8)
    {
        config->setFlag(BuilderFlag::kINT8);
    }
    builder->setMaxBatchSize(mParams.batchSize);

    if (dataType == DataType::kINT8)
    {
        MNISTBatchStream calibrationStream(mParams.calBatchSize, mParams.nbCalBatches, "train-images-idx3-ubyte",
            "train-labels-idx1-ubyte", mParams.dataDirs);
        calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>(
            calibrationStream, 0, mParams.networkName.c_str(), mParams.inputTensorNames[0].c_str()));
        config->setInt8Calibrator(calibrator.get());
    }
......

通過parser->parse分析caffe模型分析和權重文件結合dataType是INT8還是FP 來構建network對象
通過network->markOutput(*blobNameToTensor->find(s.c_str())) 標記網絡output的Tensor
通過config 配置網絡每一層的迭代次數和網絡佔用的內存大小，根據dataType設置Flag
通過builder->setMaxBatchSize設置網絡input的batchSize
如果是dataType是INT8類型，則構建BatchStream對象用於獲取執行校準需要的calibration數據
通過calibrator.reset(new Int8EntropyCalibrator2<MNISTBatchStream>( calibrationStream ... 構建TensorRT需要的calibration interface，TensorRT通過該interface在量化過程中獲取calibration數據，關於BatchStream的詳細分析如下
通過config->setInt8Calibrator(calibrator.get()); 將calibration interface配置到網絡的config中

BatchStream

class IBatchStream
{
public:
    virtual void reset(int firstBatch) = 0;
    virtual bool next() = 0;
    virtual void skip(int skipCount) = 0;
    virtual float* getBatch() = 0;
    virtual float* getLabels() = 0;
    virtual int getBatchesRead() const = 0;
    virtual int getBatchSize() const = 0;
    virtual nvinfer1::Dims getDims() const = 0;
};

class MNISTBatchStream : public IBatchStream
{
public:
    MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile,
        const std::vector<std::string>& directories)
        : mBatchSize{batchSize}
        , mMaxBatches{maxBatches}
        , mDims{3, 1, 28, 28} //!< We already know the dimensions of MNIST images.
    {
        readDataFile(locateFile(dataFile, directories));
        readLabelsFile(locateFile(labelsFile, directories));
    }

    void reset(int firstBatch) override
    {
        mBatchCount = firstBatch;
    }

    bool next() override
    {
        if (mBatchCount >= mMaxBatches)
        {
            return false;
        }
        ++mBatchCount;
        return true;
    }

    void skip(int skipCount) override
    {
        mBatchCount += skipCount;
    }

    float* getBatch() override
    {
        return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims));
    }

    float* getLabels() override
    {
        return mLabels.data() + (mBatchCount * mBatchSize);
    }

    int getBatchesRead() const override
    {
        return mBatchCount;
    }

    int getBatchSize() const override
    {
        return mBatchSize;
    }

    nvinfer1::Dims getDims() const override
    {
        return mDims;
    }

private:
    void readDataFile(const std::string& dataFilePath)
    {
        std::ifstream file{dataFilePath.c_str(), std::ios::binary};

        int magicNumber, numImages, imageH, imageW;
        file.read(reinterpret_cast<char*>(&magicNumber), sizeof(magicNumber));
        // All values in the MNIST files are big endian.
        magicNumber = samplesCommon::swapEndianness(magicNumber);
        assert(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set");

        // Read number of images and dimensions
        file.read(reinterpret_cast<char*>(&numImages), sizeof(numImages));
        file.read(reinterpret_cast<char*>(&imageH), sizeof(imageH));
        file.read(reinterpret_cast<char*>(&imageW), sizeof(imageW));

        numImages = samplesCommon::swapEndianness(numImages);
        imageH = samplesCommon::swapEndianness(imageH);
        imageW = samplesCommon::swapEndianness(imageW);

        // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize.
        int numElements = numImages * imageH * imageW;
        std::vector<uint8_t> rawData(numElements);
        file.read(reinterpret_cast<char*>(rawData.data()), numElements * sizeof(uint8_t));
        mData.resize(numElements);
        std::transform(
            rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast<float>(val) / 255.f; });
    }

    void readLabelsFile(const std::string& labelsFilePath)
    {
        std::ifstream file{labelsFilePath.c_str(), std::ios::binary};
        int magicNumber, numImages;
        file.read(reinterpret_cast<char*>(&magicNumber), sizeof(magicNumber));
        // All values in the MNIST files are big endian.
        magicNumber = samplesCommon::swapEndianness(magicNumber);
        assert(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file");

        file.read(reinterpret_cast<char*>(&numImages), sizeof(numImages));
        numImages = samplesCommon::swapEndianness(numImages);

        std::vector<uint8_t> rawLabels(numImages);
        file.read(reinterpret_cast<char*>(rawLabels.data()), numImages * sizeof(uint8_t));
        mLabels.resize(numImages);
        std::transform(
            rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast<float>(val); });
    }

    int mBatchSize{0};
    int mBatchCount{0}; //!< The batch that will be read on the next invocation of next()
    int mMaxBatches{0};
    Dims mDims{};
    std::vector<float> mData{};
    std::vector<float> mLabels{};
};

通過構造函數獲得校準calibrationS數據的BatchSize和Batch number
通過readDataFile讀取calibrationS數據集的數據文件，包括文件中共有多少Image，每個Image的height和width，計算總共要讀取的數據量numElements = numImages * imageH * imageW;，將文件的數據讀取到mData
通過readDataFile讀取calibrationS數據集的labels文件，過程與readDataFile類似，只是將讀取的數據保存到mLabels
next()函數返回下一個Batch的count
getBatch 返回當前BatchCount對應的數據指針，即數據mData偏移(mBatchCount * mBatchSize * samplesCommon::volume(mDims)) 的位置
getLabels() 返回當前BatchCount對應的label指針，即mLabels.data() 偏移 (mBatchCount * mBatchSize) 的位置

template <typename TBatchStream>
class EntropyCalibratorImpl
{
public:
    EntropyCalibratorImpl(
        TBatchStream stream, int firstBatch, std::string networkName, const char* inputBlobName, bool readCache = true)
        : mStream{stream}
        , mCalibrationTableName("CalibrationTable" + networkName)
        , mInputBlobName(inputBlobName)
        , mReadCache(readCache)
    {
        nvinfer1::Dims dims = mStream.getDims();
        mInputCount = samplesCommon::volume(dims) * mStream.getBatchSize();
        CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
        mStream.reset(firstBatch);
    }

    virtual ~EntropyCalibratorImpl()
    {
        CHECK(cudaFree(mDeviceInput));
    }

    int getBatchSize() const
    {
        return mStream.getBatchSize();
    }

    bool getBatch(void* bindings[], const char* names[], int nbBindings)
    {
        if (!mStream.next())
        {
            return false;
        }
        CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
        assert(!strcmp(names[0], mInputBlobName));
        bindings[0] = mDeviceInput;
        return true;
    }

    const void* readCalibrationCache(size_t& length)
    {
        mCalibrationCache.clear();
        std::ifstream input(mCalibrationTableName, std::ios::binary);
        input >> std::noskipws;
        if (mReadCache && input.good())
        {
            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
                std::back_inserter(mCalibrationCache));
        }
        length = mCalibrationCache.size();
        return length ? mCalibrationCache.data() : nullptr;
    }

    void writeCalibrationCache(const void* cache, size_t length)
    {
        std::ofstream output(mCalibrationTableName, std::ios::binary);
        output.write(reinterpret_cast<const char*>(cache), length);
    }

private:
    TBatchStream mStream;
    size_t mInputCount;
    std::string mCalibrationTableName;
    const char* mInputBlobName;
    bool mReadCache{true};
    void* mDeviceInput{nullptr};
    std::vector<char> mCalibrationCache;
};

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
template <typename TBatchStream>
class Int8EntropyCalibrator2 : public IInt8EntropyCalibrator2
{
public:
    Int8EntropyCalibrator2(
        TBatchStream stream, int firstBatch, const char* networkName, const char* inputBlobName, bool readCache = true)
        : mImpl(stream, firstBatch, networkName, inputBlobName, readCache)
    {
    }

    int getBatchSize() const override
    {
        return mImpl.getBatchSize();
    }

    bool getBatch(void* bindings[], const char* names[], int nbBindings) override
    {
        return mImpl.getBatch(bindings, names, nbBindings);
    }

    const void* readCalibrationCache(size_t& length) override
    {
        return mImpl.readCalibrationCache(length);
    }

    void writeCalibrationCache(const void* cache, size_t length) override
    {
        mImpl.writeCalibrationCache(cache, length);
    }

private:
    EntropyCalibratorImpl<TBatchStream> mImpl;
};

TensorRT需要應用程序實現calibration的interface IInt8Calibrator, 其中IInt8EntropyCalibrator2是它的繼承類，所以在本示例程序中通過Int8EntropyCalibrator2實現了IInt8EntropyCalibrator2，而Int8EntropyCalibrator2則是通過EntropyCalibratorImpl類實現了IInt8Calibrator需要提供的幾個接口方法，包括
構造函數中通過mInputCount = samplesCommon::volume(dims) * mStream.getBatchSize(); 計算每一個input batch中calibration數據個數，然後通過cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)) 在GPU Device上分配保存input數據的存儲空間
在getBatch方法中，通過cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice) 將BatchStream提供的一次 batch數據從host 傳送到device端
在readCalibrationCache/writeCalibrationCache方法中，將網絡每一層校準後得到的Calibration閾值保存到CalibrationTable以便後續讀取重複使用

bool SampleINT8::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
    SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
    SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser, DataType dataType)
{
......

    if (mParams.dlaCore >= 0)
    {
        samplesCommon::enableDLA(builder.get(), config.get(), mParams.dlaCore);
        if (mParams.batchSize > builder->getMaxDLABatchSize())
        {
            gLogError << "Requested batch size " << mParams.batchSize << " is greater than the max DLA batch size of "
                      << builder->getMaxDLABatchSize() << ". Reducing batch size accordingly." << std::endl;
            return false;
        }
    }

    mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
        builder->buildEngineWithConfig(*network, *config), samplesCommon::InferDeleter());
    if (!mEngine)
    {
        return false;
    }

    return true;
}

根據程序輸入參數判斷是否要enable NV DLA硬件加速
創建ICudaEngine用於後續網絡推理過程

推理(Infer)過程

bool SampleINT8::infer(std::pair<float, float>& score, int firstScoreBatch, int nbScoreBatches)
{
    float ms{0.0f};

    // Create RAII buffer manager object
    samplesCommon::BufferManager buffers(mEngine, mParams.batchSize);

    auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
    if (!context)
    {
        return false;
    }

    MNISTBatchStream batchStream(
        mParams.batchSize, nbScoreBatches, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", mParams.dataDirs);
    batchStream.skip(firstScoreBatch);

    Dims outputDims = context->getEngine().getBindingDimensions(
        context->getEngine().getBindingIndex(mParams.outputTensorNames[0].c_str()));
    int outputSize = samplesCommon::volume(outputDims);
    int top1{0}, top5{0};
    float totalTime{0.0f};
......

構建BufferManager 分配Host與Device的存儲空間供推理過程輸入網絡input數據和獲取網絡output數據，關於BufferManager數據結構的詳細分析請參考我的博文 TensorRT sampleMNIST 詳解
創建推理需要的IExecutionContext對象
創建Score數據集的BatchStream，用於獲取Score數據集的data與label數據
獲取網絡output Tensor的維度outputDims，計算出outputSzie的大小

bool SampleINT8::processInput(const samplesCommon::BufferManager& buffers, const float* data)
{
    // Fill data buffer
    float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer(mParams.inputTensorNames[0]));
    std::memcpy(hostDataBuffer, data, mParams.batchSize * samplesCommon::volume(mInputDims) * sizeof(float));
    return true;
}

......

bool SampleINT8::infer(std::pair<float, float>& score, int firstScoreBatch, int nbScoreBatches)
{
......

while (batchStream.next())
    {
        // Read the input data into the managed buffers
        assert(mParams.inputTensorNames.size() == 1);
        if (!processInput(buffers, batchStream.getBatch()))
        {
            return false;
        }

        // Memcpy from host input buffers to device input buffers
        buffers.copyInputToDevice();

        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        // Use CUDA events to measure inference time
        cudaEvent_t start, end;
        CHECK(cudaEventCreateWithFlags(&start, cudaEventBlockingSync));
        CHECK(cudaEventCreateWithFlags(&end, cudaEventBlockingSync));
        cudaEventRecord(start, stream);

        bool status = context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr);
        if (!status)
        {
            return false;
        }

        cudaEventRecord(end, stream);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&ms, start, end);
        cudaEventDestroy(start);
        cudaEventDestroy(end);

        totalTime += ms;

        // Memcpy from device output buffers to host output buffers
        buffers.copyOutputToHost();

        CHECK(cudaStreamDestroy(stream));

        top1 += calculateScore(buffers, batchStream.getLabels(), mParams.batchSize, outputSize, 1);
        top5 += calculateScore(buffers, batchStream.getLabels(), mParams.batchSize, outputSize, 5);

        if (batchStream.getBatchesRead() % 100 == 0)
        {
            gLogInfo << "Processing next set of max 100 batches" << std::endl;
        }
    }

    int imagesRead = batchStream.getBatchesRead() * mParams.batchSize;
    score.first = float(top1) / float(imagesRead);
    score.second = float(top5) / float(imagesRead);

    gLogInfo << "Top1: " << score.first << ", Top5: " << score.second << std::endl;
    gLogInfo << "Processing " << imagesRead << " images averaged " << totalTime / imagesRead << " ms/image and "
             << totalTime / batchStream.getBatchesRead() << " ms/batch." << std::endl;

    return true;

while循環持續從score數據集合中按batchSize的大小讀取一批input數據
通過processInput函數將input數據copy到BufferManager的host 存儲空間中
通過buffers.copyInputToDevice(); 將BufferManager的host 存儲空間數據傳送到GPU device 存儲空間，實現input數據的輸入
通過cudaStreamCreate 創建並行計算的stream對象
通過cudaEventCreateWithFlags創建統計時間的start和end 對象
通過context->enqueue 開始執行在score數據集上的網絡推理過程
通過cudaEventSynchronize同步等待cuda並行計算的推理過程結束
通過cudaEventElapsedTime計算本次推理過程的時間
通過buffers.copyOutputToHost(); 將推理的output從GPU device端傳送到host 端
通過calculateScore函數來計算本次推理過程的精度，具體過程如下

int SampleINT8::calculateScore(
    const samplesCommon::BufferManager& buffers, float* labels, int batchSize, int outputSize, int threshold)
{
    float* probs = static_cast<float*>(buffers.getHostBuffer(mParams.outputTensorNames[0]));

    int success = 0;
    for (int i = 0; i < batchSize; i++)
    {
        float *prob = probs + outputSize * i, correct = prob[(int) labels[i]];

        int better = 0;
        for (int j = 0; j < outputSize; j++)
        {
            if (prob[j] >= correct)
            {
                better++;
            }
        }
        if (better <= threshold)
        {
            success++;
        }
    }
    return success;
}

probs與labels指針的關係如下圖所示

特別解釋一下for循環中的better和success的計算規則
calculateScore函數本質上是計算在1次Batch推理結果中排Top1/Top5 的正確結果個數，其中Top1/Top5的計算規則如下圖所示

最終calculateScore函數返回一個BatchSize的 input image推理後，概率排名Top1/Top5 的正確output結果個數

bool SampleINT8::infer(std::pair<float, float>& score, int firstScoreBatch, int nbScoreBatches)
{
......

    int imagesRead = batchStream.getBatchesRead() * mParams.batchSize;
    score.first = float(top1) / float(imagesRead);
    score.second = float(top5) / float(imagesRead);

    gLogInfo << "Top1: " << score.first << ", Top5: " << score.second << std::endl;
    gLogInfo << "Processing " << imagesRead << " images averaged " << totalTime / imagesRead << " ms/image and "
             << totalTime / batchStream.getBatchesRead() << " ms/batch." << std::endl;

    return true;
}

總共向網絡輸入了imagesRead個image做推理，計算推理結果中Top1/Top5的正確結果個數佔總輸入image的比例輸出到log，即如下的程序運行log

&&&& RUNNING TensorRT.sample_int8 # ./sample_int8 mnist
[I] FP32 run:400 batches of size 100 starting at 100
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Top1: 0.9904, Top5: 1
[I] Processing 40000 images averaged 0.00170236 ms/image and 0.170236 ms/batch.
[I] FP16 run:400 batches of size 100 starting at 100
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Top1: 0.9904, Top5: 1
[I] Processing 40000 images averaged 0.00128872 ms/image and 0.128872 ms/batch.

INT8 run:400 batches of size 100 starting at 100
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Processing next set of max 100 batches
[I] Top1: 0.9908, Top5: 1
[I] Processing 40000 images averaged 0.000946117 ms/image and 0.0946117 ms/batch.
&&&& PASSED TensorRT.sample_int8 # ./sample_int8 mnist

資源釋放

int main(int argc, char** argv)
{
......

    auto isApproximatelyEqual = [](float a, float b, double tolerance) { return (std::abs(a - b) <= tolerance); };
    double fp16tolerance{0.5}, int8tolerance{1.0};

    if (scores[1].first != 0.0f && !isApproximatelyEqual(scores[0].first, scores[1].first, fp16tolerance))
    {
        gLogError << "FP32(" << scores[0].first << ") and FP16(" << scores[1].first
                  << ") Top1 accuracy differ by more than " << fp16tolerance << "." << std::endl;
        return gLogger.reportFail(sampleTest);
    }
    if (scores[2].first != 0.0f && !isApproximatelyEqual(scores[0].first, scores[2].first, int8tolerance))
    {
        gLogError << "FP32(" << scores[0].first << ") and Int8(" << scores[2].first
                  << ") Top1 accuracy differ by more than " << int8tolerance << "." << std::endl;
        return gLogger.reportFail(sampleTest);
    }
    if (scores[1].second != 0.0f && !isApproximatelyEqual(scores[0].second, scores[1].second, fp16tolerance))
    {
        gLogError << "FP32(" << scores[0].second << ") and FP16(" << scores[1].second
                  << ") Top5 accuracy differ by more than " << fp16tolerance << "." << std::endl;
        return gLogger.reportFail(sampleTest);
    }
    if (scores[2].second != 0.0f && !isApproximatelyEqual(scores[0].second, scores[2].second, int8tolerance))
    {
        gLogError << "FP32(" << scores[0].second << ") and INT8(" << scores[2].second
                  << ") Top5 accuracy differ by more than " << int8tolerance << "." << std::endl;
        return gLogger.reportFail(sampleTest);
    }

    if (!sample.teardown())
    {
        return gLogger.reportFail(sampleTest);
    }

    return gLogger.reportPass(sampleTest);
}

計算不同dataType中Top1和Top5的精度差異是否符合預設的閾值xxxtolerance
通過teardown釋放程序的資源

【代碼分析】TensorRT sampleINT8 詳解

前言

代碼分析

Main入口

構建(Build)網絡

BatchStream

推理(Infer)過程

資源釋放

AI 畫圖真刺激，手把手教你如何用 ComfyUI 來畫出刺激的圖

公司剛入職了一名 Java 中級開發，短短 4 行代碼居然湊齊了 3 個 bug！我哭了~~

公衆號5月C#/.NET熱文一覽

git 下載大陸鏡像地址

【架構分析】Fuchsia FIDL IPC 詳解

【代碼分析】TensorRT sampleINT8 詳解

【代碼分析】TensorRT sampleMNIST 詳解

【工程技術】QEMU運行Linux Kernel環境配置

【芯片原理】NPU矩陣乘法加速詳解

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結