MNN Quantization Source Code Explained

Reference: https://www.zhihu.com/question/337513515

Using the MNN Quantization Tool

Build

 cd MNN
 mkdir build
 cd build
 cmake -DMNN_BUILD_QUANTOOLS=ON ..
 make -j4

Usage

./quantized.out origin.mnn quantized.mnn ModelConfig.json

Alternatively, install MNN with pip and use the Python quantization entry point:

pip install mnn
mnnquant origin.mnn quantized.mnn ModelConfig.json

ModelConfig.json configuration format

{
    "format":"GRAY",
    "mean":[
		0
    ],
    "normal":[
        0.00784314
    ],
    "width":28,
    "height":28,
    "path":"/mldb/dataset/MNIST/test_data/8",
    "used_image_num":100,
    "feature_quantize_method":"KL",
    "weight_quantize_method":"MAX_ABS"
}

format

Images are always read as RGBA and then converted to the format specified here. Options: "RGB", "BGR", "RGBA", "GRAY".

mean, normal

The mean and normal values used by the model's preprocessing. The input data is computed as dst = (src - mean) * normal. With the example config above (mean 0, normal 0.00784314 ≈ 1/127.5), a pixel value of 255 maps to 2.0.

width, height

Width and height of the model input.

path

Directory containing the images used to calibrate the feature quantization scales.

used_image_num

Number of images from the directory above to use for calibration. By default all images under path are used.

Note: make sure that the data produced by the preprocessing steps above is exactly what the model's input expects.

feature_quantize_method

Specifies how the feature (activation) quantization scales are computed. Options:
"KL": calibrate the feature quantization scales with KL divergence; typically needs 100 to 1000 images
"ADMM": calibrate the feature quantization scales with ADMM (Alternating Direction Method of Multipliers); typically needs one batch of data
Default: "KL"

weight_quantize_method

Specifies the weight quantization method. Options:
"MAX_ABS": symmetric quantization using the maximum absolute value of the weights (see the sketch below)
"ADMM": quantize the weights with the ADMM method
Default: "MAX_ABS"
The feature and weight quantization methods above can be tried in different combinations; pick whichever gives the best accuracy.
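
For intuition only, here is a minimal sketch of symmetric MAX_ABS quantization for one channel of weights. The function name quantizeMaxAbs is made up for illustration; this is not the tool's actual QuantizeConvPerChannel implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Sketch: symmetric MAX_ABS quantization of one channel of float weights to int8.
std::vector<int8_t> quantizeMaxAbs(const std::vector<float>& weights, float& scale) {
    float maxAbs = 0.0f;
    for (float w : weights) {
        maxAbs = std::max(maxAbs, std::fabs(w)); // largest absolute weight in this channel
    }
    scale = maxAbs / 127.0f; // symmetric scale: maxAbs maps to +/-127
    std::vector<int8_t> quantized(weights.size());
    for (size_t i = 0; i < weights.size(); ++i) {
        float q = (scale > 0.0f) ? std::roundf(weights[i] / scale) : 0.0f;
        q = std::fmin(127.0f, std::fmax(-127.0f, q)); // clamp to the int8 symmetric range
        quantized[i] = static_cast<int8_t>(q);
    }
    return quantized;
}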

Source Code Walkthrough

The main function: quantized.cpp

int main(int argc, const char* argv[]) {
    if (argc < 4) {
        DLOG(INFO) << "Usage: ./quantized.out src.mnn dst.mnn preTreatConfig.json\n";
        return 0;
    }
    const char* modelFile      = argv[1];
    const char* preTreatConfig = argv[3];
    const char* dstFile        = argv[2];
    DLOG(INFO) << ">>> modelFile: " << modelFile;
    DLOG(INFO) << ">>> preTreatConfig: " << preTreatConfig;
    DLOG(INFO) << ">>> dstFile: " << dstFile;
    std::unique_ptr<MNN::NetT> netT;
    { // read the original model file and build a NetT object with flatbuffers
        std::ifstream input(modelFile);
        std::ostringstream outputOs;
        outputOs << input.rdbuf();
        netT = MNN::UnPackNet(outputOs.str().c_str()); // get the NetT object
    }
    // temp build net for inference
    flatbuffers::FlatBufferBuilder builder(1024);
    auto offset = MNN::Net::Pack(builder, netT.get()); // pack the model into the buffer
    builder.Finish(offset);
    int size      = builder.GetSize();
    auto ocontent = builder.GetBufferPointer();
    // create two buffers, both holding a copy of the model data
    std::unique_ptr<uint8_t> modelForInference(new uint8_t[size]);
    memcpy(modelForInference.get(), ocontent, size);
    std::unique_ptr<uint8_t> modelOriginal(new uint8_t[size]);
    memcpy(modelOriginal.get(), ocontent, size);

    netT.reset();
    netT = MNN::UnPackNet(modelOriginal.get());
    // run the quantization; the Calibration class does the main work
    DLOG(INFO) << "Calibrate the feature and quantize model...";
    std::shared_ptr<Calibration> calibration(
        new Calibration(netT.get(), modelForInference.get(), size, preTreatConfig));
    calibration->runQuantizeModel();
    DLOG(INFO) << "Quantize model done!";
    // write the quantized model into a FlatBufferBuilder
    flatbuffers::FlatBufferBuilder builderOutput(1024);
    builderOutput.ForceDefaults(true);
    auto len = MNN::Net::Pack(builderOutput, netT.get());
    builderOutput.Finish(len);
    // dump the FlatBufferBuilder content to a file: this is the quantized model
    {
        std::ofstream output(dstFile);
        output.write((const char*)builderOutput.GetBufferPointer(), builderOutput.GetSize());
    }
}

The Calibration class

The core class of MNN quantization: it performs both weight quantization and feature (activation) quantization.

Calibration.hpp

class Calibration {
public:
    // arguments: the original model, the model buffer (uint8_t), its size, and the json config path
    Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath);

    void runQuantizeModel();

private:
    Calibration();
    MNN::NetT* _originaleModel;                      // the model to be quantized
    std::shared_ptr<MNN::CV::ImageProcess> _process; // converts images into tensors
    const int _binNums = 2048;
    int _imageNum      = 0;
    int _width;
    int _height;
    std::vector<std::string> _imgaes; // images used to calibrate the feature quantization scales

    // Tensor and Info
    // maps each tensor to its TensorStatistic; a TensorStatistic holds the statistics
    // that tensor needs during quantization (explained later)
    std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
    // all tensors
    std::map<int, const MNN::Tensor*> _tensorMap;

    // Op's name, Inputs, Outputs
    // maps each op to its input/output tensors
    std::map<std::string, std::pair<std::vector<MNN::Tensor*>, std::vector<MNN::Tensor*>>> _opInfo;

    // The scale results
    std::map<const MNN::Tensor*, std::vector<float>> _scales;

    std::shared_ptr<MNN::Interpreter> _interpreter;
    // keep mnn forward information
    MNN::Session* _session;
    MNN::Tensor* _inputTensor;
    std::vector<int> _inputTensorDims;

    std::string _featureQuantizeMethod = "KL";
    std::string _weightQuantizeMethod  = "MAX_ABS";

    void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels);
    void _initMaps();

    void _computeFeatureMapsRange();
    void _collectFeatureMapsDistribution();
    void _computeFeatureScaleKL();
    void _computeFeatureScaleADMM();
    void _updateScale();

    // insert the dequantization op before the not supported op(int8), and insert dequantization op
    // after the output op, so that get original float data conveniently
    void _insertDequantize();
};

#endif // CALIBRATION_HPP

The Calibration constructor

Calibration::Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
    : _originaleModel(model) {
    // when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
    int channles = 3;
    // parse the json config
    rapidjson::Document document;
    {
        std::ifstream fileNames(configPath.c_str());
        std::ostringstream output;
        output << fileNames.rdbuf();
        auto outputStr = output.str();
        document.Parse(outputStr.c_str());
        if (document.HasParseError()) {
            MNN_ERROR("Invalid json\n");
            return;
        }
    }
    auto picObj = document.GetObject();
    // build the ImageProcess::Config object and fill it from the json
    ImageProcess::Config config;
    config.filterType = BILINEAR;
    config.destFormat = BGR;
    {
        if (picObj.HasMember("format")) {
            auto format = picObj["format"].GetString();
            static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}};
            if (formatMap.find(format) != formatMap.end()) {
                config.destFormat = formatMap.find(format)->second;
            }
        }
    }

    if (config.destFormat == GRAY) {
        channles = 1;
    }

    config.sourceFormat = RGBA;
    std::string imagePath;
    _imageNum = 0;
    {
        if (picObj.HasMember("mean")) {
            auto mean = picObj["mean"].GetArray();
            int cur   = 0;
            for (auto iter = mean.begin(); iter != mean.end(); iter++) {
                config.mean[cur++] = iter->GetFloat();
            }
        }
        if (picObj.HasMember("normal")) {
            auto normal = picObj["normal"].GetArray();
            int cur     = 0;
            for (auto iter = normal.begin(); iter != normal.end(); iter++) {
                config.normal[cur++] = iter->GetFloat();
            }
        }
        if (picObj.HasMember("width")) {
            _width = picObj["width"].GetInt();
        }
        if (picObj.HasMember("height")) {
            _height = picObj["height"].GetInt();
        }
        if (picObj.HasMember("path")) {
            imagePath = picObj["path"].GetString();
        }
        if (picObj.HasMember("used_image_num")) {
            _imageNum = picObj["used_image_num"].GetInt();
        }
        if (picObj.HasMember("feature_quantize_method")) {
            std::string method = picObj["feature_quantize_method"].GetString();
            if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
                _featureQuantizeMethod = method;
            } else {
                MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
                return;
            }
        }
        if (picObj.HasMember("weight_quantize_method")) {
            std::string method = picObj["weight_quantize_method"].GetString();
            if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
                _weightQuantizeMethod = method;
            } else {
                MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
                return;
            }
        }
        DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
        DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
    }
    std::shared_ptr<ImageProcess> process(ImageProcess::create(config)); // create the ImageProcess object
    _process = process;

    // read images file names
    Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);

    _initMNNSession(modelBuffer, bufferSize, channles);
    _initMaps();
}

The _initMNNSession function

Sets everything up for inference: it creates the interpreter and session, and resizes the input tensor so the model is ready to run.

void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
    _interpreter.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
    MNN::ScheduleConfig config;
    _session     = _interpreter->createSession(config);
    _inputTensor = _interpreter->getSessionInput(_session, NULL);

    _inputTensorDims.resize(4);
    auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
    DCHECK(4 == _inputTensor->dimensions()) << "Only support 4 dimensions input";
    if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
        _inputTensorDims[0] = 1;
        _inputTensorDims[1] = _height;
        _inputTensorDims[2] = _width;
        _inputTensorDims[3] = channels;
    } else if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NC4HW4) {
        _inputTensorDims[0] = 1;
        _inputTensorDims[1] = channels;
        _inputTensorDims[2] = _height;
        _inputTensorDims[3] = _width;
    } else {
        DLOG(ERROR) << "Input Data Format ERROR!";
    }

    if (_featureQuantizeMethod == "KL") {
        _interpreter->resizeTensor(_inputTensor, _inputTensorDims);
        _interpreter->resizeSession(_session);
    } else if (_featureQuantizeMethod == "ADMM") {
        DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
        _inputTensorDims[0] = _imageNum;
        _interpreter->resizeTensor(_inputTensor, _inputTensorDims);
        _interpreter->resizeSession(_session);
    }
    _interpreter->releaseModel();
}

The _initMaps function

Initializes the maps. Two callbacks are registered; before and after each op runs they record the op-to-input/output-tensor mappings and create the feature statistics (TensorStatistic) and op info.

void Calibration::_initMaps() {
    _featureInfo.clear();
    _opInfo.clear();
    _tensorMap.clear();
    // run mnn once, initialize featureMap, opInfo map
    // MNN provides per-op callbacks, one invoked before and one after each op runs.
    // The "before" callback creates a TensorStatistic object for each input tensor
    // and fills _opInfo with the op -> input tensor mapping.
    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        _opInfo[info->name()].first = nTensors;
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) == _featureInfo.end()) {
                    _featureInfo[t] = std::shared_ptr<TensorStatistic>(
                        new TensorStatistic(t, _featureQuantizeMethod, info->name() + "__input"));
                }
            }
        }
        return false;
    };
    // The "after" callback creates a TensorStatistic object for each output tensor
    // and fills _opInfo with the op -> output tensor mapping.
    MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
                                               const MNN::OperatorInfo* info) {
        _opInfo[info->name()].second = nTensors;
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) == _featureInfo.end()) {
                    _featureInfo[t] =
                        std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, info->name()));
                }
            }
        }
        return true;
    };
    _interpreter->runSessionWithCallBackInfo(_session, before, after);
    // walk the ops and add <input/output index, tensor> pairs into _tensorMap
    for (auto& op : _originaleModel->oplists) {
        if (_opInfo.find(op->name) == _opInfo.end()) {
            continue;
        }
        for (int i = 0; i < op->inputIndexes.size(); ++i) {
            _tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
        }
        for (int i = 0; i < op->outputIndexes.size(); ++i) {
            _tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
        }
    }

    if (_featureQuantizeMethod == "KL") {
        // set the tensor-statistic method of input tensor as THRESHOLD_MAX
        auto inputTensorStatistic = _featureInfo.find(_inputTensor);
        if (inputTensorStatistic != _featureInfo.end()) {
            inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
        }
    }
}

The TensorStatistic class

The Calibration instance holds the map

std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;

so every tensor has a corresponding TensorStatistic object; the statistics gathered for a tensor during quantization are stored in it.

TensorStatistic.hpp

class TensorStatistic {
 public:
     TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, int binNumber = 2048, GET_THRESHOLD_METHOD thresholdMethod = THRESHOLD_KL);
    .  .  .
     void updateRange();
     void resetDistribution();
     void updateDistribution();
     void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod);
     void setChannelWise(bool mergeChannel);
     std::vector<float> finishAndCompute();
     // only this one for ADMM
     std::vector<float> computeScaleADMM();
 private:
     int _computeThreshold(const std::vector<float>& distribution);
     std::vector<std::pair<float, float>> mRangePerChannel; // per-channel min/max of this tensor
     // maxValue of a channel is max(|min|, |max|) of the values seen on that channel
     std::vector<float> mIntervals;   // per-channel interval = mBinNumber / maxValue
     std::vector<bool> mValidChannel; // true if the channel's maxValue > 0.00001f
     std::vector<std::vector<float>> mDistribution; // per-channel histogram: float values are mapped uniformly onto [0, mBinNumber) and the hits per bin are counted

     std::shared_ptr<MNN::Tensor> mHostTensor; // CPU-side copy of this tensor
     const MNN::Tensor* mOriginTensor;         // the original tensor
     int mBinNumber;                           // defaults to 2048
     bool mUpdatedDistributionFlag = false;
     bool mUpdatedRangeFlags       = false;

     bool mMergeChannel                    = true;
     std::string mName;
     GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL;
 };

The runQuantizeModel function

Two feature quantization methods are supported: KL and ADMM.

 void Calibration::runQuantizeModel() {
     if (_featureQuantizeMethod == "KL") {          // KL divergence calibration selected in the config file
         _computeFeatureScaleKL();
     } else if (_featureQuantizeMethod == "ADMM") { // ADMM calibration selected in the config file
         _computeFeatureScaleADMM();
     }
     _updateScale();
     _insertDequantize();
 }

The _computeFeatureScaleKL function

void Calibration::_computeFeatureScaleKL() {
     _computeFeatureMapsRange();        // compute the value range of each feature map
     _collectFeatureMapsDistribution(); // compute the value distribution of each feature map

     _scales.clear();
     for (auto& iter : _featureInfo) {
         AUTOTIME;
         _scales[iter.first] = iter.second->finishAndCompute(); // the scale factors
     }
     //_featureInfo.clear();//No need now
 }

The _computeFeatureMapsRange function

Runs inference with the given set of images as model input. Before and after each op executes, the per-channel min and max of the feature maps are recorded into each tensor's associated TensorStatistic instance. The updateRange function itself is in TensorStatistic.cpp.

void Calibration::_computeFeatureMapsRange() {
     // feed input data according to input images
     int count = 0;
     for (const auto& img : _imgaes) { // for each image file
         for (auto& iter : _featureInfo) {
             iter.second->resetUpdatedRangeFlags();
         }
         count++;
         // read the image and write it into the input tensor
         Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
         // set up callbacks that run before and after each op during inference
         MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
                                                  const MNN::OperatorInfo* info) {
             for (auto t : nTensors) {
                 if (_featureInfo.find(t) != _featureInfo.end()) {
                     _featureInfo[t]->updateRange(); // record the min/max of the input tensor
                 }
             }
             return true;
         };
         MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
                                                 const MNN::OperatorInfo* info) {
             for (auto t : nTensors) {
                 if (_featureInfo.find(t) != _featureInfo.end()) {
                     _featureInfo[t]->updateRange(); // record the min/max of the output tensor
                 }
             }
             return true;
         };
         // run one inference pass
         _interpreter->runSessionWithCallBackInfo(_session, before, after);
         MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
         fflush(stdout);
     }
     MNN_PRINT("\n");
 }

The updateRange function

Records the per-channel minimum and maximum of the feature map.

void TensorStatistic::updateRange() {
    if (mUpdatedRangeFlags) {
        return;
    }
    mUpdatedRangeFlags = true;
    mOriginTensor->copyToHostTensor(mHostTensor.get());
    int batch   = mHostTensor->batch();
    int channel = mHostTensor->channel();
    int width   = mHostTensor->width();
    int height  = mHostTensor->height();
    auto area   = width * height;

    for (int n = 0; n < batch; ++n) {
        auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);
        for (int c = 0; c < channel; ++c) {
            int cIndex = c;
            if (mMergeChannel) {
                cIndex = 0;
            }
            auto minValue    = mRangePerChannel[cIndex].first;
            auto maxValue    = mRangePerChannel[cIndex].second;
            auto dataChannel = dataBatch + c * mHostTensor->stride(1);
            for (int v = 0; v < area; ++v) {
                minValue = std::min(minValue, dataChannel[v]);
                maxValue = std::max(maxValue, dataChannel[v]);
            }
            mRangePerChannel[cIndex].first  = minValue;
            mRangePerChannel[cIndex].second = maxValue;
        }
    }
}

The _collectFeatureMapsDistribution function

Collects the data distribution of every tensor. The resetDistribution and updateDistribution functions are in TensorStatistic.cpp.

void Calibration::_collectFeatureMapsDistribution() {
     for (auto& iter : _featureInfo) {
         iter.second->resetDistribution(); // reset the distribution data kept in each TensorStatistic instance
     }
     // define two callbacks, executed before and after each op
     // feed input data according to input images
     MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
         for (auto t : nTensors) {
             if (_featureInfo.find(t) != _featureInfo.end()) {
                 _featureInfo[t]->updateDistribution(); // update the distribution for this input tensor
             }
         }
         return true;
     };
     MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
         for (auto t : nTensors) {
             if (_featureInfo.find(t) != _featureInfo.end()) {
                 _featureInfo[t]->updateDistribution(); // update the distribution for this output tensor
             }
         }
         return true;
     };
     int count = 0;
     for (const auto& img : _imgaes) { // run one inference pass for every image
         count++;
         for (auto& iter : _featureInfo) {
             iter.second->resetUpdatedDistributionFlag();
         }
         Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); // load the image into the input tensor
         _interpreter->runSessionWithCallBackInfo(_session, before, after); // run inference
         MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
         fflush(stdout);
     }
     MNN_PRINT("\n");
 }

The resetDistribution and updateDistribution functions

void TensorStatistic::resetDistribution() { // reset the distribution
     for (int i = 0; i < mIntervals.size(); ++i) { // work on each channel
         int cIndex = i;
         if (mMergeChannel) {
             cIndex = 0;
         }
         // maxValue is the largest absolute value seen on this channel
         auto maxValue         = std::max(fabsf(mRangePerChannel[cIndex].second), fabsf(mRangePerChannel[cIndex].first));
         mValidChannel[cIndex] = maxValue > 0.00001f; // the channel is only valid if maxValue is large enough
         mIntervals[cIndex]    = 0.0f;
         if (mValidChannel[cIndex]) {
             // mIntervals is the factor that maps the float range [0, maxValue) uniformly onto [0, mBinNumber)
             mIntervals[cIndex] = (float)mBinNumber / maxValue;
         }
     }
     for (auto& c : mDistribution) {
         std::fill(c.begin(), c.end(), 1.0e-07); // initialize mDistribution with a tiny value close to 0
     }
 }

 void TensorStatistic::updateDistribution() {
     if (mUpdatedDistributionFlag) { // only update once per inference pass
         return;
     }
     mUpdatedDistributionFlag = true;
     // copy the tensor data to the host and read its shape
     mOriginTensor->copyToHostTensor(mHostTensor.get());
     int batch   = mHostTensor->batch();
     int channel = mHostTensor->channel();
     int width   = mHostTensor->width();
     int height  = mHostTensor->height();
     auto area   = width * height;

     for (int n = 0; n < batch; ++n) {
         auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0); // data of this batch
         for (int c = 0; c < channel; ++c) { // process each channel
             int cIndex = c;
             if (mMergeChannel) {
                 cIndex = 0;
             }
             if (!mValidChannel[cIndex]) {
                 continue;
             }
             auto multi       = mIntervals[cIndex];           // bin interval factor
             auto target      = mDistribution[cIndex].data(); // histogram of this channel
             auto dataChannel = dataBatch + c * mHostTensor->stride(1); // feature map of this channel
             for (int v = 0; v < area; ++v) {
                 auto data = dataChannel[v]; // one value of the feature map
                 if (data == 0) {
                     continue;
                 }
                 int index = static_cast<int>(fabs(data) * multi); // map the value uniformly onto an integer bin index
                 index     = std::min(index, mBinNumber - 1);      // clamp to at most mBinNumber - 1
                 target[index] += 1.0f; // accumulate into mDistribution
             }
         }
     }
 }

The finishAndCompute function

Computes the scale factors that map float32 to int8.

std::vector<float> TensorStatistic::finishAndCompute() {
     std::vector<float> scaleValue(mDistribution.size(), 0.0f);
     . . . 
     for (int c = 0; c < mDistribution.size(); ++c) { // for each channel
         if (!mValidChannel[c]) {
             continue;
         }
         float sum          = 0.0f;
         auto& distribution = mDistribution[c];
         std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });    // total count
         std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); // normalize into a probability distribution
         auto threshold = _computeThreshold(distribution); // pick a clipping threshold
         scaleValue[c]  = ((float)threshold + 0.5) / mIntervals[c] / 127.0; // compute the scale factor
     }
     return scaleValue;
 }
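
Reading the scale formula: since mIntervals[c] equals mBinNumber / maxValue, the term (threshold + 0.5) / mIntervals[c] converts the chosen bin index back into a float magnitude T, the clipping threshold in the original float range; dividing T by 127 then gives the step size that maps float values in [-T, T] onto int8 values in [-127, 127].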

The _computeThreshold function

Searches for a threshold between targetBinNums and mBinNumber that minimizes the KL divergence.

int TensorStatistic::_computeThreshold(const std::vector<float>& distribution) {
     const int targetBinNums = 128;
     int threshold           = targetBinNums; // defaults to 128
     if (mThresholdMethod == THRESHOLD_KL) { // search the threshold via KL divergence
         float minKLDivergence   = 10000.0f;
         float afterThresholdSum = 0.0f;
         // targetBinNums = 128, mBinNumber = 2048
         std::for_each(distribution.begin() + targetBinNums, distribution.end(),
                       [&](float n) { afterThresholdSum += n; });
         for (int i = targetBinNums; i < mBinNumber; ++i) { // i runs from 128 to 2047; each i is a candidate threshold
             std::vector<float> quantizedDistribution(targetBinNums);
             std::vector<float> candidateDistribution(i);
             std::vector<float> expandedDistribution(i);
             // candidateDistribution keeps bins [0, i) and folds the mass of bins [i, mBinNumber) into bin i-1
             std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
             candidateDistribution[i - 1] += afterThresholdSum;
             afterThresholdSum -= distribution[i];
             // bin interval when shrinking a space of size i down to targetBinNums bins
             const float binInterval = (float)i / (float)targetBinNums; // merge i bins into targetBinNums bins; j runs from 0 to 127
             // map the distribution over [0, i) onto the smaller space [0, targetBinNums);
             // the (lossy) result is stored in quantizedDistribution
             for (int j = 0; j < targetBinNums; ++j) {
                 // [j, j+1) in the targetBinNums space
                 // [start, end] is the corresponding interval in the i-bin space
                 const float start = j * binInterval;
                 const float end   = start + binInterval;
                 // round up
                 const int leftUpper = static_cast<int>(std::ceil(start));
                 if (leftUpper > start) {
                     const float leftScale = leftUpper - start;
                     quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
                 }
                 // round down
                 const int rightLower = static_cast<int>(std::floor(end));
                 if (rightLower < end) {
                     const float rightScale = end - rightLower;
                     quantizedDistribution[j] += rightScale * distribution[rightLower];
                 }
                 // accumulate the whole bins into the [0, targetBinNums) distribution
                 std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
                               [&](float n) { quantizedDistribution[j] += n; });
             }
             // expand target bins back to i bins: map the small space [0, targetBinNums) back onto [0, i);
             // the result is stored in expandedDistribution
             for (int j = 0; j < targetBinNums; ++j) {
                 const float start   = j * binInterval;
                 const float end     = start + binInterval;
                 float count         = 0;
                 const int leftUpper = static_cast<int>(std::ceil(start));
                 float leftScale     = 0.0f;
                 if (leftUpper > start) {
                     leftScale = leftUpper - start;
                     if (distribution[leftUpper - 1] != 0) {
                         count += leftScale;
                     }
                 }
                 const int rightLower = static_cast<int>(std::floor(end));
                 float rightScale     = 0.0f;
                 if (rightLower < end) {
                     rightScale = end - rightLower;
                     if (distribution[rightLower] != 0) {
                         count += rightScale;
                     }
                 }

                 std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
                     if (n != 0) {
                         count += 1;
                     }
                 });
                 if (count == 0) {
                     continue;
                 }
                 const float toExpandValue = quantizedDistribution[j] / count;
                 if (leftUpper > start && distribution[leftUpper - 1] != 0) {
                     expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
                 }
                 if (rightLower < end && distribution[rightLower] != 0) {
                     expandedDistribution[rightLower] += toExpandValue * rightScale;
                 }
                 for (int k = leftUpper; k < rightLower; ++k) {
                     if (distribution[k] != 0) {
                         expandedDistribution[k] += toExpandValue;
                     }
                 }
             }
             // KL divergence formula: Sum(P[i] * log(P[i] / Q[i]))
             const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
             if (curKL < minKLDivergence) { // keep the threshold that gives the smallest KL divergence
                 minKLDivergence = curKL;
                 threshold       = i;
             }
         }
     } else if (mThresholdMethod == THRESHOLD_MAX) {
         threshold = mBinNumber - 1;
     } else {
         // TODO, support other method
         MNN_ASSERT(false);
     }
     return threshold;
 }
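
The _klDivergence helper is called above but its body is not shown in this article. A minimal sketch of what such a helper computes, following the Sum(P[i] * log(P[i] / Q[i])) formula in the comment (this is an illustration and may differ from MNN's exact implementation, e.g. in how zero bins are handled):

#include <cmath>
#include <vector>

// Sketch: KL divergence between two discrete distributions of equal length.
static float klDivergenceSketch(const std::vector<float>& p, const std::vector<float>& q) {
    float result = 0.0f;
    for (size_t i = 0; i < p.size(); ++i) {
        if (p[i] > 0.0f && q[i] > 0.0f) {
            result += p[i] * std::log(p[i] / q[i]); // Sum(P[i] * log(P[i] / Q[i]))
        }
    }
    return result;
}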

Weight quantization: the _updateScale function

Quantizes the weights and converts each quantizable op to its int8 version.

void Calibration::_updateScale() {
     for (const auto& op : _originaleModel->oplists) {
         const auto opType = op->type;
         // only Convolution, ConvolutionDepthwise and Eltwise ops are handled
         if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
             opType != MNN::OpType_Eltwise) {
             continue;
         }
         auto tensorsPair = _opInfo.find(op->name);
         if (tensorsPair == _opInfo.end()) {
             MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
         }
         if (opType == MNN::OpType_Eltwise) { // Eltwise op
             auto param = op->main.AsEltwise();
             // Now only support AddInt8
             if (param->type != MNN::EltwiseType_SUM) {
                 continue;
             }
             // fetch the scales computed earlier
             const auto& inputScale0   = _scales[tensorsPair->second.first[0]];
             const auto& inputScale1   = _scales[tensorsPair->second.first[1]];
             const auto& outputScale   = _scales[tensorsPair->second.second[0]];
             const int outputScaleSize = outputScale.size();
             std::vector<float> outputInvertScale(outputScaleSize);
             Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
             op->type = MNN::OpType_EltwiseInt8; // switch the op type to the quantized version
             op->main.Reset();                   // rebuild the op parameters
             op->main.type = MNN::OpParameter_EltwiseInt8;
             auto eltwiseInt8Param         = new MNN::EltwiseInt8T;
             auto input0ScaleParam         = new MNN::QuantizedFloatParamT;
             auto input1ScaleParam         = new MNN::QuantizedFloatParamT;
             auto outputScaleParam         = new MNN::QuantizedFloatParamT;
             input0ScaleParam->tensorScale = inputScale0;
             input1ScaleParam->tensorScale = inputScale1;
             outputScaleParam->tensorScale = outputInvertScale;
             // scale parameters needed to recover float32 from int8
             eltwiseInt8Param->inputQuan0  = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
             eltwiseInt8Param->inputQuan1  = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
             eltwiseInt8Param->outputQuan  = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
             op->main.value                = eltwiseInt8Param;
             continue;
         }
         // below is Conv/DepthwiseConv
         const auto& inputScale  = _scales[tensorsPair->second.first[0]];
         const auto& outputScale = _scales[tensorsPair->second.second[0]];
         auto param                = op->main.AsConvolution2D();
         param->common->inputCount = tensorsPair->second.first[0]->channel();
         const int channles        = param->common->outputCount;
         const int weightSize      = param->weight.size();
         param->symmetricQuan.reset(new MNN::QuantizedFloatParamT); // reset the quantization parameters
         auto& quantizedParam = param->symmetricQuan;               // then build the int8 parameters
         quantizedParam->scale.resize(channles);
         quantizedParam->weight.resize(weightSize);
         quantizedParam->bias.resize(channles);
         // compute the quantized weight and bias for conv and depthwise conv respectively
         if (opType == MNN::OpType_Convolution) {
             QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
                                    quantizedParam->weight.data(), quantizedParam->bias.data(),
                                    quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
             op->type = MNN::OpType_ConvInt8;
         } else if (opType == MNN::OpType_ConvolutionDepthwise) {
             QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
                                   quantizedParam->weight.data(), quantizedParam->bias.data(),
                                   quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
             op->type = MNN::OpType_DepthwiseConvInt8;
         }
         if (param->common->relu6) {
             param->common->relu  = true;
             param->common->relu6 = false;
         }
         // the int8 parameters are in place, so the original float weight and bias are cleared
         param->weight.clear();
         param->bias.clear();
     }
 }

Adding dequantization: the _insertDequantize function

Its main job is to insert dequantization ops: any op that does not support int8 needs conversion layers around it. A quantized op outputs integers, but the next op that consumes this output may run in float and expects float input, so a dequantization op has to be inserted.
Roughly, the process is:
1. Walk the ops and collect the input and output tensors of every quantized op.
2. Walk the ops again and find every op that cannot be quantized:
   if such an op consumes a tensor produced by a quantizable op, insert a dequantization (Int8ToFloat) op on that input;
   if such an op produces a tensor consumed by a quantizable op, insert a quantization (FloatToInt8) op on that output.

Finally, a dequantization op is appended after each output tensor of the graph so that the final result is float.

void Calibration::_insertDequantize() {
     // Search All Int Tensors
     std::set<int> int8Tensors;
     std::set<int> int8Outputs;
     for (auto& op : _originaleModel->oplists) { // walk the ops
         if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) { // if the op is int8-quantized, record its inputs and outputs
             for (auto index : op->inputIndexes) { // record its inputs
                 int8Tensors.insert(index);
             }
             for (auto index : op->outputIndexes) { // record its outputs
                 int8Tensors.insert(index);
                 int8Outputs.insert(index);
             }
         }
     }
     // remove from int8Outputs every tensor that is consumed by some op,
     // leaving only the int8 tensors that are final outputs of the graph
     for (auto& op : _originaleModel->oplists) {
         for (auto index : op->inputIndexes) {
             auto iter = int8Outputs.find(index);
             if (iter != int8Outputs.end()) {
                 int8Outputs.erase(iter);
             }
         }
     }
     // Insert Convert For Not Support Int8 Ops: ops that do not support int8 get
     // conversion layers inserted before and after them
     for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) {
         auto op           = iter->get();
         const auto opType = op->type;
         const auto name   = op->name;
         // check whether is output op
         // if Yes, insert dequantization op after this op; ops that support int8 are skipped
         if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) {
             // this is quantized op
             iter++;
             continue;
         }
         auto& inputIndexes  = op->inputIndexes;
         const int inputSize = inputIndexes.size();
         // insert dequantization op before this op
         for (int i = 0; i < inputSize; ++i) { // for every input tensor of this op
             const auto curInputIndex = inputIndexes[i];
             if (int8Tensors.find(curInputIndex) == int8Tensors.end()) { // skip tensors that are not quantized
                 continue;
             }
             auto input        = _tensorMap[curInputIndex];
             auto inputOpScale = _scales[input];
             // construct new op: an int8-to-float (dequantization) op
             auto dequantizationOp       = new MNN::OpT;
             dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
             dequantizationOp->name      = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i);
             // fill in the parameters
             dequantizationOp->type           = MNN::OpType_Int8ToFloat;
             auto dequantizationParam         = new MNN::QuantizedFloatParamT;
             dequantizationOp->main.value     = dequantizationParam;
             dequantizationParam->tensorScale = inputOpScale;

             dequantizationOp->inputIndexes.push_back(curInputIndex);
             dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
             _originaleModel->tensorName.push_back(dequantizationOp->name);
             // reset current op's input index at i: wire the new dequantization op's output into this op's input
             inputIndexes[i] = dequantizationOp->outputIndexes[0];
             iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(dequantizationOp));
             iter++;
         }

         iter++;
         // LOG(INFO) << "insert quantization op after this op if neccessary";
         // insert quantization op after this op if neccessary
         // for every output tensor of this op
         for (int i = 0; i < op->outputIndexes.size(); ++i) {
             const auto outputIndex = op->outputIndexes[i];
             if (int8Tensors.find(outputIndex) == int8Tensors.end()) { // skip tensors that are not quantized
                 continue;
             }
             auto output   = _tensorMap[outputIndex];
             auto curScale = _scales[output];
             // construct one quantization op(FloatToInt8)
             auto quantizationOp        = new MNN::OpT;
             quantizationOp->main.type  = MNN::OpParameter_QuantizedFloatParam;
             quantizationOp->name       = name + "___FloatToInt8___" + flatbuffers::NumToString(i);
             quantizationOp->type       = MNN::OpType_FloatToInt8;
             auto quantizationParam     = new MNN::QuantizedFloatParamT;
             quantizationOp->main.value = quantizationParam;
             // fill in the parameters
             const int channels = curScale.size();
             std::vector<float> quantizationScale(channels);
             Helper::invertData(quantizationScale.data(), curScale.data(), channels);
             quantizationParam->tensorScale = quantizationScale;
             // insert the quantization op right after this op
             quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size());
             quantizationOp->outputIndexes.push_back(outputIndex);
             _originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]);
             _originaleModel->tensorName[outputIndex] = quantizationOp->name;
             op->outputIndexes[i]                     = quantizationOp->inputIndexes[0];

             iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(quantizationOp));
             iter++;
         }
     }
     // Insert Turn float Op for output: append a dequantization op after every int8 output tensor
     // of the graph, because the final graph outputs must still be float
     for (auto index : int8Outputs) {
         // construct new op
         auto dequantizationOp       = new MNN::OpT;
         dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
         dequantizationOp->name      = "___Int8ToFloat___For_" + flatbuffers::NumToString(index);

         dequantizationOp->type           = MNN::OpType_Int8ToFloat;
         auto dequantizationParam         = new MNN::QuantizedFloatParamT;
         dequantizationOp->main.value     = dequantizationParam;
         dequantizationParam->tensorScale = _scales[_tensorMap[index]];

         dequantizationOp->inputIndexes.push_back(index);
         dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
         auto originTensorName              = _originaleModel->tensorName[index];
         _originaleModel->tensorName[index] = dequantizationOp->name;
         _originaleModel->tensorName.emplace_back(originTensorName);

         _originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr<MNN::OpT>(dequantizationOp));
     }
 }

The _computeFeatureScaleADMM function

Computes the feature scales with ADMM; computeScaleADMM is in TensorStatistic.cpp.

void Calibration::_computeFeatureScaleADMM() {
    // feed input data according to input images
    int count                           = 0;
    std::vector<int> oneImageTensorDims = _inputTensorDims;
    oneImageTensorDims[0]               = 1;
    auto inputTensorDataFormat          = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
    auto dimType                        = MNN::Tensor::CAFFE_C4;
    if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
        dimType = MNN::Tensor::TENSORFLOW;
    }

    for (const auto& img : _imgaes) {
        auto curPtr = _inputTensor->host<float>() + count * _inputTensor->stride(0);
        std::shared_ptr<MNN::Tensor> tensorWarp(
            MNN::Tensor::create(oneImageTensorDims, _inputTensor->getType(), curPtr, dimType));
        Helper::preprocessInput(_process.get(), _width, _height, img, tensorWarp.get());

        count++;
        MNN_PRINT("\rProcessImage: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
        fflush(stdout);
    }
    MNN_PRINT("\n");
    _scales.clear();

    const int totalLayers = _featureInfo.size();
    count                 = 0;

    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) != _featureInfo.end()) {
                    _scales[t] = _featureInfo[t]->computeScaleADMM();
                    count++;
                    MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
                    fflush(stdout);
                }
            }
        }
        return true;
    };
    MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
            for (auto t : nTensors) {
                if (_featureInfo.find(t) != _featureInfo.end()) {
                    _scales[t] = _featureInfo[t]->computeScaleADMM();
                    count++;
                    MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
                    fflush(stdout);
                }
            }
        }
        return true;
    };

    _interpreter->runSessionWithCallBackInfo(_session, before, after);
    MNN_PRINT("\n");
}

The computeScaleADMM function

std::vector<float> TensorStatistic::computeScaleADMM() {
    std::vector<float> scaleValue(mOriginTensor->channel(), 0.0f);

    const int count         = mOriginTensor->elementSize();
    float max               = 0;
    const float bound       = 127;
    const float* originData = mOriginTensor->host<float>();

    for (int i = 0; i < count; i++) {
        float absData = std::fabs(originData[i]);
        if (absData > max) {
            max = absData;
        }
    }
    float alpha = max / (bound * 2.5);

    // DLOG(INFO) << "alpha init: " << alpha;

    const int maxStep = 300;
    float sum1        = 0;
    float sum2        = 0;
    float invAlpha;

    for (int i = 0; i < maxStep; i++) {
        sum1     = 0;
        sum2     = 0;
        invAlpha = 1 / alpha;

        for (int i = 0; i < count; i++) {
            auto origin    = originData[i];
            auto dataQuant = std::roundf(origin * invAlpha);
            dataQuant      = std::fmin(bound, std::fmax(-bound, dataQuant));
            sum1 += (dataQuant * origin);
            sum2 += (dataQuant * dataQuant);
        }

        alpha = sum1 / sum2;
    }
    // DLOG(INFO) << "alpha final: " << alpha;

    std::fill(scaleValue.begin(), scaleValue.end(), alpha);
    return scaleValue;
}
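
Despite the ADMM name, the loop above is essentially an alternating least-squares update on a single per-tensor scale: with alpha fixed, each value is quantized as q = clamp(round(w / alpha), -127, 127); with the quantized values fixed, the new alpha = sum1 / sum2 = Σ(q·w) / Σ(q·q), which is the closed-form minimizer of Σ(w - alpha·q)². After maxStep iterations, that single alpha is copied into every channel of scaleValue.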
