參考鏈接:https://www.zhihu.com/question/337513515
MNN量化工具使用
編譯
cd MNN
mkdir build
cd build
cmake -DMNN_BUILD_QUANTOOLS=ON ..
make -j4
使用
./quantized.out origin.mnn quantized.mnn ModelConfig.json
也可以用python安裝mnn
pip install mnn
mnnquant origin.mnn quantized.mnn ModelConfig.json
ModelConfig.json配置格式
{
"format":"GRAY",
"mean":[
0
],
"normal":[
0.00784314
],
"width":28,
"height":28,
"path":"/mldb/dataset/MNIST/test_data/8",
"used_image_num":100,
"feature_quantize_method":"KL",
"weight_quantize_method":"MAX_ABS"
}
format
圖片統一按RGBA讀取,然後轉換到format指定格式,可選:“RGB”, “BGR”, “RGBA”, “GRAY”。
mean, normal
模型預處理需要的mean,normal, 數據按此公式填寫: dst = (src - mean) * normal
width, height
模型輸入的寬高
path
存放校正特徵量化係數的圖片目錄
used_image_num
用於指定使用上述目錄下多少張圖片進行校正,默認使用path下全部圖片
注意:請確保圖片經過上述步驟處理之後的數據是輸入到模型input接口的數據
feature_quantize_method
指定計算特徵量化係數的方法,可選:
“KL”: 使用KL散度進行特徵量化係數的校正,一般需要100 ~ 1000張圖片
“ADMM”: 使用ADMM(Alternating Direction Method of Multipliers)方法進行特徵量化係數的校正,一般需要一個batch的數據
默認:“KL”
weight_quantize_method
指定權值量化方法,可選:
“MAX_ABS”: 使用權值的絕對值的最大值進行對稱量化
“ADMM”: 使用ADMM方法進行權值量化
默認:“MAX_ABS”
上述特徵量化方法和權值量化方法可進行多次測試,擇優使用。
源碼詳解
主函數quantized.cpp
int main(int argc, const char* argv[]) {
if (argc < 4) {
DLOG(INFO) << "Usage: ./quantized.out src.mnn dst.mnn preTreatConfig.json\n";
return 0;
}
const char* modelFile = argv[1];
const char* preTreatConfig = argv[3];
const char* dstFile = argv[2];
DLOG(INFO) << ">>> modelFile: " << modelFile;
DLOG(INFO) << ">>> preTreatConfig: " << preTreatConfig;
DLOG(INFO) << ">>> dstFile: " << dstFile
std::unique_ptr<MNN::NetT> netT;
{// 讀取原始的model文件, 藉助於flattbuffer生成Net對象
std::ifstream input(modelFile);
std::ostringstream outputOs;
outputOs << input.rdbuf();
netT = MNN::UnPackNet(outputOs.str().c_str()); //獲取Net對象
}
// temp build net for inference
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = MNN::Net::Pack(builder, netT.get());//打包模型準備放入buffer中
builder.Finish(offset);
int size = builder.GetSize();
auto ocontent = builder.GetBufferPointer();
// 創建兩個buffer,兩個都用來放模型數據
std::unique_ptr<uint8_t> modelForInference(new uint8_t[size]);
memcpy(modelForInference.get(), ocontent, size);
std::unique_ptr<uint8_t> modelOriginal(new uint8_t[size]);
memcpy(modelOriginal.get(), ocontent, size);
netT.reset();
netT = MNN::UnPackNet(modelOriginal.get());
// 進行量化操作, 主要這個靠的是Calibration類
DLOG(INFO) << "Calibrate the feature and quantize model...";
std::shared_ptr<Calibration> calibration(
new Calibration(netT.get(), modelForInference.get(), size, preTreatConfig));
calibration->runQuantizeModel();
DLOG(INFO) << "Quantize model done!";
// 量化後的模型寫入到FlatBufferBuilder
flatbuffers::FlatBufferBuilder builderOutput(1024);
builderOutput.ForceDefaults(true);
auto len = MNN::Net::Pack(builderOutput, netT.get());
builderOutput.Finish(len);
// FlatBufferBuilder的內容寫入文件,得到量化模型
{
std::ofstream output(dstFile);
output.write((const char*)builderOutput.GetBufferPointer(), builderOutput.GetSize());
}
}
Calibration類
MNN量化的核心類,權重量化,特徵量化。
Calibration.hpp
// Core class of MNN post-training quantization: calibrates feature scales and
// quantizes weights, rewriting the model's ops to their int8 variants.
class Calibration {
public:
// Args: original float model, serialized model buffer, buffer size, json config path.
Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath);
void runQuantizeModel();
private:
Calibration();
MNN::NetT* _originaleModel;// the model to be quantized
std::shared_ptr<MNN::CV::ImageProcess> _process;// converts calibration images into input tensors
const int _binNums = 2048;
int _imageNum = 0;
int _width;
int _height;
std::vector<std::string> _imgaes;// calibration image paths (member name is a typo for "_images"; kept as-is)
// Tensor and Info
// Maps each tensor to its TensorStatistic, which accumulates the statistics
// that tensor needs during quantization (explained further below).
std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
// All tensors, keyed by tensor index in the model.
std::map<int, const MNN::Tensor*> _tensorMap;
// Op's name, Inputs, Outputs
// Maps op name -> (input tensors, output tensors).
std::map<std::string, std::pair<std::vector<MNN::Tensor*>, std::vector<MNN::Tensor*>>> _opInfo;
// The scale results
std::map<const MNN::Tensor*, std::vector<float>> _scales;
std::shared_ptr<MNN::Interpreter> _interpreter;
// keep mnn forward information
MNN::Session* _session;
MNN::Tensor* _inputTensor;
std::vector<int> _inputTensorDims;
std::string _featureQuantizeMethod = "KL";
std::string _weightQuantizeMethod = "MAX_ABS";
void _initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels);
void _initMaps();
void _computeFeatureMapsRange();
void _collectFeatureMapsDistribution();
void _computeFeatureScaleKL();
void _computeFeatureScaleADMM();
void _updateScale();
// insert the dequantization op before the not supported op(int8), and insert dequantization op
// after the output op, so that get original float data conveniently
void _insertDequantize();
};
#endif // CALIBRATION_HPP
Calibration構造函數
// Builds the calibration context: parses the json config, sets up the image
// pre-processing pipeline, collects calibration image paths, then initializes
// the MNN session and the internal maps.
// NOTE(review): on config errors this constructor just returns, leaving the
// object partially initialized — callers rely on the logged error.
Calibration::Calibration(MNN::NetT* model, uint8_t* modelBuffer, const int bufferSize, const std::string& configPath)
: _originaleModel(model) {
// when the format of input image is RGB/BGR, channels equal to 3, GRAY is 1
// NOTE(review): "channles" is a typo for "channels"; kept as-is.
int channles = 3;
// Parse the json config file.
rapidjson::Document document;
{
std::ifstream fileNames(configPath.c_str());
std::ostringstream output;
output << fileNames.rdbuf();
auto outputStr = output.str();
document.Parse(outputStr.c_str());
if (document.HasParseError()) {
MNN_ERROR("Invalid json\n");
return;
}
}
auto picObj = document.GetObject();
// Build the ImageProcess::Config from the json contents.
ImageProcess::Config config;
config.filterType = BILINEAR;
config.destFormat = BGR;
{
if (picObj.HasMember("format")) {
auto format = picObj["format"].GetString();
static std::map<std::string, ImageFormat> formatMap{{"BGR", BGR}, {"RGB", RGB}, {"GRAY", GRAY}};
if (formatMap.find(format) != formatMap.end()) {
config.destFormat = formatMap.find(format)->second;
}
}
}
if (config.destFormat == GRAY) {
channles = 1;
}
// Images are always decoded as RGBA first, then converted to destFormat.
config.sourceFormat = RGBA;
std::string imagePath;
_imageNum = 0;
{
if (picObj.HasMember("mean")) {
auto mean = picObj["mean"].GetArray();
int cur = 0;
for (auto iter = mean.begin(); iter != mean.end(); iter++) {
config.mean[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("normal")) {
auto normal = picObj["normal"].GetArray();
int cur = 0;
for (auto iter = normal.begin(); iter != normal.end(); iter++) {
config.normal[cur++] = iter->GetFloat();
}
}
if (picObj.HasMember("width")) {
_width = picObj["width"].GetInt();
}
if (picObj.HasMember("height")) {
_height = picObj["height"].GetInt();
}
if (picObj.HasMember("path")) {
imagePath = picObj["path"].GetString();
}
if (picObj.HasMember("used_image_num")) {
_imageNum = picObj["used_image_num"].GetInt();
}
if (picObj.HasMember("feature_quantize_method")) {
std::string method = picObj["feature_quantize_method"].GetString();
if (Helper::featureQuantizeMethod.find(method) != Helper::featureQuantizeMethod.end()) {
_featureQuantizeMethod = method;
} else {
MNN_ERROR("not supported feature quantization method: %s\n", method.c_str());
return;
}
}
if (picObj.HasMember("weight_quantize_method")) {
std::string method = picObj["weight_quantize_method"].GetString();
if (Helper::weightQuantizeMethod.find(method) != Helper::weightQuantizeMethod.end()) {
_weightQuantizeMethod = method;
} else {
MNN_ERROR("not supported weight quantization method: %s\n", method.c_str());
return;
}
}
DLOG(INFO) << "Use feature quantization method: " << _featureQuantizeMethod;
DLOG(INFO) << "Use weight quantization method: " << _weightQuantizeMethod;
}
std::shared_ptr<ImageProcess> process(ImageProcess::create(config));// build the ImageProcess object
_process = process;
// read images file names
Helper::readImages(_imgaes, imagePath.c_str(), &_imageNum);
_initMNNSession(modelBuffer, bufferSize, channles);
_initMaps();
}
initMNNSession函數
主要用於初始化,做好模型推理的準備
// Prepares the MNN interpreter/session so calibration inference can run.
// modelBuffer/bufferSize: serialized model; channels: input channel count
// derived from the configured image format.
void Calibration::_initMNNSession(const uint8_t* modelBuffer, const int bufferSize, const int channels) {
_interpreter.reset(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
MNN::ScheduleConfig config;
_session = _interpreter->createSession(config);
_inputTensor = _interpreter->getSessionInput(_session, NULL);
_inputTensorDims.resize(4);
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
DCHECK(4 == _inputTensor->dimensions()) << "Only support 4 dimensions input";
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
// NHWC: batch, height, width, channels
_inputTensorDims[0] = 1;
_inputTensorDims[1] = _height;
_inputTensorDims[2] = _width;
_inputTensorDims[3] = channels;
} else if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NC4HW4) {
// NC4HW4: resized using NCHW-style dims
_inputTensorDims[0] = 1;
_inputTensorDims[1] = channels;
_inputTensorDims[2] = _height;
_inputTensorDims[3] = _width;
} else {
DLOG(ERROR) << "Input Data Format ERROR!";
}
if (_featureQuantizeMethod == "KL") {
// KL feeds one image per inference pass, so the batch stays 1.
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
} else if (_featureQuantizeMethod == "ADMM") {
// ADMM feeds the whole image set as a single batch; guard against overflow.
DCHECK((_imageNum * 4 * _height * _width) < (INT_MAX / 4)) << "Use Little Number of Images When Use ADMM";
_inputTensorDims[0] = _imageNum;
_interpreter->resizeTensor(_inputTensor, _inputTensorDims);
_interpreter->resizeSession(_session);
}
_interpreter->releaseModel();
}
_initMaps函數
初始化Map,定義兩個回調函數,執行前後分別完成op到input和output的映射,初始化特徵信息和op信息。
// Runs one inference pass to build the internal maps: op name -> (inputs,
// outputs), tensor index -> tensor, and tensor -> TensorStatistic for every
// op type that needs feature quantization.
void Calibration::_initMaps() {
_featureInfo.clear();
_opInfo.clear();
_tensorMap.clear();
// run mnn once, initialize featureMap, opInfo map
// MNN provides per-op callbacks: one fires before an op executes, one after.
// The "before" callback creates a TensorStatistic for each input tensor of
// ops that need feature quantization, and records the op -> inputs mapping.
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
_opInfo[info->name()].first = nTensors;
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] = std::shared_ptr<TensorStatistic>(
new TensorStatistic(t, _featureQuantizeMethod, info->name() + "__input"));
}
}
}
return false;
};
// The "after" callback does the same for each output tensor, and records the
// op -> outputs mapping.
MNN::TensorCallBackWithInfo after = [this](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
_opInfo[info->name()].second = nTensors;
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) == _featureInfo.end()) {
_featureInfo[t] =
std::shared_ptr<TensorStatistic>(new TensorStatistic(t, _featureQuantizeMethod, info->name()));
}
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
// Walk the ops and fill _tensorMap with <tensor index, tensor> pairs taken
// from each op's input and output index lists.
for (auto& op : _originaleModel->oplists) {
if (_opInfo.find(op->name) == _opInfo.end()) {
continue;
}
for (int i = 0; i < op->inputIndexes.size(); ++i) {
_tensorMap[op->inputIndexes[i]] = _opInfo[op->name].first[i];
}
for (int i = 0; i < op->outputIndexes.size(); ++i) {
_tensorMap[op->outputIndexes[i]] = _opInfo[op->name].second[i];
}
}
if (_featureQuantizeMethod == "KL") {
// set the tensor-statistic method of input tensor as THRESHOLD_MAX
auto inputTensorStatistic = _featureInfo.find(_inputTensor);
if (inputTensorStatistic != _featureInfo.end()) {
inputTensorStatistic->second->setThresholdMethod(THRESHOLD_MAX);
}
}
}
TensorStatistic類
由於在Calibration實例裏有個map包含了
std::map<const MNN::Tensor*, std::shared_ptr<TensorStatistic>> _featureInfo;
表示每個tensor都有一個TensorStatistic對象與之對應,tensor在量化過程中數據存在TensorStatistic裏面。
TensorStatistic.hpp
// Per-tensor statistics collected during calibration; one instance per tensor
// tracked in Calibration::_featureInfo. (Excerpt — some members elided.)
class TensorStatistic {
public:
TensorStatistic(const MNN::Tensor* tensor, std::string method, const std::string& name, int binNumber = 2048, GET_THRESHOLD_METHOD thresholdMethod = THRESHOLD_KL);
. . .
void updateRange();
void resetDistribution();
void updateDistribution();
void setThresholdMethod(GET_THRESHOLD_METHOD thresholdMethod);
void setChannelWise(bool mergeChannel);
std::vector<float> finishAndCompute();
// only this one for ADMM
std::vector<float> computeScaleADMM();
private:
int _computeThreshold(const std::vector<float>& distribution);
std::vector<std::pair<float, float>> mRangePerChannel;// per-channel (min, max) of this tensor
// Let maxValue = max(|channel min|, |channel max|) for a channel; then:
std::vector<float> mIntervals;// per-channel interval = mBinNumber / maxValue
std::vector<bool> mValidChannel; // true when the channel's maxValue > 0.00001f
std::vector<std::vector<float>> mDistribution;// per-channel histogram: floats mapped uniformly onto [0, mBinNumber), counting hits per bin
std::shared_ptr<MNN::Tensor> mHostTensor; // host (CPU) copy of the tensor
const MNN::Tensor* mOriginTensor; // the original tensor
int mBinNumber; // 2048 by default
bool mUpdatedDistributionFlag = false;
bool mUpdatedRangeFlags = false;
bool mMergeChannel = true;
std::string mName;
GET_THRESHOLD_METHOD mThresholdMethod = THRESHOLD_KL;
};
runQuantizeModel函數
兩種特徵量化方法:KL和ADMM
// Top-level quantization pipeline: compute feature scales with the configured
// method, quantize the weights, then insert the (de)quantization glue ops.
void Calibration::runQuantizeModel() {
    if (_featureQuantizeMethod == "ADMM") {
        // ADMM-based feature calibration (single batched pass).
        _computeFeatureScaleADMM();
    } else if (_featureQuantizeMethod == "KL") {
        // KL-divergence-based feature calibration.
        _computeFeatureScaleKL();
    }
    _updateScale();
    _insertDequantize();
}
computeFeatureScaleKL函數
// KL pipeline: gather per-channel ranges, gather histograms, then derive the
// per-tensor scale factors.
void Calibration::_computeFeatureScaleKL() {
    _computeFeatureMapsRange();        // pass 1: min/max of every feature map
    _collectFeatureMapsDistribution(); // pass 2: per-channel histograms
    _scales.clear();
    for (auto& iter : _featureInfo) {
        AUTOTIME; // fixed: stray "_imgaes" token was fused onto this line
        _scales[iter.first] = iter.second->finishAndCompute(); // scale factor per tensor
    }
    //_featureInfo.clear();//No need now
}
_computeFeatureMapsRange函數
用指定的圖片集合做爲模型輸入,做推理,在推理過程中,每一個op的計算前後分別統計tensor的每一個channel上featuremap的最大值最小值,更新到該tensor關聯的TensorStatistic對象實例裏。其中updateRange函數在TensorStatistic.cpp中
// Pass 1 of the KL method: run inference on every calibration image and let
// each tracked tensor's TensorStatistic record its per-channel min/max via
// updateRange(), using the before/after op callbacks.
void Calibration::_computeFeatureMapsRange() {
// feed input data according to input images
int count = 0;
for (const auto& img : _imgaes) { // for each calibration image
for (auto& iter : _featureInfo) {
iter.second->resetUpdatedRangeFlags();
}
count++;
// Load and pre-process the image into the input tensor.
Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor);
// Callbacks invoked before/after every op during inference.
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateRange();// track min/max of the op's input tensors
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors,
const MNN::OperatorInfo* info) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_featureInfo[t]->updateRange();// track min/max of the op's output tensors
}
}
return true;
};
// Run one full inference pass.
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\rComputeFeatureRange: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
}
updateRange函數
得到特徵的最大值和最小值
// Folds the current tensor contents into the per-channel (min, max) ranges.
// Runs at most once per inference pass; the flag is reset externally before
// each image is processed.
void TensorStatistic::updateRange() {
    if (mUpdatedRangeFlags) {
        return;
    }
    mUpdatedRangeFlags = true;
    // Bring the tensor data back to host memory for inspection.
    mOriginTensor->copyToHostTensor(mHostTensor.get());
    const int batchCount   = mHostTensor->batch();
    const int channelCount = mHostTensor->channel();
    const int area         = mHostTensor->width() * mHostTensor->height();
    for (int n = 0; n < batchCount; ++n) {
        const float* batchData = mHostTensor->host<float>() + n * mHostTensor->stride(0);
        for (int c = 0; c < channelCount; ++c) {
            // With merged channels, every channel feeds statistics slot 0.
            const int slot = mMergeChannel ? 0 : c;
            auto& range = mRangePerChannel[slot];
            const float* chanData = batchData + c * mHostTensor->stride(1);
            for (int v = 0; v < area; ++v) {
                range.first  = std::min(range.first, chanData[v]);
                range.second = std::max(range.second, chanData[v]);
            }
        }
    }
}
_collectFeatureMapsDistribution函數
計算每個tensor的數據分佈情況,其中resetDistributio函數和updateDistribution函數在TensorStatistic.cpp中。
// Pass 2 of the KL method: run inference over all calibration images and
// accumulate the per-channel histogram of every tracked feature tensor.
void Calibration::_collectFeatureMapsDistribution() {
    for (auto& iter : _featureInfo) {
        iter.second->resetDistribution(); // clear old histograms, recompute bin intervals from the ranges
    }
    // Two callbacks, run before and after each op, update the histograms of
    // the op's input and output tensors respectively.
    MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        for (auto t : nTensors) {
            if (_featureInfo.find(t) != _featureInfo.end()) {
                _featureInfo[t]->updateDistribution(); // input tensor's histogram
            }
        }
        return true;
    };
    MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
        for (auto t : nTensors) {
            if (_featureInfo.find(t) != _featureInfo.end()) {
                _featureInfo[t]->updateDistribution(); // output tensor's histogram
            }
        }
        return true;
    };
    int count = 0;
    for (const auto& img : _imgaes) { // one inference pass per image
        count++;
        for (auto& iter : _featureInfo) { // fixed: line was garbled ("_fiter : _featureInfoeatureInfo")
            iter.second->resetUpdatedDistributionFlag();
        }
        Helper::preprocessInput(_process.get(), _width, _height, img, _inputTensor); // load image into the input tensor
        _interpreter->runSessionWithCallBackInfo(_session, before, after);           // run inference
        MNN_PRINT("\rCollectFeatureDistribution: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
        fflush(stdout);
    }
    MNN_PRINT("\n");
}
resetDistributio、updateDistribution函數
// Re-derives each channel's bin interval from the observed min/max and clears
// the histograms before a new collection pass.
void TensorStatistic::resetDistribution() {
    const int channelCount = (int)mIntervals.size();
    for (int c = 0; c < channelCount; ++c) {
        const int slot = mMergeChannel ? 0 : c;
        // maxValue = largest absolute value observed on this channel.
        auto maxValue = std::max(fabsf(mRangePerChannel[slot].second), fabsf(mRangePerChannel[slot].first));
        // A channel only participates when its magnitude is large enough.
        mValidChannel[slot] = maxValue > 0.00001f;
        mIntervals[slot] = 0.0f;
        if (mValidChannel[slot]) {
            // Interval maps a float magnitude onto [0, mBinNumber): one float
            // unit corresponds to mBinNumber / maxValue bins.
            mIntervals[slot] = (float)mBinNumber / maxValue;
        }
    }
    // Initialize every histogram bin to a tiny nonzero value.
    for (auto& histogram : mDistribution) {
        std::fill(histogram.begin(), histogram.end(), 1.0e-07);
    }
}
// Folds the current tensor contents into the per-channel histograms.
// Runs at most once per inference pass (flag reset externally per image).
void TensorStatistic::updateDistribution() {
if (mUpdatedDistributionFlag) { // already updated during this pass
return;
}
mUpdatedDistributionFlag = true;
// Copy the tensor to host and read its shape.
mOriginTensor->copyToHostTensor(mHostTensor.get());
int batch = mHostTensor->batch();
int channel = mHostTensor->channel();
int width = mHostTensor->width();
int height = mHostTensor->height();
auto area = width * height;
for (int n = 0; n < batch; ++n) {
auto dataBatch = mHostTensor->host<float>() + n * mHostTensor->stride(0);// data of this batch
for (int c = 0; c < channel; ++c) {// process each channel
int cIndex = c;
if (mMergeChannel) {
cIndex = 0;
}
if (!mValidChannel[cIndex]) {
continue;
}
auto multi = mIntervals[cIndex]; // bin interval for this channel
auto target = mDistribution[cIndex].data();// this channel's histogram
auto dataChannel = dataBatch + c * mHostTensor->stride(1);// this channel's feature map
for (int v = 0; v < area; ++v) {
auto data = dataChannel[v]; // one value of the feature map
if (data == 0) {
continue;
}
int index = static_cast<int>(fabs(data) * multi);// map |value| uniformly onto a bin index
index = std::min(index, mBinNumber - 1); // clamp to mBinNumber - 1
target[index] += 1.0f; // accumulate into the histogram
}
}
}
}
finishAndCompute函數
計算出從float32到int8的縮放參數
// Computes the float32 -> int8 scale factor for each channel from the
// collected histograms (KL method). (Excerpt — some lines elided.)
std::vector<float> TensorStatistic::finishAndCompute() {
std::vector<float> scaleValue(mDistribution.size(), 0.0f);
. . .
for (int c = 0; c < mDistribution.size(); ++c) { // for each channel
if (!mValidChannel[c]) {
continue;
}
float sum = 0.0f;
auto& distribution = mDistribution[c];
std::for_each(distribution.begin(), distribution.end(), [&](float n) { sum += n; });// total count
std::for_each(distribution.begin(), distribution.end(), [sum](float& n) { n /= sum; }); // normalize into a probability distribution
auto threshold = _computeThreshold(distribution); // pick the clipping threshold
scaleValue[c] = ((float)threshold + 0.5) / mIntervals[c] / 127.0; // scale factor
}
return scaleValue;
}
_computeThreshold函數
在targetBinNums 到mBinNumber之間找一個閾值,使得KL散度最小。
// Searches [targetBinNums, mBinNumber) for the clipping threshold that
// minimizes the KL divergence between the clipped distribution and its
// quantize-then-expand approximation.
int TensorStatistic::_computeThreshold(const std::vector<float>& distribution) {
const int targetBinNums = 128;
int threshold = targetBinNums; // defaults to 128
if (mThresholdMethod == THRESHOLD_KL) { // KL-divergence search
float minKLDivergence = 10000.0f;
float afterThresholdSum = 0.0f;
//targetBinNums=128, mBinNumber=2048
std::for_each(distribution.begin() + targetBinNums, distribution.end(),
[&](float n) { afterThresholdSum += n; });
for (int i = targetBinNums; i < mBinNumber; ++i) { // i runs 128..2047: each is a candidate threshold
std::vector<float> quantizedDistribution(targetBinNums);
std::vector<float> candidateDistribution(i);
std::vector<float> expandedDistribution(i);
// candidateDistribution keeps bins [0, i) and folds the mass of bins
// [i, mBinNumber) into its last bin (i-1).
std::copy(distribution.begin(), distribution.begin() + i, candidateDistribution.begin());
candidateDistribution[i - 1] += afterThresholdSum;
afterThresholdSum -= distribution[i];
// Bin width when shrinking i bins down to targetBinNums bins.
const float binInterval = (float)i / (float)targetBinNums;
// merge i bins to target bins, j from 0 to 127
// Map the distribution over [0, i) onto the smaller space
// [0, targetBinNums); the lossy result goes into quantizedDistribution.
for (int j = 0; j < targetBinNums; ++j) {
// [j, j+1) in the target space corresponds to
// [start, end] in the i-bin space
const float start = j * binInterval;
const float end = start + binInterval;
// round up
const int leftUpper = static_cast<int>(std::ceil(start));
if (leftUpper > start) {
const float leftScale = leftUpper - start;
quantizedDistribution[j] += leftScale * distribution[leftUpper - 1];
}// round down
const int rightLower = static_cast<int>(std::floor(end));
if (rightLower < end) {
const float rightScale = end - rightLower;
quantizedDistribution[j] += rightScale * distribution[rightLower];
}
// accumulate the fully covered source bins into bin j
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower,
[&](float n) { quantizedDistribution[j] += n; });
}
// expand target bins to i bins: map the small space [0, targetBinNums)
// back onto [0, i); the result goes into expandedDistribution.
for (int j = 0; j < targetBinNums; ++j) {
const float start = j * binInterval;
const float end = start + binInterval;
float count = 0;
const int leftUpper = static_cast<int>(std::ceil(start));
float leftScale = 0.0f;
if (leftUpper > start) {
leftScale = leftUpper - start;
if (distribution[leftUpper - 1] != 0) {
count += leftScale;
}
}
const int rightLower = static_cast<int>(std::floor(end));
float rightScale = 0.0f;
if (rightLower < end) {
rightScale = end - rightLower;
if (distribution[rightLower] != 0) {
count += rightScale;
}
}
std::for_each(distribution.begin() + leftUpper, distribution.begin() + rightLower, [&](float n) {
if (n != 0) {
count += 1;
}
});
if (count == 0) {
continue;
}
// Spread bin j's mass evenly over the nonzero source bins it covered.
const float toExpandValue = quantizedDistribution[j] / count;
if (leftUpper > start && distribution[leftUpper - 1] != 0) {
expandedDistribution[leftUpper - 1] += toExpandValue * leftScale;
}
if (rightLower < end && distribution[rightLower] != 0) {
expandedDistribution[rightLower] += toExpandValue * rightScale;
}
for (int k = leftUpper; k < rightLower; ++k) {
if (distribution[k] != 0) {
expandedDistribution[k] += toExpandValue;
}
}
}
// KL divergence: Sum(P[i] * log(P[i] / Q[i]))
const float curKL = _klDivergence(candidateDistribution, expandedDistribution);
if (curKL < minKLDivergence) { // keep the threshold with the smallest KL divergence
minKLDivergence = curKL;
threshold = i;
}
}
} else if (mThresholdMethod == THRESHOLD_MAX) {
threshold = mBinNumber - 1;
} else {
// TODO, support other method
MNN_ASSERT(false);
}
return threshold;
}
權重量化_updateScale函數
主要是對權重進行量化,完成可量化op的量化
// Quantizes the weights and rewrites quantizable ops (Conv, DepthwiseConv,
// Eltwise-SUM) into their int8 counterparts using the computed feature scales.
void Calibration::_updateScale() {
for (const auto& op : _originaleModel->oplists) {
const auto opType = op->type;
// Only Conv, DepthwiseConv and Eltwise ops are handled.
if (opType != MNN::OpType_Convolution && opType != MNN::OpType_ConvolutionDepthwise &&
opType != MNN::OpType_Eltwise) {
continue;
}
auto tensorsPair = _opInfo.find(op->name);
if (tensorsPair == _opInfo.end()) {
MNN_ERROR("Can't find tensors for %s\n", op->name.c_str());
}
if (opType == MNN::OpType_Eltwise) {// Eltwise op
auto param = op->main.AsEltwise();
// Now only support AddInt8
if (param->type != MNN::EltwiseType_SUM) {
continue;
}
// Fetch the previously computed scales.
const auto& inputScale0 = _scales[tensorsPair->second.first[0]];
const auto& inputScale1 = _scales[tensorsPair->second.first[1]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
const int outputScaleSize = outputScale.size();
std::vector<float> outputInvertScale(outputScaleSize);
Helper::invertData(outputInvertScale.data(), outputScale.data(), outputScaleSize);
op->type = MNN::OpType_EltwiseInt8; // switch the op to its quantized type
op->main.Reset(); // rebuild the op's parameters from scratch
op->main.type = MNN::OpParameter_EltwiseInt8;
auto eltwiseInt8Param = new MNN::EltwiseInt8T;
auto input0ScaleParam = new MNN::QuantizedFloatParamT;
auto input1ScaleParam = new MNN::QuantizedFloatParamT;
auto outputScaleParam = new MNN::QuantizedFloatParamT;
input0ScaleParam->tensorScale = inputScale0;
input1ScaleParam->tensorScale = inputScale1;
outputScaleParam->tensorScale = outputInvertScale;
// Scales needed to recover float32 from int8.
eltwiseInt8Param->inputQuan0 = std::unique_ptr<MNN::QuantizedFloatParamT>(input0ScaleParam);
eltwiseInt8Param->inputQuan1 = std::unique_ptr<MNN::QuantizedFloatParamT>(input1ScaleParam);
eltwiseInt8Param->outputQuan = std::unique_ptr<MNN::QuantizedFloatParamT>(outputScaleParam);
op->main.value = eltwiseInt8Param;
continue;
}
// below is Conv/DepthwiseConv
const auto& inputScale = _scales[tensorsPair->second.first[0]];
const auto& outputScale = _scales[tensorsPair->second.second[0]];
auto param = op->main.AsConvolution2D();
param->common->inputCount = tensorsPair->second.first[0]->channel();
const int channles = param->common->outputCount;
const int weightSize = param->weight.size();
param->symmetricQuan.reset(new MNN::QuantizedFloatParamT);// reset the quantized parameter block
auto& quantizedParam = param->symmetricQuan;// then fill in the int8 parameters
quantizedParam->scale.resize(channles);
quantizedParam->weight.resize(weightSize);
quantizedParam->bias.resize(channles);
// Conv and depthwise-conv quantize weight/bias via separate helpers.
if (opType == MNN::OpType_Convolution) {
QuantizeConvPerChannel(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_ConvInt8;
} else if (opType == MNN::OpType_ConvolutionDepthwise) {
QuantizeDepthwiseConv(param->weight.data(), param->weight.size(), param->bias.data(),
quantizedParam->weight.data(), quantizedParam->bias.data(),
quantizedParam->scale.data(), inputScale, outputScale, _weightQuantizeMethod);
op->type = MNN::OpType_DepthwiseConvInt8;
}
if (param->common->relu6) {
param->common->relu = true;
param->common->relu6 = false;
}// the int8 parameters are in place, so drop the original float weights/bias
param->weight.clear();
param->bias.clear();
}
}
添加反量化_insertDequantize函數
主要作用是添加反量化操作,對於不支持int8的op,前後要加反量化的層。 量化op輸出是整數, 但是以量化op輸出爲輸入的下一個op是float,可能需要是float版本輸出,所以需要添加反量化。
大致說一下過程:
1.遍歷op,找出所有被量化op的輸入輸出tensor
2.遍歷op,找出所有不能量化的op
如果該op的輸入tensor是可量化op產生的, 那麼需要輸入後面加反量化
如果該op的輸出tensor是給到可量化op的, 那麼需要在輸出tensor後面加量化op(FloatToInt8),把浮點輸出轉成int8再送入量化op
對於該圖的輸出tensor,後面加反量化, 確保輸出結果是浮點的
// Inserts conversion ops around non-int8 ops and at graph outputs:
//   - Int8ToFloat (dequantize) before a float op whose input comes from an
//     int8 op, and after every int8 graph output;
//   - FloatToInt8 (quantize) after a float op whose output feeds an int8 op.
void Calibration::_insertDequantize() {
// Search All Int Tensors
std::set<int> int8Tensors;
std::set<int> int8Outputs;
for (auto& op : _originaleModel->oplists) {// walk the ops
if (Helper::INT8SUPPORTED_OPS.count(op->type) > 0) {// for int8-quantized ops, record inputs and outputs
for (auto index : op->inputIndexes) {// record inputs
int8Tensors.insert(index);
}
for (auto index : op->outputIndexes) {// record outputs
int8Tensors.insert(index);
int8Outputs.insert(index);
}
}
}
// Any tensor consumed by some op is not a graph output; drop it from int8Outputs.
for (auto& op : _originaleModel->oplists) {
for (auto index : op->inputIndexes) {
auto iter = int8Outputs.find(index);
if (iter != int8Outputs.end()) {
int8Outputs.erase(iter);
}
}
}
// Insert Convert For Not Support Int8 Ops: float ops bordering int8 ops need conversion layers.
for (auto iter = _originaleModel->oplists.begin(); iter != _originaleModel->oplists.end();) {
auto op = iter->get();
const auto opType = op->type;
const auto name = op->name;
// check whether is output op
// if Yes, insert dequantization op after this op — int8-capable ops are skipped here
if (Helper::INT8SUPPORTED_OPS.find(opType) != Helper::INT8SUPPORTED_OPS.end()) {
// this is quantized op
iter++;
continue;
}
auto& inputIndexes = op->inputIndexes;
const int inputSize = inputIndexes.size();
// insert dequantization op before this op
for (int i = 0; i < inputSize; ++i) {// for each input tensor of this float op
const auto curInputIndex = inputIndexes[i];
if (int8Tensors.find(curInputIndex) == int8Tensors.end()) {// skip inputs that are not int8 tensors
continue;
}
auto input = _tensorMap[curInputIndex];
auto inputOpScale = _scales[input];
// construct new op: an int8 -> float conversion op
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + name + flatbuffers::NumToString(i);
// fill in the parameters
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = inputOpScale;
dequantizationOp->inputIndexes.push_back(curInputIndex);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
_originaleModel->tensorName.push_back(dequantizationOp->name);
// reset current op's input index at i: wire the new op's output into this op's input
inputIndexes[i] = dequantizationOp->outputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(dequantizationOp));
iter++;
}
iter++;
// LOG(INFO) << "insert quantization op after this op if neccessary";
// insert quantization op after this op if neccessary
// for each output tensor of this float op
for (int i = 0; i < op->outputIndexes.size(); ++i) {
const auto outputIndex = op->outputIndexes[i];
if (int8Tensors.find(outputIndex) == int8Tensors.end()) { // skip outputs that are not int8 tensors
continue;
}
auto output = _tensorMap[outputIndex];
auto curScale = _scales[output];
// construct one quantization op(FloatToInt8)
// NOTE: despite the surrounding article calling this "dequantize", this
// branch builds a QUANTIZE (float -> int8) op for outputs that feed int8 ops.
auto quantizationOp = new MNN::OpT;
quantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
quantizationOp->name = name + "___FloatToInt8___" + flatbuffers::NumToString(i);
quantizationOp->type = MNN::OpType_FloatToInt8;
auto quantizationParam = new MNN::QuantizedFloatParamT;
quantizationOp->main.value = quantizationParam;
// fill in the parameters (FloatToInt8 needs the inverted scale)
const int channels = curScale.size();
std::vector<float> quantizationScale(channels);
Helper::invertData(quantizationScale.data(), curScale.data(), channels);
quantizationParam->tensorScale = quantizationScale;
// splice the quantization op in after this op
quantizationOp->inputIndexes.push_back(_originaleModel->tensorName.size());
quantizationOp->outputIndexes.push_back(outputIndex);
_originaleModel->tensorName.push_back(_originaleModel->tensorName[outputIndex]);
_originaleModel->tensorName[outputIndex] = quantizationOp->name;
op->outputIndexes[i] = quantizationOp->inputIndexes[0];
iter = _originaleModel->oplists.insert(iter, std::unique_ptr<MNN::OpT>(quantizationOp));
iter++;
}
}
// Insert Turn float Op for output: graph outputs must be float, so append a
// dequantize op after every int8 graph output.
for (auto index : int8Outputs) {
// construct new op
auto dequantizationOp = new MNN::OpT;
dequantizationOp->main.type = MNN::OpParameter_QuantizedFloatParam;
dequantizationOp->name = "___Int8ToFloat___For_" + flatbuffers::NumToString(index);
dequantizationOp->type = MNN::OpType_Int8ToFloat;
auto dequantizationParam = new MNN::QuantizedFloatParamT;
dequantizationOp->main.value = dequantizationParam;
dequantizationParam->tensorScale = _scales[_tensorMap[index]];
dequantizationOp->inputIndexes.push_back(index);
dequantizationOp->outputIndexes.push_back(_originaleModel->tensorName.size());
auto originTensorName = _originaleModel->tensorName[index];
_originaleModel->tensorName[index] = dequantizationOp->name;
_originaleModel->tensorName.emplace_back(originTensorName);
_originaleModel->oplists.insert(_originaleModel->oplists.end(), std::unique_ptr<MNN::OpT>(dequantizationOp));
}
}
_computeFeatureScaleADMM函數
ADMM方式計算特徵,其中computeScaleADMM包含在TensorStatistic.cpp中。
// ADMM feature-scale path: load all calibration images into one big batched
// input tensor, run a single inference pass, and compute each tracked
// tensor's scale via TensorStatistic::computeScaleADMM in the op callbacks.
void Calibration::_computeFeatureScaleADMM() {
// feed input data according to input images
int count = 0;
std::vector<int> oneImageTensorDims = _inputTensorDims;
oneImageTensorDims[0] = 1;
auto inputTensorDataFormat = MNN::TensorUtils::getDescribe(_inputTensor)->dimensionFormat;
auto dimType = MNN::Tensor::CAFFE_C4;
if (inputTensorDataFormat == MNN::MNN_DATA_FORMAT_NHWC) {
dimType = MNN::Tensor::TENSORFLOW;
}
for (const auto& img : _imgaes) {
// Wrap the slice of the batched input tensor belonging to this image and
// pre-process the image directly into it.
auto curPtr = _inputTensor->host<float>() + count * _inputTensor->stride(0);
std::shared_ptr<MNN::Tensor> tensorWarp(
MNN::Tensor::create(oneImageTensorDims, _inputTensor->getType(), curPtr, dimType));
Helper::preprocessInput(_process.get(), _width, _height, img, tensorWarp.get());
count++;
MNN_PRINT("\rProcessImage: %.2lf %%", (float)count * 100.0f / (float)_imageNum);
fflush(stdout);
}
MNN_PRINT("\n");
_scales.clear();
const int totalLayers = _featureInfo.size();
count = 0;
// Both callbacks compute ADMM scales for every tracked tensor they see.
MNN::TensorCallBackWithInfo before = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_scales[t] = _featureInfo[t]->computeScaleADMM();
count++;
MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
fflush(stdout);
}
}
}
return true;
};
MNN::TensorCallBackWithInfo after = [&](const std::vector<MNN::Tensor*>& nTensors, const MNN::OperatorInfo* info) {
if (Helper::gNeedFeatureOp.find(info->type()) != Helper::gNeedFeatureOp.end()) {
for (auto t : nTensors) {
if (_featureInfo.find(t) != _featureInfo.end()) {
_scales[t] = _featureInfo[t]->computeScaleADMM();
count++;
MNN_PRINT("\rComputeADMM: %.2lf %%", (float)count * 100.0f / (float)totalLayers);
fflush(stdout);
}
}
}
return true;
};
_interpreter->runSessionWithCallBackInfo(_session, before, after);
MNN_PRINT("\n");
}
computeScaleADMM函數
// Alternating optimization for a single scale value (ADMM-style): repeatedly
// quantize with the current alpha, then refit alpha in closed form.
// Returns the final alpha replicated across all channels.
std::vector<float> TensorStatistic::computeScaleADMM() {
    std::vector<float> scaleValue(mOriginTensor->channel(), 0.0f);
    const int count         = mOriginTensor->elementSize();
    float max               = 0;
    const float bound       = 127;
    const float* originData = mOriginTensor->host<float>();
    // Find the maximum absolute value over the whole tensor.
    for (int i = 0; i < count; i++) {
        float absData = std::fabs(originData[i]);
        if (absData > max) {
            max = absData;
        }
    }
    // Initial guess for the scale.
    float alpha = max / (bound * 2.5);
    // DLOG(INFO) << "alpha init: " << alpha;
    const int maxStep = 300;
    float sum1 = 0;
    float sum2 = 0;
    float invAlpha;
    // fixed: the iteration index was "i", shadowed by the inner data loop's
    // "i"; renamed to "step" to remove the bugprone shadowing.
    for (int step = 0; step < maxStep; step++) {
        sum1     = 0;
        sum2     = 0;
        invAlpha = 1 / alpha;
        for (int i = 0; i < count; i++) {
            auto origin = originData[i];
            // Quantize with the current alpha, clamped to [-127, 127].
            auto dataQuant = std::roundf(origin * invAlpha);
            dataQuant      = std::fmin(bound, std::fmax(-bound, dataQuant));
            sum1 += (dataQuant * origin);
            sum2 += (dataQuant * dataQuant);
        }
        // Closed-form least-squares update: alpha = <q, x> / <q, q>.
        alpha = sum1 / sum2;
    }
    // DLOG(INFO) << "alpha final: " << alpha;
    std::fill(scaleValue.begin(), scaleValue.end(), alpha);
    return scaleValue;
}