c++調用pytorch模型並使用GPU進行預測

pytorch很好用，但是真實部署還是使用c++較多，因此需要用c++調用訓練好的pytorch模型。參考官方示例https://github.com/apachecn/pytorch-doc-zh/blob/master/docs/1.0/cpp_export.md安裝libtorch。但是官方示例中並沒有使用真實的例子，且使用的是cpu版本的。

下面用一個真實的例子並用gpu，本示例的所有程序在https://github.com/zhangming8/pytorch-cpp-model

我使用的環境(沒試過其他)：ubuntu16.04， cuda9，採用源碼安裝的opencv3.4(https://blog.csdn.net/u010397980/article/details/89439515)，libtorch（https://download.pytorch.org/libtorch/cu90/libtorch-shared-with-deps-latest.zip），python3，torch1.0.0。

1. 首先轉化模型：

新建create_model.py文件(vim create_model.py)，內容如下；保存後執行該腳本：python3 create_model.py

#coding:utf-8
import numpy as np
import os
import glob
import cv2
import shutil
import time
import torch
import torch.nn as nn

from mobilenetv2 import MobileNetV2


# Build the model instance and replace the classifier head with an
# 8-output Linear + Sigmoid head.
model = MobileNetV2()
model.classifier = nn.Sequential(nn.Linear(1280, 8), nn.Sigmoid())
# For a real deployment, load the trained weights before tracing:
#model.load_state_dict(torch.load("latest.pt"))

# Switch to inference mode BEFORE tracing. In training mode the BatchNorm
# layers normalise with the statistics of the current batch (and update
# their running stats), and that behaviour would be baked into the trace.
model.eval()

img_size = 224
# A dummy sample that is pushed through forward() so the trace can be recorded.
example = torch.rand(1, 3, img_size, img_size)

# torch.jit.trace records the executed ops and returns a ScriptModule.
traced_script_module = torch.jit.trace(model, example)

img_list = ["test.jpg"]

for img_path in img_list:
    img_org = cv2.imread(img_path)
    # cv2.imread returns None (it does not raise) when the file is missing
    # or cannot be decoded.
    if img_org is None:
        print("skip unreadable image:", img_path)
        continue

    # Preprocess the image.
    img = cv2.resize(img_org, (img_size, img_size))
    img = img[:, :, ::-1].transpose(2, 0, 1)  # 1. BGR to RGB; 2. HxWx3 to 3xHxW
    img = np.ascontiguousarray(img, dtype=np.float32)  # uint8 to float32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0

    inputs = torch.from_numpy(img).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        output = traced_script_module(inputs)
    print("output:", output)

traced_script_module.save("model_cpp.pt")
print("create c++ model done...")

mobilenetv2.py文件的內容如下：

import torch
import torch.nn as nn
import math

# A PyTorch implementation of MobileNet V2 architecture and pretrained model.
# from https://github.com/tonylins/pytorch-mobilenet-v2
# pretrained model can be downloaded from https://drive.google.com/file/d/1jlto6HRVD3ipNkAl1lNhDbkBp7HylaqR/view
# env: python3

def conv_bn(inp, oup, stride):
    """Return a 3x3 Conv2d -> BatchNorm2d -> ReLU6 stack.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        stride: stride of the 3x3 convolution.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses padding 1 and no bias; the ReLU6 is in-place.
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, act)


def conv_1x1_bn(inp, oup):
    """Return a 1x1 Conv2d -> BatchNorm2d -> ReLU6 stack.

    Args:
        inp: number of input channels.
        oup: number of output channels.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses stride 1, padding 0 and no bias; the ReLU6 is
        in-place.
    """
    pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    act = nn.ReLU6(inplace=True)
    return nn.Sequential(pointwise, norm, act)


class InvertedResidual(nn.Module):
    """Inverted-residual block: optional 1x1 expansion, 3x3 depthwise, 1x1 linear.

    With ``expand_ratio == 1`` the block is depthwise 3x3 conv -> BN -> ReLU6
    -> pointwise 1x1 conv -> BN. For any other ratio, a pointwise 1x1 conv ->
    BN -> ReLU6 expansion to ``round(inp * expand_ratio)`` channels is placed
    in front of those layers. The input is added to the output only when
    ``stride == 1`` and ``inp == oup``.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        """Build the layer stack.

        Args:
            inp: number of input channels.
            oup: number of output channels.
            stride: stride of the depthwise convolution; must be 1 or 2.
            expand_ratio: multiplier applied to ``inp`` to get the hidden width.

        Raises:
            AssertionError: if ``stride`` is neither 1 nor 2.
        """
        super().__init__()
        self.stride = stride
        assert stride in (1, 2)

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw: expand to the hidden width
            layers += [
                nn.Conv2d(inp, hidden_dim, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        layers += [
            # dw: one filter per channel (groups == channels)
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=stride, padding=1,
                      groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear: project to the output width, no activation afterwards
            nn.Conv2d(hidden_dim, oup, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the skip connection when it is enabled."""
        out = self.conv(x)
        return x + out if self.use_res_connect else out


class MobileNetV2(nn.Module):
    """MobileNetV2: conv stem, inverted-residual stages, 1x1 conv, linear head.

    ``forward`` runs the feature extractor, averages over the two spatial
    dimensions and feeds the result to the classifier.
    """

    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        """Assemble the network and initialise its weights.

        Args:
            n_class: number of outputs of the final linear layer.
            input_size: expected input side length; only checked for being a
                multiple of 32, it does not otherwise affect construction.
            width_mult: multiplier applied to every stage's channel count. The
                last feature width (1280) is scaled only when it exceeds 1.0.

        Raises:
            AssertionError: if ``input_size`` is not divisible by 32.
        """
        super().__init__()
        # One row per stage:
        # t = expand ratio, c = output channels, n = repeats, s = stride of first repeat
        stage_cfg = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        assert input_size % 32 == 0
        in_ch = int(32 * width_mult)
        if width_mult > 1.0:
            self.last_channel = int(1280 * width_mult)
        else:
            self.last_channel = 1280

        # Stem: 3x3 stride-2 convolution on the 3-channel input.
        layers = [conv_bn(3, in_ch, 2)]
        # Inverted-residual stages; only the first repeat of a stage uses its stride.
        for t, c, n, s in stage_cfg:
            out_ch = int(c * width_mult)
            for repeat in range(n):
                block_stride = s if repeat == 0 else 1
                layers.append(InvertedResidual(in_ch, out_ch, block_stride, expand_ratio=t))
                in_ch = out_ch
        # Final 1x1 convolution up to the last feature width.
        layers.append(conv_1x1_bn(in_ch, self.last_channel))
        self.features = nn.Sequential(*layers)

        # Classifier head.
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        """Return the classifier output for a batch of images ``x``."""
        feats = self.features(x)
        # Original author's note: bs,1280,7,7 here (presumably for 224x224 input).
        pooled = feats.mean(3).mean(2)  # average over both spatial dims -> bs,1280
        return self.classifier(pooled)  # bs,num_classes

    def _initialize_weights(self):
        """Initialise Conv2d, BatchNorm2d and Linear parameters in place."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                k_h, k_w = layer.kernel_size
                fan_out = k_h * k_w * layer.out_channels
                layer.weight.data.normal_(0, math.sqrt(2. / fan_out))
                if layer.bias is not None:
                    layer.bias.data.zero_()
                continue
            if isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()
                continue
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(0, 0.01)
                layer.bias.data.zero_()


if __name__ == "__main__":
    # Smoke test: build the 1000-class network, load the pretrained weights
    # and run one forward pass on random input.
    img = torch.rand((2, 3, 224, 224))
    net = MobileNetV2(n_class=1000)
    print(net)
    # The net and the input both stay on the CPU in this script, so map the
    # checkpoint tensors to the CPU; this also makes loading work on
    # machines without a GPU.
    state_dict = torch.load('finetune_weight/mobilenet_v2.pth.tar', map_location='cpu')
    net.load_state_dict(state_dict)
    # Inference mode: disables Dropout and makes BatchNorm use running stats.
    net.eval()
    print("***"*50)
    with torch.no_grad():
        pred = net(img)
    print(pred.size())

    # Reference preprocessing kept from the upstream repository (not executed;
    # it needs torchvision's `transforms` / `datasets` plus `traindir`,
    # `valdir`, `batch_size` and `n_worker`, none of which are defined here):
    #
    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                  std=[0.229, 0.224, 0.225])
    #
    # input_size = 224
    # train_dataset = datasets.ImageFolder(
    #     traindir,
    #     transforms.Compose([
    #         transforms.RandomResizedCrop(input_size, scale=(0.2, 1.0)),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize,
    #     ]))
    #
    # train_loader = torch.utils.data.DataLoader(
    #     train_dataset, batch_size=batch_size, shuffle=True,
    #     num_workers=n_worker, pin_memory=True)
    #
    # val_loader = torch.utils.data.DataLoader(
    #     datasets.ImageFolder(valdir, transforms.Compose([
    #         transforms.Resize(int(input_size/0.875)),
    #         transforms.CenterCrop(input_size),
    #         transforms.ToTensor(),
    #         normalize,
    #     ])),
    #     batch_size=batch_size, shuffle=False,
    #     num_workers=n_worker, pin_memory=True)

從代碼可以知道，該模型的輸出爲8個置信度爲0-1的值(網絡最後我用了一個sigmoid函數)，當然你可以修改模型的結構產生其他輸出。爲了便於演示這個模型是隨機初始化的，因此在實際中你需要加載已經訓練好的模型再轉，如使用model.load_state_dict(torch.load("latest.pt"))加載。

執行後會生成一個mobilenetv2的model_cpp.pt文件，該文件就是供c++調用的模型。腳本同時會輸出8個結果；由於模型採用的是隨機初始化，所以每次運行得到的值都不一樣。這是python調用pytorch模型的結果。

2.新建一個predict-app.cpp文件

該文件主要用於調用pytorch模型。vim predict-app.cpp

注意修改model_path爲模型model_cpp.pt的路徑和測試圖像的路徑test_path，並在test_path中隨便放幾張.jpg的圖像。

該cpp文件首先加載並初始化了model_path指定的模型，把模型放到gpu上，之後遍歷test_path下面所有".jpg"圖像，再對圖像進行預處理，送進網絡，並把最大概率對應的索引位置輸出。具體細節可以參考代碼中的註釋。

#include <torch/torch.h>
#include <ATen/ATen.h>
#include <torch/script.h>

#include <iostream>
#include <memory>
#include <string>
#include <chrono>

#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"

using namespace cv;
using namespace std;
using namespace chrono;

#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;

int main(int argc, const char* argv[]) {
    string model_path;
    if (argc == 2){
        model_path = argv[1];
    }
    else {
        model_path = "/home/xxxxxxxxx/model_cpp.pt";
    }
    cout << "using model:" << model_path << endl;
    string test_path = "/home/xxxxxxxxx/data/";
    
    // init model
    int img_size = 224;  // resize img to 224
    vector<torch::jit::IValue> inputs;  //def an input
    shared_ptr<torch::jit::script::Module> module = torch::jit::load(model_path);  //load model
    module->to(at::kCUDA);  // put model to gpu
    assert(module != nullptr);
    cout << "[INFO] init model done...\n";

    int i = 0;
    double t_start, t_end, t_cost;
    t_start = getTickCount(); // get now time
    
    Mat src, image, float_image;
    for (const auto &p : fs::directory_iterator(test_path)){ //遍歷文件夾中的所有文件
        string s = p.path();  //get one file path
        string suffix = s.substr(s.find_last_of(".")+1);  //獲取文件的格式(後綴)
        if (suffix != "jpg"){
            continue;
        }
        cout << i << "-------------------------" << endl;
        cout << p.path() << '\n';

        src = imread(s);  //讀圖
	// 圖像預處理 注意需要和python訓練時的預處理一致
        resize(src, image, Size(img_size, img_size));  // resize 圖像
        cvtColor(image, image, CV_BGR2RGB);  // bgr -> rgb
	image.convertTo(float_image, CV_32F, 1.0 / 255);   //歸一化到[0,1]區間
        //cout << float_image.at<Vec3f>(100,100)[1] << endl;  //輸出一個像素點點值
        auto img_tensor = torch::CPU(torch::kFloat32).tensorFromBlob(float_image.data, {1, img_size, img_size, 3});   //將cv::Mat轉成tensor,大小爲1,224,224,3
        img_tensor = img_tensor.permute({0, 3, 1, 2});  //調換順序變爲torch輸入的格式 1,3,224,224
        //img_tensor[0][0] = img_tensor[0][0].sub_(0.485).div_(0.229);  //減去均值,除以標準差
        //img_tensor[0][1] = img_tensor[0][1].sub_(0.456).div_(0.224);
        //img_tensor[0][2] = img_tensor[0][2].sub_(0.406).div_(0.225);
        
        auto img_var = torch::autograd::make_variable(img_tensor, false);  //不需要梯度
	inputs.emplace_back(img_var.to(at::kCUDA));  // 把預處理後的圖像放入gpu
        torch::Tensor result = module->forward(inputs).toTensor();  //前向傳播獲取結果
        inputs.pop_back();
        cout << "result:" << result << endl;

        auto pred = result.argmax(1);
        cout << "max index:" << pred << endl;

	/*std::tuple<torch::Tensor,torch::Tensor> res_sort = result.sort(-1, true);
	torch::Tensor top_scores = get<0>(res_sort)[0];
	torch::Tensor top_idxs = get<1>(res_sort)[0].toType(torch::kInt32);
        auto top_scores_a = top_scores.accessor<float,1>();
	auto top_idxs_a = top_idxs.accessor<int,1>();
        for (int j = 0; j < 3; ++j) {
            int idx = top_idxs_a[j];
            cout << "top-" << j+1 << " index: " << idx << ", score: " << top_scores_a[j] << endl;
        }*/

        i++;
        if (i > 1000){
            break;
        }
    }
    t_end = getTickCount();
    t_cost = t_end - t_start;
    //t_cost = t_cost / getTickFrequency();
    printf("time cost: %4.f ms\n", t_cost/1000000.0);
    return 0;
}

/*
void Mycircle(){
    Point p = Point(320, 190); //圓的中心點
    int r= 50; //圓的半徑
    Scalar color = Scalar(0, 255, 0);
    circle(src, p, r, color);
}
*/

3. 編譯

默認opencv和libtorch都已經安裝好了。

首先新建一個CMakeLists.txt文件，內容如下：vim CMakeLists.txt

# Build configuration for the predict-app executable.
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(custom_ops)

# Dependencies. Torch is located through -DCMAKE_PREFIX_PATH=<libtorch dir>.
find_package(Torch REQUIRED)
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(predict-app predict-app.cpp)

# stdc++fs supplies std::experimental::filesystem, which predict-app.cpp uses.
target_link_libraries(
    predict-app
    "${TORCH_LIBRARIES}"
    ${OpenCV_LIBS}
    stdc++fs
)
set_property(TARGET predict-app PROPERTY CXX_STANDARD 11)

再依次執行：

mkdir build
cd build
cmake -DCMAKE_PREFIX_PATH=/media/DATA2/libtorch ..

(其中/media/DATA2/libtorch爲你的libtorch的路徑)
make

如果不出意外的話會出現以下界面：

以及

發現在build文件夾下生成了predict-app可執行文件，及其他文件：

最後執行./predict-app即可運行程序，界面如下所示：

發現使用c++的模型和使用python的模型結果輸出是一樣的(只要預處理沒問題)。

c++調用pytorch模型並使用GPU進行預測

認知提升的方法

螞蟻面試：Springcloud核心組件的底層原理，你知道多少？

C#開源的兩款功能強大的錄屏神器

ubuntu16.04 安裝opencv過程

目標檢測1: rcnn流程梳理及邊框損失函數loss分析

把voc格式的標註文件.xml轉爲coco格式的.json文件

c++調用pytorch模型並使用GPU進行預測

18.6使用官方的slim訓練模型並finetune微調

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結