tensorflow註冊自己實現的Op

目的是雲端算法中執行LSTM部分計算過程的加速，即用cu文件編譯出so，用此so中的LSTM類或函數替代tf.LSTMCell進行運算。
整個項目見Github，流程見博客，博主也剛入門cuda，歡迎留言探討~

1. 源代碼編譯tensorflow

因爲我們要對tf庫進行修改，所以需要用源碼編譯的方式重新安裝tensorflow；官方步驟寫得很清楚，這裏就不再贅述。

2. 註冊OP流程：

定義 Op 的接口，即按規則寫好cc文件
爲 Op 實現 kernel，即你自己的.cu文件
編譯出so，即(BUILD.sh)文件，上述三個文件如下，同樣先看官方網站，再來看例子會豁然開朗

3. 例子

按上圖準備好cc文件

#include <stdio.h>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/framework/allocator.h"
#include "fsmn_forward.h"
#include <cstddef>
#include <iostream>
#include <algorithm>
using namespace tensorflow;
// Forward declaration of the LSTM forward routine defined in 00_lstm.cu.
//
// The definition is `extern "C"`, so the declaration must carry the same
// linkage or the linker will look for a C++-mangled symbol that the CUDA
// shared library does not export.
//
// Parameter names follow the definition: the cell state (c) comes before the
// hidden/projected state (h) for both the inputs and the outputs.
extern "C" void LSTMTest(int miniBatch, int seqLength, int inputSize, int hiddenSize, int outSize,
              float* input, float* c_data_in, float* h_data_in, float* weight_i, float* weight_h, float* bias_data_in, float* w_i_diag_in,
              float* w_f_diag_in, float* w_o_diag_in, float* proj_kernel_in, float* c_data_out, float* h_data_out, float* output,
              bool use_peepholes = true, float cell_clip = 0.0, float proj_clip = 0.0);


// Op interface for the CUDA LSTM forward pass: ten float32 inputs and three
// float32 outputs.
REGISTER_OP("CudaLstmForward")
    .Input("input: float32")
    .Input("cdata_in: float32")
    .Input("hdata_in: float32")
    .Input("weight_i: float32")
    .Input("weight_h: float32")
    .Input("bias: float32")
    .Input("w_i_diag: float32")
    .Input("w_f_diag: float32")
    .Input("w_o_diag: float32")
    .Input("proj_kernel: float32")
    .Output("cdata_out: float32")
    .Output("hdata_out: float32")
    .Output("output: float32")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* ic) {
        // Dimensions are taken from `input` (0), `cdata_in` (1) and `hdata_in` (2).
        const auto seq_batch_dim = ic->Dim(ic->input(0), 0);
        const auto batch_dim = ic->Dim(ic->input(1), 0);
        const auto hidden_dim = ic->Dim(ic->input(1), 1);
        const auto proj_dim = ic->Dim(ic->input(2), 1);

        ic->set_output(0, ic->Matrix(batch_dim, hidden_dim));   // cdata_out
        ic->set_output(1, ic->Matrix(batch_dim, proj_dim));     // hdata_out
        ic->set_output(2, ic->Matrix(seq_batch_dim, proj_dim)); // output
        return Status::OK();
    });


class CudaLstmForwardOp : public OpKernel {
public:
    explicit CudaLstmForwardOp(OpKernelConstruction* ctx) : OpKernel(ctx){
    }
    void Compute(OpKernelContext* ctx) override {
        //printf("begin....");
        
        Tensor input_data= ctx->mutable_input(0, true);
        Tensor cdata_in= ctx->mutable_input(1, true);
        Tensor hdata_in= ctx->mutable_input(2, true);
        
        //printf("input....");
        OP_REQUIRES(ctx, input_data.shape().dims() == 2,
                    errors::InvalidArgument("input data is not a 2-Tensor"));
        OP_REQUIRES(ctx, hdata_in.shape().dims() == 2,
                    errors::InvalidArgument("hdata in is not a 2-Tensor"));
        OP_REQUIRES(ctx, cdata_in.shape().dims() == 2,
                    errors::InvalidArgument("cdata in is not a 2-Tensor"));
        //printf("weight....");
        Tensor weight_i= ctx->mutable_input(3, true);
        Tensor weight_h= ctx->mutable_input(4, true);
        Tensor bias= ctx->mutable_input(5, true);
        Tensor wi_diag= ctx->mutable_input(6, true);
        Tensor wf_diaf= ctx->mutable_input(7, true);
        Tensor wo_diag= ctx->mutable_input(8, true);
        Tensor proj= ctx->mutable_input(9, true);
        
        auto inputdata_t = input_data.tensor<float, 2>();
        auto cdatain_t = cdata_in.tensor<float, 2>();
        auto hdatain_t = hdata_in.tensor<float, 2>();
        
        auto weighti_t = weight_i.tensor<float, 2>();
        auto weighth_t = weight_h.tensor<float, 2>();
        auto bias_t = bias.tensor<float, 1>();
        
        auto widiag_t = wi_diag.tensor<float, 1>();
        auto wfdiag_t = wf_diaf.tensor<float, 1>();
        auto wodiag_t = wo_diag.tensor<float, 1>();
        auto proj_t = proj.tensor<float, 2>();
        
        const auto &acti_shape = input_data.shape();
        int seq_batch = acti_shape.dim_size(0);
        int inputsize = acti_shape.dim_size(1);
        
        const auto &acth_shape = cdata_in.shape();
        int batch = acth_shape.dim_size(0);
        int hiddensize = acth_shape.dim_size(1);
        
        const auto &actc_shape = hdata_in.shape();
        int outputsize = actc_shape.dim_size(1);
        
        int length = seq_batch/batch;
        
        // Create an state out tensor
        Tensor *state_outc = nullptr;
        TensorShape indice_shape({batch, hiddensize});
        OP_REQUIRES_OK(ctx, ctx->allocate_output("cdata_out", indice_shape, &state_outc));
        auto statec_t = state_outc->tensor<float, 2>();
        
        // Create an state out tensor
        Tensor *state_outh = nullptr;
        TensorShape indice_shape1({batch, outputsize});
        OP_REQUIRES_OK(ctx, ctx->allocate_output("hdata_out", indice_shape1, &state_outh));
        auto stateh_t = state_outh->tensor<float, 2>();
        
        // Create an output tensor
        Tensor *out_put = nullptr;
        TensorShape indice_shape2({seq_batch, outputsize});
        OP_REQUIRES_OK(ctx, ctx->allocate_output("output", indice_shape2, &out_put));
        auto out_t = out_put->tensor<float, 2>();
        
        
        // 執行計算操作
        LSTMTest(batch, length, inputsize, hiddensize, outputsize,
                 inputdata_t.data(), cdatain_t.data(), hdatain_t.data(),
                 weighti_t.data(), weighth_t.data(), bias_t.data(),
                 widiag_t.data(), wfdiag_t.data(), wodiag_t.data(), proj_t.data(),
                 statec_t.data(), stateh_t.data(), out_t.data(),
                 true, 0.0, 50.0);

        
    }
    
private:
    
}; //class CudaLstmForward end
// CPU placement: every tensor already lives in host memory.
REGISTER_KERNEL_BUILDER(Name("CudaLstmForward").Device(DEVICE_CPU), CudaLstmForwardOp);

// GPU placement: LSTMTest() stages its inputs with cudaMemcpyHostToDevice and
// writes results back with cudaMemcpyDeviceToHost, so it needs host pointers.
// Pin every input and output to host memory; otherwise the kernel would be
// handed device pointers and those copies would run in the wrong direction.
REGISTER_KERNEL_BUILDER(Name("CudaLstmForward")
                            .Device(DEVICE_GPU)
                            .HostMemory("input")
                            .HostMemory("cdata_in")
                            .HostMemory("hdata_in")
                            .HostMemory("weight_i")
                            .HostMemory("weight_h")
                            .HostMemory("bias")
                            .HostMemory("w_i_diag")
                            .HostMemory("w_f_diag")
                            .HostMemory("w_o_diag")
                            .HostMemory("proj_kernel")
                            .HostMemory("cdata_out")
                            .HostMemory("hdata_out")
                            .HostMemory("output"),
                        CudaLstmForwardOp);

00_lstm.cu

extern "C" void LSTMTest(int miniBatch, int seqLength, int inputSize, int hiddenSize, int outSize,
              float* input, float* c_data_in, float *h_data_in, float* weight_i, float* weight_h, float*bias_data_in, float* w_i_diag_in,
              float* w_f_diag_in, float* w_o_diag_in, float* proj_kernel_in, float* c_data_out, float* h_data_out, float* output,
              bool use_peepholes = true, float cell_clip = 0.0, float proj_clip = 0.0){
  

    static int layer0_size = (7 + 320 + 2 * miniBatch + 4 * miniBatch) * 1536 + (2 * miniBatch + miniBatch * 4) * 320 + (miniBatch * 4) * 320 + (320 + 320 + miniBatch * 4 + miniBatch) * 4 * 1536;
    static int layer1_size = (7 + 320 + 2 * miniBatch + 4 * miniBatch) * 1536 + (2 * miniBatch + miniBatch * 4) * 320 + (miniBatch * 4) * 320 + (320 + 320 + miniBatch * 4 + miniBatch) * 4 * 1536;
    static int layer2_size = (7 + 448 + 2 * miniBatch + 4 * miniBatch) * 1536 + (2 * miniBatch + miniBatch * 4) * 448 + (miniBatch * 4) * 320 + (320 + 448 + miniBatch * 4 + miniBatch) * 4 * 1536;
    static int layer3_size = (7 + 448 + 2 * miniBatch + 4 * miniBatch) * 1536 + (2 * miniBatch + miniBatch * 4) * 448 + (miniBatch * 4) * 448 + (448 + 448 + miniBatch * 4 + miniBatch) * 4 * 1536;
    static float *w_diag_bias_proj;
    static float *init_pointer;
    static int all_layer_size = layer0_size + layer1_size + layer2_size + layer3_size;
    static int flag = 0;



    float alpha = 1.f;
    float beta  = 0.f;
    int input_depth = inputSize;
    int gateSize = hiddenSize * 4;
    int h_depth;
    int numElements = miniBatch * hiddenSize;
    
    if(use_peepholes == true)
        h_depth = outSize;
    else
        h_depth = hiddenSize;
  
    int w_diag_bias_proj_size = (7 + h_depth) * hiddenSize;
    int h_op_data_size = 2 * miniBatch * h_depth + miniBatch * seqLength * h_depth;
    int input_T_size = miniBatch * seqLength * input_depth + (input_depth + h_depth) * gateSize;
    int c_o_data_size = 2 * miniBatch * hiddenSize + miniBatch * seqLength * hiddenSize;
    int tmp_i_size = miniBatch * seqLength * gateSize;
    // int tmp_h_size = miniBatch * gateSize;
    
    // printf("the seqLength is: %d, inputSize: %d, input_depth: %d, hiddenSize: %d, outSize: %d\n", seqLength, inputSize, input_depth, hiddenSize, outSize);
    
    cudaErrCheck(cudaGetLastError());

    if(flag == 0){
        cudaErrCheck(cudaMalloc((void**)&w_diag_bias_proj, all_layer_size * sizeof(float)));
        init_pointer = w_diag_bias_proj;
    }

    if(flag == 1)
      w_diag_bias_proj = w_diag_bias_proj + layer0_size;
    else if(flag == 2)
      w_diag_bias_proj = w_diag_bias_proj + layer1_size;
    else if(flag == 3)
      w_diag_bias_proj = w_diag_bias_proj + layer2_size;
    flag++;

    printf("flag: %d\n", flag);

    //b = a + size(a);
    float *input_T = w_diag_bias_proj + w_diag_bias_proj_size;
    //c = b + size(b);
    float *h_op_data = input_T + input_T_size;
    float *c_o_data = h_op_data + h_op_data_size;
    float *tmp_i = c_o_data + c_o_data_size;
    float *tmp_h = tmp_i + tmp_i_size;

    cudaStream_t stream_i, stream_h;
    cudaErrCheck(cudaStreamCreate(&stream_i));
    cudaErrCheck(cudaStreamCreate(&stream_h));
    bool stream_i_flag = true;

    

    cudaErrCheck(cudaMemcpyAsync(input_T, input, miniBatch * input_depth * seqLength * sizeof(float), cudaMemcpyHostToDevice, stream_i));
    cudaErrCheck(cudaMemcpyAsync(input_T + miniBatch * input_depth * seqLength, weight_i, input_depth * gateSize * sizeof(float), cudaMemcpyHostToDevice, stream_i));
    cudaErrCheck(cudaMemcpyAsync(input_T + miniBatch * input_depth * seqLength + input_depth * gateSize, weight_h, h_depth * gateSize * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    // printf("*************************%d\n", hiddenSize);
    cudaErrCheck(cudaMemcpyAsync(h_op_data, h_data_in, h_depth * miniBatch * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    cudaErrCheck(cudaMemcpyAsync(c_o_data, c_data_in, numElements * sizeof(float), cudaMemcpyHostToDevice, stream_h));

    
    // printf("i_data up and i_data_beforeProj down and the seqLength is%d\n", seqLength);
    
    cudaErrCheck(cudaMemcpyAsync(w_diag_bias_proj, w_i_diag_in, hiddenSize * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    cudaErrCheck(cudaMemcpyAsync(w_diag_bias_proj + hiddenSize, w_f_diag_in, hiddenSize * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    cudaErrCheck(cudaMemcpyAsync(w_diag_bias_proj + 2 * hiddenSize, w_o_diag_in, hiddenSize * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    cudaErrCheck(cudaMemcpyAsync(w_diag_bias_proj + 3 * hiddenSize , bias_data_in, gateSize * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    cudaErrCheck(cudaMemcpyAsync(w_diag_bias_proj + 7 * hiddenSize, proj_kernel_in, h_depth * hiddenSize * sizeof(float), cudaMemcpyHostToDevice, stream_h));
    

    
    
    cudaErrCheck(cudaGetLastError());
    
    // cudaDeviceSynchronize();
    // Need a cuBLAS handle.
    cublasHandle_t handle;
    cublasErrCheck(cublasCreate(&handle));


    cublasErrCheck(cublasSetStream(handle, stream_i));
    cublasErrCheck(cublasSgemm(handle,
                               CUBLAS_OP_N, CUBLAS_OP_N,
                               gateSize, miniBatch * seqLength, input_depth,
                               &alpha,
                               input_T + miniBatch * input_depth * seqLength,
                               gateSize,
                               input_T,
                               input_depth,
                               &beta,
                               tmp_i,
                               gateSize));
    
    
    
    cudaErrCheck(cudaGetLastError());
    
    
    for(int i = 0; i < seqLength; ++i){
        // cudaEventRecord(event1, 0);
        cublasErrCheck(cublasSetStream(handle, stream_h));
        cublasErrCheck(cublasSgemm(handle,
                                   CUBLAS_OP_N, CUBLAS_OP_N,
                                   gateSize, miniBatch, h_depth,
                                   &alpha,
                                   input_T + miniBatch * input_depth * seqLength + input_depth * gateSize,
                                   gateSize,
                                   h_op_data,
                                   h_depth ,
                                   &beta,
                                   tmp_h,
                                   gateSize));
        
        dim3 blockDim;
        dim3 gridDim;
        
        blockDim.x = 256;
        gridDim.x = (miniBatch * hiddenSize + blockDim.x - 1) / blockDim.x;
      
        if(stream_i_flag == true)
            cudaErrCheck(cudaStreamSynchronize(stream_i));
        elementWise_fp <<< gridDim, blockDim, 0 , stream_h >>>
        (hiddenSize, miniBatch,
         tmp_h,
         tmp_i + i * miniBatch * gateSize,
         w_diag_bias_proj + 3 * hiddenSize,
         NULL,
         h_op_data + miniBatch * h_depth,
         c_o_data + 2 * numElements + i * miniBatch * hiddenSize,
         c_o_data,
         c_o_data + numElements,
         false,
         w_diag_bias_proj,
         w_diag_bias_proj + hiddenSize,
         w_diag_bias_proj + 2 * hiddenSize,
         use_peepholes,
         h_depth,
         cell_clip);

        if(stream_i_flag == true){
            cudaErrCheck(cudaStreamDestroy(stream_i));
            stream_i_flag = false;
         }

        cudaErrCheck(cudaGetLastError());
        if(use_peepholes != 0){
            cublasErrCheck(cublasSgemm(handle,
                                       CUBLAS_OP_N, CUBLAS_OP_N,
                                       h_depth, miniBatch, hiddenSize,
                                       &alpha,
                                       w_diag_bias_proj + 7 * hiddenSize,
                                       h_depth,
                                       c_o_data + 2 * numElements + i * miniBatch * hiddenSize,
                                       hiddenSize,
                                       &beta,
                                       h_op_data + 2 * miniBatch * h_depth + i * miniBatch * h_depth,
                                       h_depth));
            if(proj_clip != 0){
                // printf("in proj_clip\n");
                dim3 blockDim;
                dim3 gridDim;
                blockDim.x = 256;
                gridDim.x = (h_depth * miniBatch + blockDim.x - 1) / blockDim.x;
                clip_by_value <<< gridDim, blockDim, 0, stream_h >>>
                (h_op_data + 2 * miniBatch * h_depth + i * h_depth * miniBatch, proj_clip, miniBatch * h_depth);
            }
            //h_data和i_data保持同步
        }

        
        cudaErrCheck(cudaMemcpy(h_op_data + miniBatch * h_depth, h_op_data + 2 * miniBatch * h_depth + i * miniBatch * h_depth, miniBatch * h_depth * sizeof(float), cudaMemcpyDeviceToDevice));
        cudaErrCheck(cudaMemcpy(h_op_data, h_op_data + miniBatch * h_depth, miniBatch * h_depth * sizeof(float), cudaMemcpyDeviceToDevice));
        cudaErrCheck(cudaMemcpy(c_o_data, c_o_data + numElements, miniBatch * hiddenSize * sizeof(float), cudaMemcpyDeviceToDevice));
        cudaErrCheck(cudaGetLastError());
        

        cudaErrCheck(cudaGetLastError());
        
    }


    
    cudaErrCheck(cudaMemcpy(h_data_out, h_op_data + miniBatch * h_depth, miniBatch * h_depth * sizeof(float), cudaMemcpyDeviceToHost));
    cudaErrCheck(cudaMemcpy(c_data_out, c_o_data + numElements, miniBatch * hiddenSize * sizeof(float), cudaMemcpyDeviceToHost));
    cudaErrCheck(cudaMemcpy(output, h_op_data + 2 * miniBatch * h_depth, seqLength * miniBatch * h_depth * sizeof(float), cudaMemcpyDeviceToHost));
    
    if(flag == 4){
        cudaErrCheck(cudaFree(init_pointer));
        flag = 0;
      }
 
    cudaErrCheck(cudaStreamDestroy(stream_h));

}  
   

int main(int argc, char* argv[]) {
   int seqLength;
   int numLayers;
   int hiddenSize;
   int miniBatch; 
   bool use_peepholes;
   int num_proj;
   float cell_clip = 0.0;
   float proj_clip = 0.0;

   if (argc == 5) {
      seqLength = atoi(argv[1]);
      numLayers =  atoi(argv[2]);
      hiddenSize =  atoi(argv[3]);
      miniBatch =  atoi(argv[4]);   
   }
   else if (argc == 1) {
      printf("Running with default settings\n");
      seqLength = 2;
      numLayers = 1;
      hiddenSize = 1536;
      miniBatch = 100;
      use_peepholes = true;
      num_proj = 320;
      cell_clip = 0.00;
      proj_clip = 50.00;
   }
   else {
      printf("Usage: ./LSTM <seqLength> <numLayers> <hiddenSize> <miniBatch>\n");
      return 1;      
   }

   printf("seqLength %d, numLayers %d, num_proj %d, miniBatch %d\n", seqLength, numLayers, num_proj, miniBatch);  
   
   int outSize = num_proj;
   int numRuns = 4;  
   float totalTime = 0.f;
   int input_depth = 320;
   float* input = init_Matrix(miniBatch * input_depth * seqLength);
   float* h_data_in = init_Matrix_zeros(miniBatch * num_proj);
   float* c_data_in = init_Matrix_zeros(miniBatch * hiddenSize);
   float* weight_i = init_Matrix(input_depth* hiddenSize * 4);
   float* weight_h = init_Matrix(num_proj* hiddenSize * 4);
   float* bias_data_in = init_Matrix_zeros(hiddenSize * 4);
   float* w_i_diag_in = init_Matrix(hiddenSize);
   float* w_f_diag_in = init_Matrix(hiddenSize);
   float* w_o_diag_in = init_Matrix(hiddenSize);
   float* proj_kernel_in = init_Matrix(hiddenSize * num_proj);
   float* h_data_out = init_Matrix_zeros(miniBatch * num_proj);
   float* c_data_out = init_Matrix_zeros(miniBatch * hiddenSize);
   float* output = init_Matrix_zeros(miniBatch * seqLength * num_proj);

   for (int run = 0; run < numRuns; run++) {

      LSTMTest(miniBatch, seqLength, input_depth, hiddenSize, outSize,
        input, c_data_in, h_data_in, weight_i, weight_h, bias_data_in, w_i_diag_in,
      w_f_diag_in, w_o_diag_in, proj_kernel_in, c_data_out, h_data_out, output, use_peepholes, cell_clip, proj_clip);
   }
   
   printf("Runtime %fms\n", totalTime / numRuns);
   
   return time < 0;
}

混合編譯.c/.cpp與.cu文件

即在cpp裏使用cu文件，編譯cpp時將編譯好的cuda庫鏈接進來

分別編譯：g++ -o test 00_lstm.o 01_cpptest_cuda_lstm.o -lcudart -L/usr/local/cuda/lib64 -lcublas -lcurand -L/home/resources/yxwang/cuda-10.0/lib64/

靜態庫： nvcc -lib 00_lstm.cu -o lib00_lstm.a

g++ -o test 01_cpptest_cuda_lstm.o -L. -l00_lstm -L/usr/local/cuda/lib64 -lcudart -lcublas -lcurand

動態庫(BUILD.sh)：

# NOTE: run this from inside the source-built TensorFlow tree, pwd=~/tensorflow/tensorflow/core/user_ops
# NOTE: run this script with the CPU build of TF 1.5.0 (env [py27tf15]); to only run the Python or CUDA
#       code use the GPU build of TF 1.4.0 (env [py27tf15gpu]); [py27tf15s] is the CPU build of TF 1.14.
# This script uses bash arrays, so it must be run with bash.

# Stop at the first failing command so a failed nvcc build is not followed by a doomed link step.
set -e

# Compile and link flags reported by the installed TensorFlow, split into one array element per flag.
TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )

# Earlier single-step attempt, kept for reference:
#g++ -std=c++11 -shared -o cuda_lstm_forward.so -c cuda_lstm_forward.cc -ltestcu -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2  -ltensorflow_framework -L /home/resources/yxwang/cuda-10.0/lib64/ -lcublas -lcurand

# Build 00_lstm.cu into lib00_lstm.so
nvcc -o lib00_lstm.so -shared -Xcompiler -fPIC 00_lstm.cu -lcublas -lcurand -L /home/resources/yxwang/cuda-10.0/lib64/

# Build cuda_lstm_forward.cc into cuda_lstm_forward.so, linking lib00_lstm.so from the current directory (-l00_lstm -L.)
# The array expansions are quoted so a flag containing spaces or glob characters stays one argument.
g++ -std=c++11 -shared cuda_lstm_forward.cc -o cuda_lstm_forward.so -fPIC "${TF_CFLAGS[@]}" "${TF_LFLAGS[@]}" -O2  -ltensorflow_framework -l00_lstm -L. -lcublas -lcurand -L/home/resources/yxwang/cuda-10.0/lib64/

# -l<library name>  -L<library search directory>
# Stand-alone test build: nvcc -g -G 00_lstm.cu -o 00_lstm -L -arch=sm_52 -DPERFOPTS=31 -lcublas -lcurand

或者寫成Makefile：每條規則中冒號「:」後面的是依賴項，閱讀時從下往上看。

# `all` and `cpp` are entry points, not files produced by their recipes, so
# declare them phony: a stray file named `all` or `cpp` must not suppress the build.
.PHONY : all cpp

all : cpp

# Build the C++ test driver and link it against cuBLAS and the CUDA LSTM library.
cpp : lib00_lstm.so
	g++ 01_cpptest_cuda_lstm.cpp -o 01_cpptest_cuda_lstm /home/resources/yxwang/cuda-10.0/lib64/libcublas.so -l00_lstm -L.

# Build the CUDA LSTM implementation as a shared library.
lib00_lstm.so : 00_lstm.cu
	nvcc -o lib00_lstm.so -shared -Xcompiler -fPIC 00_lstm.cu -lcublas -lcurand -L /home/resources/yxwang/cuda-10.0/lib64/

tensorflow註冊自己實現的Op

1. 源代碼編譯tensorflow

2. 註冊OP流程：

3. 例子

按上圖準備好cc文件

混合編譯.c/.cpp與.cu文件

SQL優化-20231016

Git版本控制和個人一些總結

tensorflow的三種保存格式相互轉換

tfs模型加速之固化和轉半精度

ASR語音識別筆記總結

Group Normalization論文筆記--深入淺出組歸一化

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結