紋理內存

特性

  • 紋理內存是隻讀的

  • 紋理內存可以通過歸一化座標訪問,當使用非歸一化的座標訪問紋理時,不會進行filter

  • 當訪問紋理內存的模式具有較大的 “ 空間局部性 ” 時,性能會得到改善。這裏的空間局部性不只是像素在水平方向(內存地址)上的連續性,也包括了豎直方向上的連續性

  • 紋理內存爲streaming fetch(即同一個warp中的所有thread同時訪問具有空間局部性的一塊內存)做了優化

  • 即使全局內存爲16位或32位的整型,當作爲紋理內存進行訪問時,能將數據轉換爲32位浮點類型

  • 不論是紋理引用還是紋理對象的創建,都需要先在全局內存創建一個內存區域,然後將該內存與紋理對象或紋理引用綁定,最後kernel通過紋理對象或紋理引用訪問紋理數據

  • 如果在訪問紋理期間修改與其綁定的全局內存,除非在讀取紋理時發生cache miss,否則讀取到的像素數據和全局內存中的不一致

  • Packed data may be broadcast to separate variables in a single operation;

紋理對象

  • 紋理對象在運行時創建,並且在創建時與紋理內存進行綁定

     // Prototype of the runtime call that creates a texture object.
     //   pTexObject   - out: receives the handle of the created texture object
     //   pResDesc     - resource descriptor (struct cudaResourceDesc)
     //   pTexDesc     - texture descriptor (struct cudaTextureDesc)
     //   pResViewDesc - resource view descriptor; the sample code in this
     //                  document passes NULL for it
     // Returns a cudaError_t status code.
     __host__ cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject,
                                                  const cudaResourceDesc* pResDesc,
                                                  const cudaTextureDesc* pTexDesc,
                                                  const cudaResourceViewDesc* pResViewDesc);
    
    // Texture descriptor: the sampling parameters handed to
    // cudaCreateTextureObject() through its pTexDesc argument.
    struct cudaTextureDesc
    {
        enum cudaTextureAddressMode addressMode[3];         // per-dimension handling of out-of-range coordinates (e.g. clamp or wrap)
        enum cudaTextureFilterMode filterMode;              // filtering mode: linear or point
        enum cudaTextureReadMode readMode;                  // whether integer texels are converted to normalized floats: unsigned -> [0,1], signed -> [-1,1]
        int sRGB;
        int normalizedCoords;                               // non-zero: the texture is addressed with normalized coordinates
        unsigned int maxAnisotropy;
        enum cudaTextureFilterMode mipmapFilterMode;
        float mipmapLevelBias;
        float minMipmapLevelClamp;
        float maxMipmapLevelClamp;
    };
    
    // Resource descriptor: names the memory that backs a texture object and is
    // handed to cudaCreateTextureObject() through its pResDesc argument.
    //
    // The fields are grouped per resource type inside the `res` union, which is
    // how the sample code in this document accesses them
    // (resDesc.resType = cudaResourceTypeArray; resDesc.res.array.array = cuArray;).
    // `resType` tells which member of `res` is meaningful.
    struct cudaResourceDesc
    {
        enum cudaResourceType resType;                      // kind of backing memory: array, mipmapped array, linear or pitch2D

        union {
            struct {
                cudaArray_t array;                          // source data held in a CUDA array
            } array;
            struct {
                cudaMipmappedArray_t mipmap;                // source data held in a mipmapped CUDA array
            } mipmap;
            struct {
                void * devPtr;                              // source data in linear device memory
                struct cudaChannelFormatDesc desc;          // channel format of the elements
                size_t  sizeInBytes;                        // size of the buffer in bytes
            } linear;
            struct {
                void * devPtr;                              // source data in pitched 2D device memory
                struct cudaChannelFormatDesc desc;          // channel format of the elements
                size_t  width;
                size_t  height;
                size_t  pitchInBytes;                       // row pitch in bytes (row length including any padding)
            } pitch2D;
        } res;
    };
    
  • 紋理對象可作爲kernel參數傳入

    // Skeleton kernel: shows that a texture object handle is received as an
    // ordinary by-value kernel parameter (the body is elided in this text).
    __global__ void kernel(cudaTextureObject_t texObj)
    {
        ...
    }
    
  • 紋理內存的訪問都是模板函數,以下函數的返回值可能是float,int,unsigned int …

    tex1Dfetch()
    tex1D()
    tex2D()
    ...
    tex2DLayered()          //訪問分層紋理(layered texture)中的某一層,而非某一級mipmap
    texCubemap()
    texCubemapLayered()
    tex2Dgather()
    

紋理引用

  • 紋理引用在編譯時創建,它不能作爲參數傳給kernel,必須聲明爲靜態全局變量

    texture<DataType, Type, ReadMode> texRef;
    
    // Texture reference over unsigned-char texels with 4 channels, two-dimensional;
    // integer channel data is NOT converted to floating point (cudaReadModeElementType).
    texture<uchar4, cudaTextureType2D, cudaReadModeElementType> texRef;
    
  • 紋理引用的其他屬性在運行時可改,這些mutable屬性繼承於textureReference類型

    // Run-time mutable attributes of a texture reference. The sample code in this
    // document sets normalized, filterMode and addressMode[0..1] on a texture
    // reference before binding it.
    struct textureReference {
        int                          normalized;        // non-zero: addressed with normalized coordinates
        enum cudaTextureFilterMode   filterMode;        // e.g. cudaFilterModeLinear
        enum cudaTextureAddressMode  addressMode[3];    // per-dimension out-of-range handling, e.g. cudaAddressModeWrap
        struct cudaChannelFormatDesc channelDesc;       // describes the data type returned when a texel is fetched
        int                          sRGB;
        unsigned int                 maxAnisotropy;
        enum cudaTextureFilterMode   mipmapFilterMode;
        float                        mipmapLevelBias;
        float                        minMipmapLevelClamp;
        float                        maxMipmapLevelClamp;
    };
    
    // Channel format descriptor of a texel.
    // NOTE(review): x/y/z/w look like the per-component bit widths, judging by
    // cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat) being used
    // for a single-channel float texture in this document - confirm against the
    // runtime API reference.
    struct cudaChannelFormatDesc
    {
        int x;
        int y;
        int z;
        int w;
        enum cudaChannelFormatKind f;   // component kind, e.g. cudaChannelFormatKindFloat
    };
    
  • 紋理引用在被訪問前必須綁定到一塊設備端內存,多個紋理引用可以綁定到同一塊紋理內存

    // Prototype: binds the CUDA array `array` to the texture reference `tex`
    // and returns a cudaError_t status code.
    // The sample code in this document calls a three-argument form that also
    // takes a channel descriptor; that overload is not listed here.
    template<class T, int dim, enum cudaTextureReadMode readMode>
    cudaError_t cudaBindTextureToArray(
      const struct texture<T, dim, readMode>& tex,
      const struct cudaArray* array
    );
    
  • 紋理引用可以與紋理內存解綁定

代碼實例(摘自CUDA_C_Programming_Guide)

  • 紋理引用

    // 2D float texture reference; texels are returned unconverted
    // (cudaReadModeElementType).
    // NOTE(review): texture references are a legacy API (deprecated, and removed
    // from recent CUDA toolkits) - prefer the texture-object variant for new code.
    texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

    // Simple transformation kernel.
    //
    // Each thread handles one output pixel (x, y): it turns the pixel position
    // into normalized coordinates, rotates them by `theta` around the image
    // centre (0.5, 0.5), samples `texRef` there and stores the result.
    //
    //   output - device buffer of width * height floats, row-major
    //   width  - image width in pixels
    //   height - image height in pixels
    //   theta  - rotation angle passed to cosf/sinf (radians)
    //
    // Launch layout: 2D grid of 2D blocks covering at least width x height
    // threads. The grid may be rounded up; surplus threads exit early.
    __global__ void transformKernel(float* output,
                                    int width, int height,
                                    float theta)
    {
        // Pixel handled by this thread
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

        // The launch rounds the grid up to whole blocks, so threads past the
        // image edge must not touch `output` (out-of-bounds write otherwise).
        if (width <= 0 || height <= 0 ||
            x >= (unsigned int)width || y >= (unsigned int)height)
            return;

        // Calculate normalized texture coordinates
        float u = x / (float)width;
        float v = y / (float)height;

        // Transform coordinates: rotate around the image centre
        u -= 0.5f;
        v -= 0.5f;
        const float c = cosf(theta);
        const float s = sinf(theta);
        float tu = u * c - v * s + 0.5f;
        float tv = v * c + u * s + 0.5f;

        // Read from texture and write to global memory; the index is computed
        // in size_t so y * width cannot wrap for large images.
        output[(size_t)y * width + x] = tex2D(texRef, tu, tv);
    }
    // Host code (texture-reference variant).
    //
    // Uploads the host image h_data into a CUDA array, binds it to texRef, runs
    // transformKernel and releases all resources. Returns 0 on success and 1 if
    // any CUDA call fails; once a step fails the remaining steps are skipped but
    // cleanup still runs.
    //
    // NOTE: width, height, h_data, size and angle are not declared in this
    // excerpt; they are expected to be provided by the surrounding program.
    int main()
    {
        cudaArray* cuArray = 0;
        float* output = 0;
        bool texBound = false;

        // Allocate CUDA array in device memory (one 32-bit float channel)
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
                                                                  cudaChannelFormatKindFloat);
        cudaError_t status = cudaMallocArray(&cuArray, &channelDesc, width, height);

        // Copy to device memory some data located at address h_data
        // in host memory.
        // NOTE(review): cudaMemcpyToArray is deprecated in newer toolkits;
        // cudaMemcpy2DToArray is the replacement once the row pitch of h_data
        // is known.
        if (status == cudaSuccess)
            status = cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
                                       cudaMemcpyHostToDevice);

        if (status == cudaSuccess) {
            // Set texture reference parameters
            texRef.addressMode[0] = cudaAddressModeWrap;
            texRef.addressMode[1] = cudaAddressModeWrap;
            texRef.filterMode = cudaFilterModeLinear;
            texRef.normalized = true;

            // Bind the array to the texture reference
            status = cudaBindTextureToArray(texRef, cuArray, channelDesc);
            texBound = (status == cudaSuccess);
        }

        // Allocate result of transformation in device memory
        if (status == cudaSuccess)
            status = cudaMalloc(&output, (size_t)width * height * sizeof(float));

        if (status == cudaSuccess) {
            // Invoke kernel; the grid is rounded up to cover the whole image
            dim3 dimBlock(16, 16);
            dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                         (height + dimBlock.y - 1) / dimBlock.y);
            transformKernel<<<dimGrid, dimBlock>>>(output, width, height,
                                                   angle);

            // A launch returns no status itself: fetch launch-configuration errors here
            status = cudaGetLastError();
        }

        // Wait for the kernel so that execution errors are reported too
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();

        // Unbind before releasing the array the texture reference points at
        if (texBound)
            cudaUnbindTexture(texRef);

        // Free device memory
        if (cuArray)
            cudaFreeArray(cuArray);
        if (output)
            cudaFree(output);

        return status == cudaSuccess ? 0 : 1;
    }
    
  • 紋理對象

    // Simple transformation kernel (texture-object variant).
    //
    // Each thread handles one output pixel (x, y): it turns the pixel position
    // into normalized coordinates, rotates them by `theta` around the image
    // centre (0.5, 0.5), samples `texObj` there and stores the result.
    //
    //   output - device buffer of width * height floats, row-major
    //   texObj - texture object to sample, fetched as float
    //   width  - image width in pixels
    //   height - image height in pixels
    //   theta  - rotation angle passed to cosf/sinf (radians)
    //
    // Launch layout: 2D grid of 2D blocks covering at least width x height
    // threads. The grid may be rounded up; surplus threads exit early.
    __global__ void transformKernel(float* output,
                                    cudaTextureObject_t texObj,
                                    int width, int height,
                                    float theta)
    {
        // Pixel handled by this thread
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

        // The launch rounds the grid up to whole blocks, so threads past the
        // image edge must not touch `output` (out-of-bounds write otherwise).
        if (width <= 0 || height <= 0 ||
            x >= (unsigned int)width || y >= (unsigned int)height)
            return;

        // Calculate normalized texture coordinates
        float u = x / (float)width;
        float v = y / (float)height;

        // Transform coordinates: rotate around the image centre
        u -= 0.5f;
        v -= 0.5f;
        const float c = cosf(theta);
        const float s = sinf(theta);
        float tu = u * c - v * s + 0.5f;
        float tv = v * c + u * s + 0.5f;

        // Read from texture and write to global memory; the index is computed
        // in size_t so y * width cannot wrap for large images.
        output[(size_t)y * width + x] = tex2D<float>(texObj, tu, tv);
    }
    
    // Host code (texture-object variant).
    //
    // Uploads the host image h_data into a CUDA array, wraps it in a texture
    // object, runs transformKernel and releases all resources. Returns 0 on
    // success and 1 if any CUDA call fails; once a step fails the remaining
    // steps are skipped but cleanup still runs.
    //
    // NOTE: width, height, h_data, size and angle are not declared in this
    // excerpt; they are expected to be provided by the surrounding program.
    int main()
    {
        cudaArray* cuArray = NULL;
        float* output = NULL;
        cudaTextureObject_t texObj = 0;
        bool texCreated = false;

        // Allocate CUDA array in device memory (one 32-bit float channel)
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
                                                                  cudaChannelFormatKindFloat);
        cudaError_t status = cudaMallocArray(&cuArray, &channelDesc, width, height);

        // Copy to device memory some data located at address h_data
        // in host memory.
        // NOTE(review): cudaMemcpyToArray is deprecated in newer toolkits;
        // cudaMemcpy2DToArray is the replacement once the row pitch of h_data
        // is known.
        if (status == cudaSuccess)
            status = cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
                                       cudaMemcpyHostToDevice);

        if (status == cudaSuccess) {
            // Specify texture: the resource is the CUDA array filled above
            struct cudaResourceDesc resDesc;
            memset(&resDesc, 0, sizeof(resDesc));
            resDesc.resType = cudaResourceTypeArray;
            resDesc.res.array.array = cuArray;

            // Specify texture object parameters
            struct cudaTextureDesc texDesc;
            memset(&texDesc, 0, sizeof(texDesc));
            texDesc.addressMode[0] = cudaAddressModeWrap;
            texDesc.addressMode[1] = cudaAddressModeWrap;
            texDesc.filterMode = cudaFilterModeLinear;
            texDesc.readMode = cudaReadModeElementType;
            texDesc.normalizedCoords = 1;

            // Create texture object
            status = cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
            texCreated = (status == cudaSuccess);
        }

        // Allocate result of transformation in device memory
        if (status == cudaSuccess)
            status = cudaMalloc(&output, (size_t)width * height * sizeof(float));

        if (status == cudaSuccess) {
            // Invoke kernel; the grid is rounded up to cover the whole image
            dim3 dimBlock(16, 16);
            dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                         (height + dimBlock.y - 1) / dimBlock.y);
            transformKernel<<<dimGrid, dimBlock>>>(output,
                                                   texObj, width, height,
                                                   angle);

            // A launch returns no status itself: fetch launch-configuration errors here
            status = cudaGetLastError();
        }

        // Wait for the kernel so that execution errors are reported too
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();

        // Destroy texture object (only if it was actually created)
        if (texCreated)
            cudaDestroyTextureObject(texObj);

        // Free device memory
        if (cuArray)
            cudaFreeArray(cuArray);
        if (output)
            cudaFree(output);

        return status == cudaSuccess ? 0 : 1;
    }
    
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章