紋理內存

特性

紋理內存是隻讀的
紋理內存可以通過歸一化或非歸一化座標訪問；通過tex1Dfetch()以整數座標訪問綁定到線性內存的紋理時，不會進行filter
當訪問紋理內存的模式具有較大的 “ 空間局部性 ” 時，性能會得到改善。這裏的空間局部性不只是像素在水平方向（內存地址）上的連續性，也包括了豎直方向上的連續性
紋理內存爲streaming fetch（即同一個warp中的所有thread同時訪問具有空間局部性的一塊內存）做了優化
即使紋理綁定的數據是8位或16位的整型，作爲紋理進行訪問時，也可以（可選地）將其轉換爲【0，1】或【-1，1】範圍內的32位浮點數
不論是紋理引用還是紋理對象的創建，都需要先在全局內存創建一個內存區域，然後將該內存與紋理對象或紋理引用綁定，最後kernel通過紋理對象或紋理引用訪問紋理數據
如果在訪問紋理期間修改與其綁定的全局內存，除非在讀取紋理時發生cache miss，否則讀取到的像素數據和全局內存中的不一致
Packed data may be broadcast to separate variables in a single operation;

紋理對象

紋理對象在運行時創建，並且在創建時與紋理內存進行綁定

 // Creates a texture object from a resource description and a texture
 // description; the new handle is written to *pTexObject.
 // pResViewDesc is optional (the sample code in this article passes NULL).
 __host__ cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject,
                                              const cudaResourceDesc* pResDesc,
                                              const cudaTextureDesc* pTexDesc,
                                              const cudaResourceViewDesc* pResViewDesc);

// Sampling parameters of a texture object (the pTexDesc argument of
// cudaCreateTextureObject()).
struct cudaTextureDesc
{
    enum cudaTextureAddressMode addressMode[3];         // addressing mode for out-of-range coordinates (e.g. clamp or wrap), one entry per dimension
    enum cudaTextureFilterMode filterMode;              // filtering mode: linear or point
    enum cudaTextureReadMode readMode;                  // whether integer texels are returned as normalized floats ([0, 1] for unsigned, [-1, 1] for signed)
    int sRGB;
    int normalizedCoords;                               // whether the texture is addressed with normalized coordinates
    unsigned int maxAnisotropy;
    enum cudaTextureFilterMode mipmapFilterMode;
    float mipmapLevelBias;
    float minMipmapLevelClamp;
    float maxMipmapLevelClamp;
};

// Describes the memory that backs a texture object (the pResDesc argument of
// cudaCreateTextureObject()). resType selects which member of the union `res`
// is valid, e.g. resDesc.res.array.array for cudaResourceTypeArray.
struct cudaResourceDesc
{
    enum cudaResourceType resType;                      // kind of backing memory: array, mipmapped array, linear or pitch2D
    union {
        struct {
            cudaArray_t array;                          // source data: CUDA array
        } array;
        struct {
            cudaMipmappedArray_t mipmap;                // source data: CUDA mipmapped array
        } mipmap;
        struct {
            void * devPtr;                              // source data: linear device memory
            struct cudaChannelFormatDesc desc;          // channel format of each element
            size_t  sizeInBytes;                        // size of the region in bytes
        } linear;
        struct {
            void * devPtr;                              // source data: pitched 2D device memory
            struct cudaChannelFormatDesc desc;          // channel format of each element
            size_t  width;                              // width in elements
            size_t  height;                             // height in elements
            size_t  pitchInBytes;                       // distance in bytes between the starts of two consecutive rows
        } pitch2D;
    } res;
};

紋理對象可作爲kernel參數傳入

// Kernel that receives the texture object by value as a launch argument.
// The body is elided in this excerpt.
__global__ void kernel(cudaTextureObject_t texObj)
{
    ...
}

紋理內存的訪問都是模板函數，以下函數的返回值可能是float,int,unsigned int …

tex1Dfetch()
tex1D()
tex2D()
...
tex2DLayered()          //訪問分層紋理（layered texture）中的某一層；訪問某一級mipmap請用tex2DLod()
texCubemap()
texCubemapLayered()
tex2Dgather()

紋理引用

紋理引用在編譯時創建，它不能作爲參數傳給kernel，必須聲明爲靜態全局變量

// General form of a texture reference declaration.
texture<DataType, Type, ReadMode> texRef;

// unsigned char, 4 channels, 2D, integer channel data is not converted to floating point
texture<uchar4, cudaTextureType2D, cudaReadModeElementType> texRef;

紋理引用的其他屬性在運行時可改，這些mutable屬性繼承自textureReference類型

// Run-time mutable attributes of a texture reference; the texture<> template
// inherits these fields.
struct textureReference {
    int                          normalized;        // non-zero when the texture is addressed with normalized coordinates
    enum cudaTextureFilterMode   filterMode;        // filtering mode (e.g. cudaFilterModeLinear)
    enum cudaTextureAddressMode  addressMode[3];    // addressing mode for out-of-range coordinates (e.g. cudaAddressModeWrap)
    struct cudaChannelFormatDesc channelDesc;       // format of the value returned when fetching the texture
    int                          sRGB;
    unsigned int                 maxAnisotropy;
    enum cudaTextureFilterMode   mipmapFilterMode;
    float                        mipmapLevelBias;
    float                        minMipmapLevelClamp;
    float                        maxMipmapLevelClamp;
};

// Channel format of a texel. The sample code in this article builds one with
// cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat) for a
// single 32-bit float channel.
struct cudaChannelFormatDesc { 
  int x, y, z, w;                  // number of bits in each of the four channel components
  enum cudaChannelFormatKind f;    // channel data kind, e.g. cudaChannelFormatKindFloat
};

紋理引用在被訪問前必須綁定到一塊設備端內存，多個紋理引用可以綁定到同一塊紋理內存

// Binds the CUDA array `array` to the texture reference `tex` and returns a
// cudaError_t status. A three-argument overload that additionally takes a
// cudaChannelFormatDesc also exists.
template<class T , int dim, enum cudaTextureReadMode readMode>
cudaError_t cudaBindTextureToArray(
  const struct texture< T, dim, readMode >& tex,
  const struct cudaArray* array
);

紋理引用可以與紋理內存解綁定

代碼實例（摘自CUDA_C_Programming_Guide）

紋理引用

// 2D float texture reference, read as the element type (no conversion).
// It is a file-scope variable that the kernel reads directly.
// NOTE(review): the texture reference API is deprecated in recent CUDA
// toolkits (removed in CUDA 12) — confirm against the target toolkit.
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

// Simple transformation kernel: samples the texture bound to texRef at
// coordinates rotated by `theta` about the texture centre (0.5, 0.5 in
// normalized coordinates) and writes one float per output pixel.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
// outside the width x height image return without writing.
//   output        - device buffer of at least width * height floats, row-major
//   width, height - output image size in pixels
//   theta         - rotation angle in radians
__global__ void transformKernel(float* output,
                                int width, int height,
                                float theta)
{
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // The grid is rounded up to whole blocks, so edge blocks contain threads
     // past the image border; they must not write to output.
     if (x >= (unsigned int)width || y >= (unsigned int)height)
          return;

     // Calculate normalized texture coordinates
     float u = x / (float)width;
     float v = y / (float)height;

     // Transform coordinates: rotate about the centre of the texture
     const float cosTheta = cosf(theta);
     const float sinTheta = sinf(theta);
     u -= 0.5f;
     v -= 0.5f;
     float tu = u * cosTheta - v * sinTheta + 0.5f;
     float tv = v * cosTheta + u * sinTheta + 0.5f;

     // Read from texture and write to global memory; the index is computed
     // in size_t so that large images do not overflow 32-bit arithmetic.
     output[(size_t)y * width + x] = tex2D(texRef, tu, tv);
}
// Host code
// Uploads h_data into a CUDA array, binds it to texRef, runs transformKernel
// and releases every resource that was acquired.
// Returns 0 on success and 1 if any CUDA call or the kernel failed.
// NOTE: width, height, size, h_data and angle are not declared in this
// excerpt; they are assumed to be defined elsewhere.
int main()
{
     cudaError_t err = cudaSuccess;
     cudaArray* cuArray = 0;
     float* output = 0;
     bool bound = false;

     // Allocate CUDA array in device memory
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
                                                               cudaChannelFormatKindFloat);
     err = cudaMallocArray(&cuArray, &channelDesc, width, height);

     // Copy to device memory some data located at address h_data
     // in host memory
     // NOTE(review): cudaMemcpyToArray is marked deprecated in newer
     // toolkits; cudaMemcpy2DToArray is the usual replacement — confirm.
     if (err == cudaSuccess)
          err = cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
                                  cudaMemcpyHostToDevice);

     // Set texture reference parameters
     texRef.addressMode[0] = cudaAddressModeWrap;
     texRef.addressMode[1] = cudaAddressModeWrap;
     texRef.filterMode = cudaFilterModeLinear;
     texRef.normalized = true;

     // Bind the array to the texture reference
     if (err == cudaSuccess) {
          err = cudaBindTextureToArray(texRef, cuArray, channelDesc);
          bound = (err == cudaSuccess);
     }

     // Allocate result of transformation in device memory
     // (sizeof first, so the product is evaluated in size_t)
     if (err == cudaSuccess)
          err = cudaMalloc(&output, sizeof(float) * width * height);

     // Invoke kernel
     if (err == cudaSuccess) {
          dim3 dimBlock(16, 16);
          dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                       (height + dimBlock.y - 1) / dimBlock.y);
          transformKernel<<<dimGrid, dimBlock>>>(output, width, height,
                                                 angle);
          // A launch does not return a status; query it explicitly.
          err = cudaGetLastError();
     }

     // Wait for the kernel so that execution errors are reported and the
     // texture is no longer in use when it is released below.
     if (err == cudaSuccess)
          err = cudaDeviceSynchronize();

     // Release whatever was acquired, also on the failure paths
     if (bound)
          cudaUnbindTexture(texRef);
     if (cuArray)
          cudaFreeArray(cuArray);
     if (output)
          cudaFree(output);
     return err == cudaSuccess ? 0 : 1;
}

紋理對象

// Simple transformation kernel: samples the texture object texObj at
// coordinates rotated by `theta` about the texture centre (0.5, 0.5 in
// normalized coordinates) and writes one float per output pixel.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
// outside the width x height image return without writing.
//   output        - device buffer of at least width * height floats, row-major
//   texObj        - texture object to sample, fetched as float
//   width, height - output image size in pixels
//   theta         - rotation angle in radians
__global__ void transformKernel(float* output,
                                cudaTextureObject_t texObj,
                                int width, int height,
                                float theta)
{
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // The grid is rounded up to whole blocks, so edge blocks contain threads
     // past the image border; they must not write to output.
     if (x >= (unsigned int)width || y >= (unsigned int)height)
          return;

     // Calculate normalized texture coordinates
     float u = x / (float)width;
     float v = y / (float)height;

     // Transform coordinates: rotate about the centre of the texture
     const float cosTheta = cosf(theta);
     const float sinTheta = sinf(theta);
     u -= 0.5f;
     v -= 0.5f;
     float tu = u * cosTheta - v * sinTheta + 0.5f;
     float tv = v * cosTheta + u * sinTheta + 0.5f;

     // Read from texture and write to global memory; the index is computed
     // in size_t so that large images do not overflow 32-bit arithmetic.
     output[(size_t)y * width + x] = tex2D<float>(texObj, tu, tv);
}

// Host code
// Uploads h_data into a CUDA array, wraps it in a texture object, runs
// transformKernel and releases every resource that was acquired.
// Returns 0 on success and 1 if any CUDA call or the kernel failed.
// NOTE: width, height, size, h_data and angle are not declared in this
// excerpt; they are assumed to be defined elsewhere.
int main()
{
     cudaError_t err = cudaSuccess;
     cudaArray* cuArray = NULL;
     float* output = NULL;
     cudaTextureObject_t texObj = 0;
     bool texCreated = false;

     // Allocate CUDA array in device memory
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
                                                               cudaChannelFormatKindFloat);
     err = cudaMallocArray(&cuArray, &channelDesc, width, height);

     // Copy to device memory some data located at address h_data
     // in host memory
     // NOTE(review): cudaMemcpyToArray is marked deprecated in newer
     // toolkits; cudaMemcpy2DToArray is the usual replacement — confirm.
     if (err == cudaSuccess)
          err = cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
                                  cudaMemcpyHostToDevice);

     // Specify texture
     struct cudaResourceDesc resDesc;
     memset(&resDesc, 0, sizeof(resDesc));
     resDesc.resType = cudaResourceTypeArray;
     resDesc.res.array.array = cuArray;

     // Specify texture object parameters
     struct cudaTextureDesc texDesc;
     memset(&texDesc, 0, sizeof(texDesc));
     texDesc.addressMode[0] = cudaAddressModeWrap;
     texDesc.addressMode[1] = cudaAddressModeWrap;
     texDesc.filterMode = cudaFilterModeLinear;
     texDesc.readMode = cudaReadModeElementType;
     texDesc.normalizedCoords = 1;

     // Create texture object
     if (err == cudaSuccess) {
          err = cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
          texCreated = (err == cudaSuccess);
     }

     // Allocate result of transformation in device memory
     // (sizeof first, so the product is evaluated in size_t)
     if (err == cudaSuccess)
          err = cudaMalloc(&output, sizeof(float) * width * height);

     // Invoke kernel
     if (err == cudaSuccess) {
          dim3 dimBlock(16, 16);
          dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                       (height + dimBlock.y - 1) / dimBlock.y);
          transformKernel<<<dimGrid, dimBlock>>>(output,
                                                 texObj, width, height,
                                                 angle);
          // A launch does not return a status; query it explicitly.
          err = cudaGetLastError();
     }

     // Wait for the kernel so that execution errors are reported and the
     // texture object is no longer in use when it is destroyed below.
     if (err == cudaSuccess)
          err = cudaDeviceSynchronize();

     // Destroy texture object
     if (texCreated)
          cudaDestroyTextureObject(texObj);

     // Free device memory, also on the failure paths
     if (cuArray)
          cudaFreeArray(cuArray);
     if (output)
          cudaFree(output);
     return err == cudaSuccess ? 0 : 1;
}

特性

紋理對象

紋理引用

代碼實例（摘自CUDA_C_Programming_Guide）

Android.Camera2 API

Occupancy

細分着色器

共享內存

流

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結