特性
紋理內存是隻讀的
紋理內存可以通過歸一化座標或非歸一化座標訪問;使用非歸一化座標時,只支持clamp和border兩種尋址模式(wrap和mirror要求使用歸一化座標)。另外,通過tex1Dfetch()以整數座標訪問綁定到線性內存的紋理時,不會進行filter
當訪問紋理內存的模式具有較大的 “ 空間局部性 ” 時,性能會得到改善。這裏的空間局部性不只是像素在水平方向(內存地址)上的連續性,也包括了豎直方向上的連續性
紋理內存爲streaming fetch(即同一個warp中的所有thread同時訪問具有空間局部性的一塊內存)做了優化
即使全局內存中的數據爲8位或16位的整型,當作爲紋理內存進行訪問時,也能將數據轉換爲32位浮點類型(歸一化到[0.0, 1.0]或[-1.0, 1.0]範圍)
不論是紋理引用還是紋理對象的創建,都需要先在全局內存創建一個內存區域,然後將該內存與紋理對象或紋理引用綁定,最後kernel通過紋理對象或紋理引用訪問紋理數據
如果在訪問紋理期間修改與其綁定的全局內存,除非在讀取紋理時發生cache miss,否則讀取到的像素數據和全局內存中的不一致
Packed data may be broadcast to separate variables in a single operation;
紋理對象
紋理對象在運行時創建,並且在創建時與紋理內存進行綁定
// Creates a texture object at run time from a resource description (what
// memory backs the texture) and a texture description (how it is sampled).
// The texture-object example in this document passes NULL for pResViewDesc.
__host__ cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject,
                                             const cudaResourceDesc* pResDesc,
                                             const cudaTextureDesc* pTexDesc,
                                             const cudaResourceViewDesc* pResViewDesc);

// Sampling parameters of a texture object.
struct cudaTextureDesc {
    enum cudaTextureAddressMode addressMode[3];    // out-of-range addressing per dimension: clamp? wrap? ...
    enum cudaTextureFilterMode  filterMode;        // linear? point?
    enum cudaTextureReadMode    readMode;          // whether unsigned / signed integer texels are
                                                   // normalized to floats in [0,1] / [-1,1]
    int                         sRGB;
    int                         normalizedCoords;  // whether the texture is addressed with normalized coordinates
    unsigned int                maxAnisotropy;
    enum cudaTextureFilterMode  mipmapFilterMode;
    float                       mipmapLevelBias;
    float                       minMipmapLevelClamp;
    float                       maxMipmapLevelClamp;
};

// Describes the memory that backs a texture object. resType selects which
// member of the `res` union is valid, which is why the example code writes
// `resDesc.res.array.array = cuArray`.
struct cudaResourceDesc {
    enum cudaResourceType resType;                 // kind of source data: array? linear? mipmapped array? pitch2D?
    union {
        struct {
            cudaArray_t array;                     // source data (CUDA array)
        } array;
        struct {
            cudaMipmappedArray_t mipmap;           // source data (mipmapped CUDA array)
        } mipmap;
        struct {
            void* devPtr;                          // source data (linear device memory)
            struct cudaChannelFormatDesc desc;
            size_t sizeInBytes;
        } linear;
        struct {
            void* devPtr;                          // source data (pitched 2D device memory)
            struct cudaChannelFormatDesc desc;
            size_t width;
            size_t height;
            size_t pitchInBytes;                   // distance in bytes between two consecutive rows
        } pitch2D;
    } res;
};
紋理對象可作爲kernel參數傳入
// Placeholder kernel: shows a cudaTextureObject_t being received as an ordinary kernel parameter.
__global__ void kernel(cudaTextureObject_t texObj) { ... }
紋理內存的訪問都是模板函數,以下函數的返回值可能是float,int,unsigned int …
tex1Dfetch() tex1D() tex2D() ... tex2DLayered() /* 訪問layered texture中的某一層(layer) */ texCubemap() texCubemapLayered() tex2Dgather()
紋理引用
紋理引用在編譯時創建,它不能作爲參數傳給kernel,必須聲明爲靜態全局變量
// General form of a texture reference declaration: the three template
// arguments are the texel data type, the texture type and the read mode.
texture<DataType, Type, ReadMode> texRef;

// Concrete example of the form above: unsigned char with 4 channels, 2D,
// integer channel data is NOT converted to floating point.
texture<uchar4, cudaTextureType2D, cudaReadModeElementType> texRef;
紋理引用的其他屬性在運行時可改,這些mutable屬性繼承於textureReference類型
// Run-time mutable attributes of a texture reference.
struct textureReference {
    int                          normalized;
    enum cudaTextureFilterMode   filterMode;
    enum cudaTextureAddressMode  addressMode[3];
    struct cudaChannelFormatDesc channelDesc;   // format of the data returned by a texel fetch
    int                          sRGB;
    unsigned int                 maxAnisotropy;
    enum cudaTextureFilterMode   mipmapFilterMode;
    float                        mipmapLevelBias;
    float                        minMipmapLevelClamp;
    float                        maxMipmapLevelClamp;
};

// Texel channel format. The example code builds one with
// cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat), i.e. a
// single 32-bit float channel, so x/y/z/w are per-channel sizes in bits.
struct cudaChannelFormatDesc {
    int x, y, z, w;
    enum cudaChannelFormatKind f;
};
紋理引用在被訪問前必須綁定到一塊設備端內存,多個紋理引用可以綁定到同一塊紋理內存
// Binds a CUDA array to a texture reference; the texture reference can be
// read in a kernel only after it has been bound.
template<class T, int dim, enum cudaTextureReadMode readMode>
cudaError_t cudaBindTextureToArray(const struct texture<T, dim, readMode>& tex,
                                   const struct cudaArray* array);

// Overload taking an explicit channel format; this is the form the
// texture-reference example calls: cudaBindTextureToArray(texRef, cuArray, channelDesc).
template<class T, int dim, enum cudaTextureReadMode readMode>
cudaError_t cudaBindTextureToArray(const struct texture<T, dim, readMode>& tex,
                                   const struct cudaArray* array,
                                   const struct cudaChannelFormatDesc& desc);
紋理引用可以通過cudaUnbindTexture()與紋理內存解綁定
代碼實例(摘自CUDA_C_Programming_Guide)
紋理引用
// 2D float texture reference, read without integer-to-float conversion.
// NOTE: the texture reference API is deprecated and was removed in CUDA 12;
// new code should use texture objects instead.
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

// Simple transformation kernel.
//
// Each thread handles one output pixel (x, y): it converts the pixel position
// to normalized coordinates, rotates it by `theta` around the texture centre
// (0.5, 0.5) and writes the texel sampled from texRef to output[y * width + x].
//
// Launch layout: 2D grid of 2D blocks covering at least width x height threads.
// output must hold width * height floats.
__global__ void transformKernel(float* output, int width, int height, float theta)
{
    // Pixel handled by this thread
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so threads beyond the image
    // edge must not write: they would index past the end of `output`.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Calculate normalized texture coordinates
    float u = x / (float)width;
    float v = y / (float)height;

    // Transform coordinates: rotate around the centre of the texture
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // Read from texture and write to global memory
    output[y * width + x] = tex2D(texRef, tu, tv);
}

// Host code.
// width, height, h_data, size and angle are not defined in this excerpt; they
// are expected to be provided by the surrounding program.
int main()
{
    // Allocate CUDA array in device memory (one 32-bit float channel)
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);

    // Copy to device memory some data located at address h_data in host memory.
    // NOTE(review): cudaMemcpyToArray is deprecated in recent CUDA releases;
    // cudaMemcpy2DToArray is the replacement -- confirm against the toolkit in use.
    cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);

    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode     = cudaFilterModeLinear;
    texRef.normalized     = true;

    // Bind the array to the texture reference
    cudaBindTextureToArray(texRef, cuArray, channelDesc);

    // Allocate result of transformation in device memory
    float* output;
    cudaMalloc(&output, width * height * sizeof(float));

    // Invoke kernel; the grid is ceil-divided so it covers the whole image
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width  + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);

    // The launch is asynchronous: collect any launch error, then wait for the
    // kernel to finish before releasing the resources it reads and writes.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();

    // Unbind the texture and free device memory
    cudaUnbindTexture(texRef);
    cudaFreeArray(cuArray);
    cudaFree(output);

    return status == cudaSuccess ? 0 : 1;
}
紋理對象
// Simple transformation kernel (texture object version).
//
// Each thread handles one output pixel (x, y): it converts the pixel position
// to normalized coordinates, rotates it by `theta` around the texture centre
// (0.5, 0.5) and writes the texel sampled from texObj to output[y * width + x].
//
// Launch layout: 2D grid of 2D blocks covering at least width x height threads.
// output must hold width * height floats.
__global__ void transformKernel(float* output, cudaTextureObject_t texObj,
                                int width, int height, float theta)
{
    // Pixel handled by this thread
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so threads beyond the image
    // edge must not write: they would index past the end of `output`.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Calculate normalized texture coordinates
    float u = x / (float)width;
    float v = y / (float)height;

    // Transform coordinates: rotate around the centre of the texture
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // Read from texture and write to global memory
    output[y * width + x] = tex2D<float>(texObj, tu, tv);
}

// Host code.
// width, height, h_data, size and angle are not defined in this excerpt; they
// are expected to be provided by the surrounding program.
int main()
{
    // Allocate CUDA array in device memory (one 32-bit float channel)
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);

    // Copy to device memory some data located at address h_data in host memory.
    // NOTE(review): cudaMemcpyToArray is deprecated in recent CUDA releases;
    // cudaMemcpy2DToArray is the replacement -- confirm against the toolkit in use.
    cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);

    // Specify texture: the resource backing it is the CUDA array
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType         = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // Specify texture object parameters
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;

    // Create texture object
    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

    // Allocate result of transformation in device memory
    float* output;
    cudaMalloc(&output, width * height * sizeof(float));

    // Invoke kernel; the grid is ceil-divided so it covers the whole image
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width  + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    transformKernel<<<dimGrid, dimBlock>>>(output, texObj, width, height, angle);

    // The launch is asynchronous: collect any launch error, then wait for the
    // kernel to finish before destroying the texture object it samples from.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();

    // Destroy texture object
    cudaDestroyTextureObject(texObj);

    // Free device memory
    cudaFreeArray(cuArray);
    cudaFree(output);

    return status == cudaSuccess ? 0 : 1;
}