如何在结构中嵌入CUDA纹理对象

How to embed CUDA Texture Objects in structs?

本文关键字:CUDA 纹理 对象 结构      更新时间:2023-10-16

我们已经成功地使用下面的帖子来帮助创建包含int*等基本类型的结构。纹理为只读阵列提供了很好的性能提升。我们使用了其中的许多,这使得内核和内核子函数的参数列表又长又复杂。我们希望将纹理嵌入结构中,以减少参数长度和复杂性。

复制包含指向CUDA设备的指针的结构

下面是一个代码片段,展示了我们所采用的做法。它可以通过编译,但在运行时崩溃。

// Initialize the texture description: zero every field, then ask for
// elements to be read back as their stored element type.
// NOTE(review): `textureDescription` is not declared in this snippet;
// presumably a cudaTextureDesc declared elsewhere -- confirm.
memset(&textureDescription, 0, sizeof(textureDescription));
textureDescription.readMode = cudaReadModeElementType;
// Create the texture object from a linear buffer.
// NOTE(review): `intArray` and `count` are not declared here; presumably a
// device pointer to `count` ints -- confirm against the caller.
cudaTextureObject_t texture = 0;
cudaResourceDesc resource;
memset(&resource, 0, sizeof(resource));
resource.resType = cudaResourceTypeLinear;
resource.res.linear.devPtr = intArray;
resource.res.linear.desc.f = cudaChannelFormatKindSigned;
resource.res.linear.desc.x = 32; // bits per channel
resource.res.linear.sizeInBytes = count*sizeof(int);
// The resource descriptor is passed by address: the API expects a
// `const cudaResourceDesc*`, not the structure itself.
cudaCreateTextureObject(&texture, &resource, &textureDescription, NULL);
// These declarations are in the .h file
// Wrapper that carries a texture object handle as a structure member.
typedef struct SampleStructure
{
    cudaTextureObject_t texture; // texture object handle
} SampleStructure;
SampleStructure *structureHost;
SampleStructure *structureDevice;
// Create host and device structures
structureHost = (SampleStructure *)malloc(sizeof(SampleStructure));
cudaMalloc(&structureDevice, sizeof(SampleStructure));
// Assign the texture object to the host structure
structureHost->texture = texture;
// Copy the host structure to Global Memory
// (the stray closing parenthesis that made this line a syntax error is removed)
cudaMemcpy(structureDevice, structureHost, sizeof(SampleStructure), cudaMemcpyHostToDevice);
// Pass Texture and Texture-embedded-in-structure to kernel
// (launch name corrected to match the `kernel` definition)
kernel<<<1,1>>>(texture, structureDevice);
...
// Demonstration kernel: fetches element `index` once through the texture
// handle passed as an argument and once through the handle stored in the
// structure reached via `structureDevice`.
// NOTE(review): `value` and `index` are not declared in this snippet.
__global__ void kernel(cudaTextureObject_t texture, SampleStructure *structureDevice)
{
    // Handle received directly as a kernel argument.
    value = tex1Dfetch<int>(texture, index); // Runs successfully at runtime

    // Handle read through the structure pointer.
    const cudaTextureObject_t embeddedTexture = structureDevice->texture;
    value = tex1Dfetch<int>(embeddedTexture, index); // Crashes at runtime
}

在内核代码(或子函数)中使用"纹理"变量时正确运行。当使用"structureDevice->texture"时,它会在运行时崩溃。

有人能展示一个简单的代码来展示如何成功地将纹理对象嵌入到传递给内核并在不崩溃的情况下运行的结构中吗?或者有人能指出我们提供的代码中可能存在的错误吗?

通过值传递结构得到一个有效的解决方案。以下是使其工作的等效代码。感谢@talonmies的建议。

虽然结构可以简化参数列表,但它可能会减慢执行速度,因为系统必须访问全局内存两次而不是一次:一次用于读取结构,一次用于读取纹理。为了提高性能,可以将结构复制到共享内存中。在共享内存中使用该结构可以提高性能。

// Create the Texture Object
// NOTE(review): `texture`, `textureDescription`, `intArray` and `count` are
// not declared in this snippet; they are assumed to be declared and
// initialized before this point -- confirm.
cudaResourceDesc resource;
memset(&resource, 0, sizeof(resource));
resource.resType = cudaResourceTypeLinear;
resource.res.linear.devPtr = intArray;
resource.res.linear.desc.f = cudaChannelFormatKindSigned;
resource.res.linear.desc.x = 32; // bits per channel
resource.res.linear.sizeInBytes = count*sizeof(int);
// The resource descriptor is passed by address: the API expects a
// `const cudaResourceDesc*`, not the structure itself.
cudaCreateTextureObject(&texture, &resource, &textureDescription, NULL);
// These structure declarations are in the .h file
// Wrapper that carries a texture object handle as a structure member.
typedef struct SampleStructure
{
    cudaTextureObject_t texture; // texture object handle
} SampleStructure;
SampleStructure structureHost;
// Assign the texture object to the host structure
structureHost.texture = texture;
// Pass Texture and Texture-object-embedded-in-structure to kernel.
// The structure is passed by value, so no device-side copy of it is
// allocated here. (Launch name corrected to match the `kernel` definition.)
kernel<<<1,1>>>(texture, structureHost);
...
// Demonstration kernel: receives the structure by value, stages it in
// shared memory, then fetches element `index` through both the directly
// passed texture handle and the handle stored in the shared-memory copy.
// NOTE(review): `value` and `index` are not declared in this snippet.
__global__ void
kernel(cudaTextureObject_t texture, SampleStructure structureDevice) {
    __shared__ SampleStructure structureSharedMemory;
    // Copy the structure to shared memory for faster access
    if (threadIdx.x == 0)
       structureSharedMemory = structureDevice;
    // Block-wide barrier, placed outside the branch so every thread reaches
    // it. A memory fence alone (__threadfence_block) only orders the writing
    // thread's memory operations; it does not make the other threads wait
    // for the copy above to finish before they read the shared structure.
    __syncthreads();
    value = tex1Dfetch<int>(texture, index); // Runs successfully at runtime
    value = tex1Dfetch<int>(structureSharedMemory.texture, index); // Runs successfully at runtime
}