谁能从标记文档中给我一个此代码的主程序
Who can give me a main program for this code from the documentation of mark?
我这里有马克哈里斯记录的代码(并行缩减)。我是 cuda 编程的新手,我不知道如何为此代码制作主程序。请帮助我,谢谢。
这是代码:
template <unsigned int blockSize>
__global__ voidreduce6(int *g_idata, int *g_odata, unsigned int n)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
do{sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; } while (i < n);
__syncthreads();
if (blockSize >= 512) {if(tid<256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) {if(tid<128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) {if(tid< 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
if (tid < 32){
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
CUDA 示例代码中给出了使用此内核的完整示例代码。
请参阅文件 reduction_kernel.cu。 内核启动的包装器函数将根据特定的缩减方法选择一个内核,并且内核也针对线程块大小进行模板化:
case 6:
default:
if (isPow2(size))
{
switch (threads)
{
case 512:
reduce6<T, 512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce6<T, 256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce6<T, 128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce6<T, 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce6<T, 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce6<T, 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce6<T, 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce6<T, 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce6<T, 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce6<T, 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduce6<T, 512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 256:
reduce6<T, 256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 128:
reduce6<T, 128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 64:
reduce6<T, 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 32:
reduce6<T, 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 16:
reduce6<T, 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 8:
reduce6<T, 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
reduce6<T, 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce6<T, 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce6<T, 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
}
}
break;
}
}
// Instantiate the reduction function for 3 types
template void
reduce<int>(int size, int threads, int blocks,
int whichKernel, int *d_idata, int *d_odata);
template void
reduce<float>(int size, int threads, int blocks,
int whichKernel, float *d_idata, float *d_odata);
template void
reduce<double>(int size, int threads, int blocks,
int whichKernel, double *d_idata, double *d_odata);
相关文章:
- 如何创建一个CMake变量,除非显式重写,否则使用默认值
- 删除一个线程上有数百万个字符串的大型哈希映射会影响另一个线程的性能
- 为什么两个不同的未命名名称空间可以共存于一个cpp文件中
- 运行同一解决方案的另一个项目的项目
- 挂起和取消挂起一个文件DLL
- 用C++中的一个变量定义一个常量
- 函数向量_指针有不同的原型,我可以构建一个吗
- 在c++中用vector填充一个简单的动态数组
- 如何在选项卡视图Qt中设置一个新项目,并保存以前的项目
- 预处理器:插入结构名称中的前一个行号
- 我在c++代码中生成了一个运行时#3异常
- 我想将一个对T类型的非常量左值引用绑定到一个T类型的临时值
- 从链接列表c++中删除一个项目
- 告诉一个 const char 数组,除了编译时 C 样式的字符串外,它不以 '