Cuda:具有位集数组的 XOR 单位集

Cuda: XOR single bitset with array of bitsets

本文关键字：数组 XOR 单位 Cuda 更新时间：2023-10-16

我想用一堆其他位集(~100k(对单个位集进行XOR运算，并计算每个异或结果的设置位。单个位集的大小约为 20k 位。

位集已经转换为unsigned int数组，以便能够使用固有的__popc()函数。"一堆"已经连续驻留在设备内存中。

我当前的内核代码如下所示：

// Grid/Blocks used for kernel invocation 
dim3 block(32); 
dim3 grid((bunch_size / 31) + 32);
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < bunch_size){      // 1 Thread for each bitset in the 'bunch'
int sum = 0;
uint xor_res = 0;
for (int i = 0; i < bitset_size; ++i){  // Iterate through every uint-block of the bitsets
xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
sum += __popc(xor_res);
}
set_bits[tid] = sum;
}
}

但是，与并行化的c ++/boost版本相比，我认为使用Cuda没有任何好处。

优化这个内核有什么潜力吗？

优化这个内核有什么潜力吗？

我在这里看到 2 个问题(它们是任何 CUDA 程序员的前两个经典主要优化目标(：

您希望尝试有效地使用全局内存。您对bitset和bunch的访问权限不会合并。 (高效使用内存子系统(
通常不建议每个块使用 32 个线程，这可能会限制您的整体占用率。每个位集一个线程也可能存在问题。(公开足够的并行性(

如果没有比较测试用例，解决这些问题是否符合您对收益的定义是不可能说的。此外，像这样的简单内存绑定问题在 CUDA 中很少有趣。但是，我们可以(可能(提高内核的性能。

我们将使用一系列想法：

让每个块处理一个位集，而不是每个线程，以启用合并
使用共享内存加载比较位集，并重用它
使用足够的块使 GPU 饱和，以及跨步循环
使用const ... __restrict__样式装饰可能从RO缓存中受益

这是一个工作示例：

$ cat t1649.cu
#include <iostream>
#include <cstdlib>
const int my_bitset_size = 20000/(32);
const int my_bunch_size = 100000;
typedef unsigned uint;
//using one thread per bitset in the bunch
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < bunch_size){      // 1 Thread for each bitset in the 'bunch'
int sum = 0;
uint xor_res = 0;
for (int i = 0; i < bitset_size; ++i){  // Iterate through every uint-block of the bitsets
xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
sum += __popc(xor_res);
}
set_bits[tid] = sum;
}
}
const int nTPB = 256;
// one block per bitset, multiple bitsets per block
__global__ void kernelXOR_imp(const uint * __restrict__  bitset, const uint * __restrict__  bunch, int * __restrict__  set_bits, int bitset_size, int bunch_size) {
__shared__ uint sbitset[my_bitset_size];  // could also be dynamically allocated for varying bitset sizes
__shared__ int ssum[nTPB];
// load shared, block-stride loop
for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) sbitset[idx] = bitset[idx];
__syncthreads();
// stride across all bitsets in bunch
for (int bidx = blockIdx.x; bidx < bunch_size; bidx += gridDim.x){
int my_sum = 0;
for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) my_sum += __popc(sbitset[idx] ^ bunch[bidx*bitset_size + idx]);
// block level parallel reduction
ssum[threadIdx.x] = my_sum;
for (int ridx = nTPB>>1; ridx > 0; ridx >>=1){
__syncthreads();
if (threadIdx.x < ridx) ssum[threadIdx.x] += ssum[threadIdx.x+ridx];}
if (!threadIdx.x) set_bits[bidx] = ssum[0];}
}

int main(){
// data setup
uint *d_cbitset, *d_bitsets, *h_cbitset, *h_bitsets;
int *d_r, *h_r, *h_ri;
h_cbitset = new uint[my_bitset_size];
h_bitsets = new uint[my_bitset_size*my_bunch_size];
h_r  = new int[my_bunch_size];
h_ri = new int[my_bunch_size];
for (int i = 0; i < my_bitset_size*my_bunch_size; i++){
h_bitsets[i] = rand();
if (i < my_bitset_size) h_cbitset[i] = rand();}
cudaMalloc(&d_cbitset, my_bitset_size*sizeof(uint));
cudaMalloc(&d_bitsets, my_bitset_size*my_bunch_size*sizeof(uint));
cudaMalloc(&d_r,  my_bunch_size*sizeof(int));
cudaMemcpy(d_cbitset, h_cbitset, my_bitset_size*sizeof(uint), cudaMemcpyHostToDevice);
cudaMemcpy(d_bitsets, h_bitsets, my_bitset_size*my_bunch_size*sizeof(uint), cudaMemcpyHostToDevice);
// original
// Grid/Blocks used for kernel invocation
dim3 block(32);
dim3 grid((my_bunch_size / 31) + 32);
kernelXOR<<<grid, block>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
cudaMemcpy(h_r, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);

// improved
dim3 iblock(nTPB);
dim3 igrid(640);
kernelXOR_imp<<<igrid, iblock>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
cudaMemcpy(h_ri, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < my_bunch_size; i++)
if (h_r[i] != h_ri[i]) {std::cout << "mismatch at i: " << i << " was: " << h_ri[i] << " should be: " << h_r[i] << std::endl; return 0;}
std::cout << "Results match." << std::endl;
return 0;
}
$ nvcc -o t1649 t1649.cu
$ cuda-memcheck ./t1649
========= CUDA-MEMCHECK
Results match.
========= ERROR SUMMARY: 0 errors
$ nvprof ./t1649
==18868== NVPROF is profiling process 18868, command: ./t1649
Results match.
==18868== Profiling application: ./t1649
==18868== Profiling result:
Type  Time(%)      Time     Calls       Avg       Min       Max  Name
GPU activities:   97.06%  71.113ms         2  35.557ms  2.3040us  71.111ms  [CUDA memcpy HtoD]
2.26%  1.6563ms         1  1.6563ms  1.6563ms  1.6563ms  kernelXOR(unsigned int*, unsigned int*, int*, int, int)
0.59%  432.68us         1  432.68us  432.68us  432.68us  kernelXOR_imp(unsigned int const *, unsigned int const *, int*, int, int)
0.09%  64.770us         2  32.385us  31.873us  32.897us  [CUDA memcpy DtoH]
API calls:   78.20%  305.44ms         3  101.81ms  11.373us  304.85ms  cudaMalloc
18.99%  74.161ms         4  18.540ms  31.554us  71.403ms  cudaMemcpy
1.39%  5.4121ms         4  1.3530ms  675.30us  3.3410ms  cuDeviceTotalMem
1.26%  4.9393ms       388  12.730us     303ns  530.95us  cuDeviceGetAttribute
0.11%  442.37us         4  110.59us  102.61us  125.59us  cuDeviceGetName
0.03%  128.18us         2  64.088us  21.789us  106.39us  cudaLaunchKernel
0.01%  35.764us         4  8.9410us  2.9670us  18.982us  cuDeviceGetPCIBusId
0.00%  8.3090us         8  1.0380us     540ns  1.3870us  cuDeviceGet
0.00%  5.9530us         3  1.9840us     310ns  3.9900us  cuDeviceGetCount
0.00%  2.8800us         4     720ns     574ns     960ns  cuDeviceGetUuid
$

在这种情况下，在我的特斯拉 V100 上，对于您的问题大小，我见证了内核性能的大约 4 倍提升。但是，与数据移动的成本相比，这里的内核性能微不足道。因此，如果这是您在 GPU 上唯一做的事情，那么这些优化不太可能对您的比较测试用例产生重大影响。

上面的代码在块级别和网格级别使用跨步循环，这意味着它应该在几乎所有选择的线程块大小(请是 32 的倍数(以及网格大小的情况下正确运行。这并不意味着任何/所有选择都会表现相同。线程块大小的选择是为了允许几乎完全占用的可能性(因此不要选择 32(。网格大小的选择是实现每个SM完全占用的块数乘以SM的数量。这些应该是接近最优的选择，但根据我的测试，例如，更多的块并没有真正降低性能，并且几乎任何线程块大小(32 个除外(的性能应该大致恒定，假设块的数量是相应地计算的。