使用cuda选择基数

radix select using cuda

本文关键字：选择 cuda 使用更新时间：2023-10-16

我一直在使用CUDA开发基数选择，它利用k个最小元素对给定数量的元素进行排序。这种基数选择背后的主要思想是扫描从MSB到LSB的32位整数。它将左侧的所有0位和右侧的所有1位进行分区。包含k个最小元素的边被递归求解。我的分区过程运行得很好，但我在处理递归函数调用时遇到了问题。我无法停止递归。请帮帮我！我的内核函数如下：这是kernel.h

  #include "header.h"
  #define WARP_SIZE 32
  #define BLOCK_SIZE 32
__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
int WarpID = threadID >> 5;
int LocWarpID = threadID - 32 * WarpID;
int NumWarps = N / WARP_SIZE;
int pivot;
__shared__ int DataPartition[BLOCK_SIZE];
__shared__ int DataBinary[WARP_SIZE];
for(int i = 0; i < NumWarps; i++)
{
    if(LocWarpID >= firstidx && LocWarpID <=lastidx)
    {
        int r = d_DataIn[i * WARP_SIZE + LocWarpID];
        int p = (r>>(31-bit))&1;
        unsigned int B = __ballot(p);
        unsigned int B_flip = ~B;
        if(p==1)
        {
            int b = B << (32-LocWarpID);
            int RightLoc = __popc(b);
            DataPartition[lastidx - RightLoc] = r;
        }
        else
        {
            int b_flip = B_flip << (32 - LocWarpID);
            int LeftLoc = __popc(b_flip);
            DataPartition[LeftLoc] = r;
        }
    if(LocWarpID <= lastidx - __popc(B))
        {
            d_DataIn[LocWarpID] = DataPartition[LocWarpID];
        }
    else
        {
            d_DataIn[LocWarpID] = DataPartition[LocWarpID];
        }
        pivot = lastidx - __popc(B); 
        return pivot+1;
    }
}   
}

__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
if(firstidx == lastidx)
    return *d_DataIn;
int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit);
int length = q - firstidx;
if(k == length)
    return *d_DataIn; 
else if(k < length)
    return RadixSelect(d_DataIn, firstidx, q-1, k, N, bit+1);
else
    return RadixSelect(d_DataIn, q, lastidx, k-length, N, bit+1);
}
 __global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit);
}

主机代码是main.cu，看起来像：

#include "header.h"
#include <iostream>
#include <fstream>
#include "kernel.h"
#define BLOCK_SIZE 32
using namespace std;
int main()
{
int N = 32;
thrust::host_vector<float>h_HostFloat(N);
thrust::counting_iterator <unsigned int> Numbers(0);
thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(),         RandomFloatNumbers(1.f, 100.f));
thrust::host_vector<int>h_HostInt(N);
thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt());
thrust::device_vector<float>d_DeviceFloat = h_HostFloat;
thrust::device_vector<int>d_DeviceInt(N);   
thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt());
int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data());
int *h_DataOut;
float *h_DataOut1;
int fsize = N * sizeof(float);
int size = N * sizeof(int);
h_DataOut = new int[size];
h_DataOut1 = new float[fsize];
int firstidx = 0;
int lastidx = BLOCK_SIZE-1;
int k = 20;
int bit = 1;
int NUM_BLOCKS = N / BLOCK_SIZE;
radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit);
cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost);  
WriteData(h_DataOut1, h_DataOut, 10, N);

return 0;
   }

我使用的标题列表：

   #include "cuda.h"
   #include "cuda_runtime_api.h"
   #include "device_launch_parameters.h"
   #include <thrust/host_vector.h>
   #include <thrust/device_vector.h>
   #include <thrust/transform.h>
   #include <thrust/generate.h>
   #include "functor.h"
   #include <thrust/iterator/counting_iterator.h>
   #include <thrust/copy.h>
   #include <thrust/device_ptr.h>

另一个头文件"functor.h"，用于将浮点数转换为int类型并生成随机浮点数。

  #include <thrust/random.h>
  #include <sstream>
  #include <fstream>
  #include <iomanip>
  struct RandomFloatNumbers
   {
float a, b;
__host__ __device__
    RandomFloatNumbers(float _a, float _b) : a(_a), b(_b) {};  
__host__ __device__
    float operator() (const unsigned int n) const{
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<float> dist(a,b);
        rng.discard(n);
        return dist(rng);
}
    };
   struct FloatToInt
    {
__host__ __device__
    int operator() (const float &x)
const {
    union {
        float f_value;
        int i_value;
    } value;
    value.f_value = x;
    return value.i_value;
}
    };
   float IntToFloat(int &x)
     {
    union{
          float f_value;
          int i_value;
        }value;
        value.i_value = x;
        return value.f_value;
     }
   bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
    {
std::ofstream data;
std::stringstream file;
file << "out\Partition_";
file << std::setfill('0') <<std::setw(2) << bit;
file << ".txt";
data.open((file.str()).c_str());
if(data.is_open() ==  false)
{
    std::cout << "File is not open" << std::endl;
    return false;
}
    for(int i = 0; i < N; i++)
    {
        h_DataOut1[i] = IntToFloat(h_DataOut[i]);
        //cout << h_HostFloat[i] << " t" <<  h_DataOut1[i] << endl;
        //std::bitset<32>bitshift(h_DataOut[i]&1<<31-bit);
        //data << bitshift[31-bit] << "t" <<h_DataOut1[i] <<std::endl;
        data << h_DataOut1[i] << std::endl;
    }
    data << std::endl;
data.close();
std::cout << "Partition=" <<bit <<"n"; 
return true;
    }

根据您的请求，我发布了我用来调查此事的代码，并帮助我研究您的代码。

#include <stdio.h>
#include <stdlib.h>
__device__ int gpu_partition(unsigned int *data, unsigned int *partition, unsigned int *ones, unsigned int* zeroes, int bit, int idx, unsigned int* warp_ones){
  int one = 0;
  int valid = 0;
  int my_one, my_zero;
  if (partition[idx]){
    valid = 1;
    if(data[idx] & (1ULL<<(31-bit))) one=1;}
  __syncthreads();
  if (valid){
    if (one){
      my_one=1;
      my_zero=0;}
    else{
      my_one=0;
      my_zero=1;}
    }
  else{
    my_one=0;
    my_zero=0;}
  ones[idx]=my_one;
  zeroes[idx]=my_zero;
  unsigned int warp_one = __popc(__ballot(my_one));
  if (!(threadIdx.x & 31))
    warp_ones[threadIdx.x>>5] = warp_one;
  __syncthreads();
  // reduce
  for (int i = 16; i > 0; i>>=1){
    if (threadIdx.x < i)
      warp_ones[threadIdx.x] += warp_ones[threadIdx.x + i];
    __syncthreads();}
  return warp_ones[0];
}
__global__ void gpu_radixkernel(unsigned int *data, unsigned int m, unsigned int n, unsigned int *result){
  __shared__ unsigned int loc_data[1024];
  __shared__ unsigned int loc_ones[1024];
  __shared__ unsigned int loc_zeroes[1024];
  __shared__ unsigned int loc_warp_ones[32];
  int l=0;
  int bit = 0;
  unsigned int u = n;
  if (n<2){
    if ((n == 1) && !(threadIdx.x)) *result = data[0];
    return;}
  loc_data[threadIdx.x] = data[threadIdx.x];
  loc_ones[threadIdx.x] = (threadIdx.x<n)?1:0;
  __syncthreads();
  unsigned int *next = loc_ones;
  do {
    int s = gpu_partition(loc_data, next, loc_ones, loc_zeroes, bit++, threadIdx.x, loc_warp_ones);
    if ((u-s) > m){
      u = (u-s);
      next = loc_zeroes;}
    else{
      l = (u-s);
      next = loc_ones;}}
    while ((u != l) && (bit<32));
  if (next[threadIdx.x]) *result = loc_data[threadIdx.x];
}
int partition(unsigned int *data, int l, int u, int bit){
  unsigned int *temp = (unsigned int *)malloc(((u-l)+1)*sizeof(unsigned int));
  int pos = 0;
  for (int i = l; i<=u; i++)
    if(data[i] & (1ULL<<(31-bit))) temp[pos++] = data[i];
  int result = u-pos;
  for (int i = l; i<=u; i++)
    if(!(data[i] & (1ULL<<(31-bit)))) temp[pos++] = data[i];
  pos = 0;
  for (int i = u; i>=l; i--)
    data[i] = temp[pos++];
  free(temp);
  return result;
}
unsigned int radixselect(unsigned int *data, int l, int u, int m, int bit){
  if (l == u) return(data[l]);
  if (bit > 32) {printf("radixselect fail!n"); return 0;}
  int s = partition(data, l, u, bit);
  if (s>=m) return radixselect(data, l, s, m, bit+1);
  return radixselect(data, s+1, u, m, bit+1);
}
int main(){
  unsigned int data[8] = {32767, 22, 88, 44, 99, 101, 0, 7};
  unsigned int data1[8];
  for (int i = 0; i<8; i++){
    for (int j=0; j<8; j++) data1[j] = data[j];
    printf("value[%d] = %dn", i, radixselect(data1, 0, 7, i, 0));}
  unsigned int *d_data;
  cudaMalloc((void **)&d_data, 1024*sizeof(unsigned int));
  unsigned int h_result, *d_result;
  cudaMalloc((void **)&d_result, sizeof(unsigned int));
  cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice);
  for (int i = 0; i < 8; i++){
    gpu_radixkernel<<<1,1024>>>(d_data, i, 8, d_result);
    cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("gpu result index %d = %dn", i, h_result);
    }
  unsigned int data2[1024];
  unsigned int data3[1024];
  for (int i = 0; i < 1024; i++) data2[i] = rand();
  cudaMemcpy(d_data, data2, 1024*sizeof(unsigned int), cudaMemcpyHostToDevice);
  for (int i = 0; i < 1024; i++){
    for (int j = 0; j<1024; j++) data3[j] = data2[j];
    unsigned int cpuresult = radixselect(data3, 0, 1023, i, 0);
    gpu_radixkernel<<<1,1024>>>(d_data, i, 1024, d_result);
    cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    if (h_result != cpuresult) {printf("mismatch at index %d, cpu: %d, gpu: %dn", i, cpuresult, h_result); return 1;}
    }
  printf("Finishedn");
  return 0;
}

以下是一些注意事项，没有特别的顺序：

我去掉了你所有的推力代码，就基数选择算法而言，它没有任何用处。我也觉得你选float到int很奇怪。我还没有考虑过在指数位序列后面跟着尾数位序列进行逐位基数选择的后果。它可能会起作用（尽管我认为如果你包括符号位，它肯定不会起作用），但我再次认为它不是理解算法的核心
我包含了一个主机版本，我只是为了检查我的设备结果而写的
我敢肯定，在某些情况下，如果元素重复，这种算法会失败。例如，如果你给它一个全为零的向量，我认为它会失败。不过，我认为处理那个案子并不困难
我的主机版本是递归的，但我的设备版本不是。我认为递归在这里没有那么有用，因为算法的非递归形式也很容易编写，尤其是因为最多有32位要遍历。尽管如此，如果您想创建递归设备版本，通过在分区函数中合并u、s和l操作代码应该不会很困难
我省去了典型的cuda错误检查。不过我还是推荐它
我不认为这是cuda编程的典范。如果你深入研究一个基数排序算法（比如这里），你会发现它非常复杂。快速GPU基数选择看起来一点也不像我的代码。我编写的代码类似于串行递归分区基数排序，这不是在大规模并行体系结构上实现这一点的最佳方式
由于基数选择不是一种排序，我试图编写一个不会对输入数据进行数据移动的设备代码，因为我认为这既昂贵又不必要。在内核开始时，我从全局内存中读取数据，然后在共享内存中完成所有工作，即使在共享内存内，我也不会重新排列数据（就像我在主机版本中所做的那样），以避免数据移动的成本。相反，我保留了1和0分区的标志数组，以供下一个分区步骤使用。数据移动将涉及相当数量的未恢复和/或银行冲突的流量，而标志数组允许所有访问都是非银行冲突的