推力矢量距离计算

thrust vector distance calculation

本文关键字：距离计算更新时间：2023-10-16

考虑以下数据集和质心。有 7 个个体和两个均值，每个均值有 8 个维度。它们是存储的行主顺序。

short dim = 8;
float centroids[] = {
    0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612, 
    0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
};   
float data[] = {
    0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
    0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744, 
    0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869, 
    0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769, 
    0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719, 
    0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530, 
    0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114
};

我想计算每个欧几里得距离。 c1 - d1， c1 - d2 ....在 CPU 上，我会做：

float dist = 0.0, dist_sqrt;
for(int i = 0; i < 2; i++)
    for(int j = 0; j < 7; j++)
    { 
        float dist_sum = 0.0;
        for(int k = 0; k < dim; k++)
        {
            dist = centroids[i * dim + k] - data[j * dim + k];
            dist_sum += dist * dist;
        }
        dist_sqrt = sqrt(dist_sum);
        // do something with the distance
        std::cout << dist_sqrt << std::endl;
    }

推力中是否有任何内置的矢量距离计算解决方案？

它可以在推力中完成。解释如何将相当复杂，并且代码相当密集。

首先要观察的关键是，核心操作可以通过转换后的还原来完成。推力变换操作用于执行向量（单个质心）的元素减法和每个结果的平方，并且归约将结果相加以产生欧几里得距离的平方。此操作的起点是 thrust::reduce_by_key ，但将数据正确呈现给reduce_by_key会相当复杂。

最终结果是通过从上面取每个结果的平方根产生的，我们可以使用普通thrust::transform。

以上是对完成所有工作的仅有的 2 行推力代码的摘要描述。但是，第一行具有相当大的复杂性。为了利用并行性，我采取的方法是按顺序虚拟"布置"必要的向量，以呈现给reduce_by_key。举一个简单的例子，假设我们有 2 个质心和 4 个个体，假设我们的维度是 2。

centroid 0: C00 C01
centroid 1: C10 C11
individ  0: I00 I01
individ  1: I10 I11
individ  2: I20 I21
individ  3: I30 I31

我们可以像这样"布置"向量：

 C00 C01 C00 C01 C00 C01 C00 C01 C10 C11 C10 C11 C10 C11 C10 C11
 I00 I01 I10 I11 I20 I21 I30 I31 I00 I01 I10 I11 I20 I21 I30 I31

为了便于reduce_by_key，我们还需要生成键值来描述向量：

   0   0   1   1   0   0   1   1   0   0   1   1   0   0   1   1

上述数据"布局"数据集可能相当大，我们不想产生存储和检索成本，因此我们将使用 Thrust 的花哨迭代器集合生成这些"动态"。这就是事情变得非常密集的地方。考虑到上述策略，我们将使用 thrust：：reduce_by_key 来完成这项工作。我们将创建一个提供给transform_iterator的自定义函子，以对I和C向量进行减法（和平方），为此目的，这些向量将被压缩在一起。向量的"布局"将使用具有附加自定义索引创建函子的排列迭代器动态创建，以帮助在每个I和C中复制模式。

因此，从"由内而外"工作，步骤顺序如下：

对于I（data）和C（centr），使用counting_iterator与transform_iterator内部的自定义索引函子相结合，以产生我们需要的索引序列。
使用在步骤 1 中创建的索引序列以及基本I和C向量，通过permutation_iterator（每个布局的向量一个）虚拟地"布局"向量。
将 2 个"布局"的虚拟I和C向量压缩在一起，以创建一个<float, float>元组向量（虚拟）。
取步骤 3 中的zip_iterator，并在transform_iterator中与自定义距离计算函子（ (I-C)^2 ）结合使用
使用另一个transform_iterator，将counting_iterator与自定义密钥生成函子相结合，以生成密钥序列（虚拟）
将步骤 4 和 5 中的迭代器传递给reduce_by_key作为要减少的输入（键、值）。 reduce_by_key的输出向量也是键和值。我们不需要密钥，因此我们将使用discard_iterator来转储这些密钥。我们将保存的值。

上述步骤都是在一行推力代码中完成的。

下面是说明上述内容的代码：

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/copy.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#define MAX_DATA 100000000
#define MAX_CENT 5000
#define TOL 0.001
unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
  timeval tv1;
  gettimeofday(&tv1,0);
  return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}
unsigned verify(float *d1, float *d2, int len){
  unsigned pass = 1;
  for (int i = 0; i < len; i++)
    if (fabsf(d1[i] - d2[i]) > TOL){
      std::cout << "mismatch at:  " << i << " val1: " << d1[i] << " val2: " << d2[i] << std::endl;
      pass = 0;
      break;}
  return pass;
}
void eucl_dist_cpu(const float *centroids, const float *data, float *rdist, int num_centroids, int dim, int num_data, int print){
  int out_idx = 0;
  float dist, dist_sqrt;
  for(int i = 0; i < num_centroids; i++)
    for(int j = 0; j < num_data; j++)
    {
        float dist_sum = 0.0;
        for(int k = 0; k < dim; k++)
        {
            dist = centroids[i * dim + k] - data[j * dim + k];
            dist_sum += dist * dist;
        }
        dist_sqrt = sqrt(dist_sum);
        // do something with the distance
        rdist[out_idx++] = dist_sqrt;
        if (print) std::cout << dist_sqrt << ", ";
    }
    if (print) std::cout << std::endl;
}

struct dkeygen : public thrust::unary_function<int, int>
{
  int dim;
  int numd;
  dkeygen(const int _dim, const int _numd) : dim(_dim), numd(_numd) {};
  __host__ __device__ int operator()(const int val) const {
    return (val/dim);
    }
};
typedef thrust::tuple<float, float> mytuple;
struct my_dist : public thrust::unary_function<mytuple, float>
{
  __host__ __device__ float operator()(const mytuple &my_tuple) const {
    float temp = thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple);
    return temp*temp;
  }
};

struct d_idx : public thrust::unary_function<int, int>
{
  int dim;
  int numd;
  d_idx(int _dim, int _numd) : dim(_dim), numd(_numd) {};
  __host__ __device__ int operator()(const int val) const {
    return (val % (dim*numd));
    }
};
struct c_idx : public thrust::unary_function<int, int>
{
  int dim;
  int numd;
  c_idx(int _dim, int _numd) : dim(_dim), numd(_numd) {};
  __host__ __device__ int operator()(const int val) const {
    return (val % dim) + (dim * (val/(dim*numd)));
    }
};
struct my_sqrt : public thrust::unary_function<float, float>
{
  __host__ __device__ float operator()(const float val) const {
    return sqrtf(val);
  }
};

unsigned long long eucl_dist_thrust(thrust::host_vector<float> &centroids, thrust::host_vector<float> &data, thrust::host_vector<float> &dist, int num_centroids, int dim, int num_data, int print){
  thrust::device_vector<float> d_data = data;
  thrust::device_vector<float> d_centr = centroids;
  thrust::device_vector<float> values_out(num_centroids*num_data);
  unsigned long long compute_time = dtime_usec(0);
  thrust::reduce_by_key(thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), dkeygen(dim, num_data)), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(dim*num_data*num_centroids), dkeygen(dim, num_data)),thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_centr.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), c_idx(dim, num_data))), thrust::make_permutation_iterator(d_data.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), d_idx(dim, num_data))))), my_dist()), thrust::make_discard_iterator(), values_out.begin());
  thrust::transform(values_out.begin(), values_out.end(), values_out.begin(), my_sqrt());
  cudaDeviceSynchronize();
  compute_time = dtime_usec(compute_time);
  if (print){
    thrust::copy(values_out.begin(), values_out.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << std::endl;
    }
  thrust::copy(values_out.begin(), values_out.end(), dist.begin());
  return compute_time;
}

int main(int argc, char *argv[]){
  int dim = 8;
  int num_centroids = 2;
  float centroids[] = {
    0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
    0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
  };
  int num_data = 8;
  float data[] = {
    0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
    0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
    0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
    0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
    0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
    0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
    0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114,
    0.721, 0.555, 0.979, 0.412, 0.007, 0.501, 0.844, 0.234
  };
  std::cout << "cpu results: " << std::endl;
  float dist[num_data*num_centroids];
  eucl_dist_cpu(centroids, data, dist, num_centroids, dim, num_data, 1);
  thrust::host_vector<float> h_data(data, data + (sizeof(data)/sizeof(float)));
  thrust::host_vector<float> h_centr(centroids, centroids + (sizeof(centroids)/sizeof(float)));
  thrust::host_vector<float> h_dist(num_centroids*num_data);
  std::cout << "gpu results: " << std::endl;
  eucl_dist_thrust(h_centr, h_data, h_dist, num_centroids, dim, num_data, 1);
  float *data2, *centroids2, *dist2;
  num_centroids = 10;
  num_data = 1000000;
  if (argc > 2) {
    num_centroids = atoi(argv[1]);
    num_data = atoi(argv[2]);
    if ((num_centroids < 1) || (num_centroids > MAX_CENT)) {std::cout << "Num centroids out of range" << std::endl; return 1;}
    if ((num_data < 1) || (num_data > MAX_DATA)) {std::cout << "Num data out of range" << std::endl; return 1;}
    if (num_data * dim * num_centroids > 2000000000) {std::cout << "data set out of range" << std::endl; return 1;}}
  std::cout << "Num Data: " << num_data << std::endl;
  std::cout << "Num Cent: " << num_centroids << std::endl;
  std::cout << "result size: " << ((num_data*num_centroids*4)/1048576) << " Mbytes" << std::endl;
  data2 = new float[dim*num_data];
  centroids2 = new float[dim*num_centroids];
  dist2 = new float[num_data*num_centroids];
  for (int i = 0; i < dim*num_data; i++) data2[i] = rand()/(float)RAND_MAX;
  for (int i = 0; i < dim*num_centroids; i++) centroids2[i] = rand()/(float)RAND_MAX;
  unsigned long long dtime = dtime_usec(0);
  eucl_dist_cpu(centroids2, data2, dist2, num_centroids, dim, num_data, 0);
  dtime = dtime_usec(dtime);
  std::cout << "cpu time: " << dtime/(float)USECPSEC << "s" << std::endl;
  thrust::host_vector<float> h_data2(data2, data2 + (dim*num_data));
  thrust::host_vector<float> h_centr2(centroids2, centroids2 + (dim*num_centroids));
  thrust::host_vector<float> h_dist2(num_data*num_centroids);
  dtime = dtime_usec(0);
  unsigned long long ctime = eucl_dist_thrust(h_centr2, h_data2, h_dist2, num_centroids, dim, num_data, 0);
  dtime = dtime_usec(dtime);
  std::cout << "gpu total time: " << dtime/(float)USECPSEC << "s, gpu compute time: " << ctime/(float)USECPSEC << "s" << std::endl;
  if (!verify(dist2, &(h_dist2[0]), num_data*num_centroids)) {std::cout << "Verification failure." << std::endl; return 1;}
  std::cout << "Success!" << std::endl;
  return 0;
}

笔记：

该代码设置为执行 2 次传递，一次简短的传递使用与您的数据集类似的数据集，并带有打印输出以进行目视检查。然后，可以通过命令行大小调整参数（质心数，然后是个体数）输入更大的数据集，以进行基准比较和结果验证。
与我在评论中所说的相反，推力代码的运行速度仅比朴素的单线程 CPU 代码快约 25%。您的里程可能会有所不同。
这只是考虑处理它的一种方式。我有其他想法，但没有足够的时间来充实它们。
数据集可能会变得相当大。现在的代码旨在仅限于dimension*number_of_centroids*number_of_individuals乘积小于 20 亿的数据集。但是，当您接近这个数字时，您将需要一个 GPU 和 CPU 都有几 GB 的内存。我简要地探讨了更大的数据集大小。在不同的地方需要进行一些代码更改才能从例如 int unsigned long long等。但是我没有提供，因为我仍在调查该代码的问题。
对于另一个与推力无关的关于在 GPU 上计算欧几里得距离的视图，您可能对这个问题感兴趣。如果您遵循那里进行的优化顺序，它可能会阐明如何改进此推力代码，或者如何使用另一个非推力实现。
抱歉，我无法挤出更多性能。