使用CUDA的最大绝对差值

max Absolute difference using CUDA

本文关键字：CUDA 使用更新时间：2023-10-16

我们有以下串行C代码，它对两个向量a[]和b[]进行操作：

// Serial reference loop: for i = 1..10000, blend a[i] toward b[i] with
// weight r, fold |a[i] - b[i]| (using the updated a[i]) into the running
// maximum `errors`, then copy the blended value back into b[i].
// `errors` is declared outside this snippet.
double a[20000],b[20000],r=0.9;
for(int i=1;i<=10000;++i)
{
    a[i]=r*a[i]+(1-r)*b[i];
    errors=max(errors,fabs(a[i]-b[i]));
    b[i]=a[i];
}

请问如何将这段代码移植到CUDA和cuBLAS？

也可以使用Thrust库中的thrust::transform_reduce来实现这一归约。正如talonmies所建议的，下面的解决方案把整个操作融合为一次调用：

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
// Unary functor for thrust::transform_reduce over a zipped (a, b) range.
// It unpacks the tuple, blends a toward b as r * a + (1 - r) * b, and
// returns the absolute difference between the blended value and b.
struct weighted_absolute_difference
{
  double r;  // weight applied to a; (1 - r) is applied to b

  weighted_absolute_difference(const double r)
    : r(r)
  {}

  // Declared const because the call never modifies the functor, so it can
  // also be invoked through a const copy or reference.
  __host__ __device__
  double operator()(thrust::tuple<double,double> t) const
  {
    const double a = thrust::get<0>(t);
    const double b = thrust::get<1>(t);
    const double blended = r * a + (1.0 - r) * b;
    return fabs(blended - b);
  }
};
int main()
{
  using namespace thrust;
  const std::size_t n = 20000;
  const double r = 0.9;
  device_vector<double> a(n), b(n);
  // initialize a & b
  ...
  // do the reduction
  double result =
    transform_reduce(make_zip_iterator(make_tuple(a.begin(), b.begin())),
                     make_zip_iterator(make_tuple(a.end(),   b.end())),
                     weighted_absolute_difference(r),
                     -1.f,
                     maximum<double>());
  // note that this solution does not set
  // a[i] = r * a[i] + (1 - r) * b[i]
  return 0;
}

请注意，这个解决方案并不执行赋值a[i] = r * a[i] + (1 - r) * b[i]，不过在归约之后再用thrust::transform完成该赋值很简单。在任何一个函子内部修改transform_reduce的输入参数都是不安全的。

循环中的第二行：

errors=max(errors,fabs(a[i]-b[i]));

被称为归约（reduction）。幸运的是，CUDA SDK中提供了reduction示例代码——可以参考它，并将其用作你的算法的模板。

您可能希望将其拆分为两个独立的操作（可能作为两个独立的内核）——一个用于并行部分（计算bp[]值），另一个用于归约（计算errors）。