在transform_reduce中抛出异常bulk_kernel_by_value

thrust exception bulk_kernel_by_value in transform_reduce

本文关键字：kernel by bulk value transform reduce 抛出异常更新时间：2023-10-16

我正在研究一个优化问题，其中包含各种类似于类似形式的数学函数，所以我在FunctionObj

中扭曲它们

template <typename T>
struct FunctionObj
{
    T a;
     FunctionObj(): a(1)
    {
    }
};

并定义一个FuncEval来求的值

template <typename T>
__host__ __device__  inline T FuncEval(const FunctionObj<T> &f_obj, T x)
{
    return f_obj.a+x;
}

我真正想做的是sum {func(x)}，所以我定义了一个FuncEvalF函子来利用thrust::tranform_reduce

template <typename T>
struct FuncEvalF
{
    const FunctionObj<T>& f_obj;
    __host__ __device__ inline FuncEvalF(const FunctionObj<T>& in_f_obj) :f_obj(in_f_obj)
{
}
    __host__ __device__ inline T operator()(T x)
    {
        return FuncEval(f_obj, x);
    }
};
template <typename T>
__host__ __device__ inline T BatchFuncEval(const FunctionObj<T>  &f_obj, int size, const T *x_in);
template<>
inline float BatchFuncEval< float>(const FunctionObj<float>  &f_obj, int size, const float *x_in)
{
    return thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(x_in), thrust::device_pointer_cast(x_in + size), FuncEvalF<float>(f_obj), static_cast<float>(0), thrust::plus<float>());
}

最后在main.cu中我调用transform_reduce

auto func = FuncEvalF<float>(FunctionObj<float>());
    float result = 0;
    try
    {
        result = thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(dev_a), thrust::device_pointer_cast(dev_a + 10000), func, static_cast<float>(0), thrust::plus<float>());
    }
    catch (std::exception e)
    {
        printf("%s in thurst n ", e.what());
    }

这里出现了异常:bulk_kernel_by_value，即使我将10000改为10。当我将FuncEval的定义更改为

时，事情会变得更好

return x;

程序将输出正确但无意义的答案。我忍不住问我的代码出了什么问题?谢谢您的关注。完整的代码如下，cuda 7.0 sm_20

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <stdio.h>
template <typename T>
struct FunctionObj
{
    T a;
     FunctionObj(): a(1)
    {
    }
};
template <typename T>
__host__ __device__  inline T FuncEval(const FunctionObj<T> &f_obj, T x)
{
    return f_obj.a+x;
}

template <typename T>
struct FuncEvalF
{
    const FunctionObj<T>& f_obj;
    __host__ __device__ inline FuncEvalF(const FunctionObj<T>& in_f_obj) :f_obj(in_f_obj)
    {
    }
    __host__ __device__ inline T operator()(T x)
    {
        return FuncEval(f_obj, x);
    }
};
template <typename T>
__host__ __device__ inline T BatchFuncEval(const FunctionObj<T>  &f_obj, int size, const T *x_in);
template<>
inline float BatchFuncEval< float>(const FunctionObj<float>  &f_obj, int size, const float *x_in)
{
    return thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(x_in), thrust::device_pointer_cast(x_in + size), FuncEvalF<float>(f_obj), static_cast<float>(0), thrust::plus<float>());
}
int main()
{
    cudaError_t cudaE;
    float a[10000] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    float* dev_a;
    cudaE = cudaMalloc((void**)(&dev_a), sizeof(float) * 10000);
    cudaE = cudaMemcpy(dev_a, a, sizeof(float) * 10000, cudaMemcpyHostToDevice);
    auto func = FuncEvalF<float>(FunctionObj<float>());
    float result = 0;
    try
    {
        result = thrust::transform_reduce(thrust::device, thrust::device_pointer_cast(dev_a), thrust::device_pointer_cast(dev_a + 10000), func, static_cast<float>(0), thrust::plus<float>());
    }
    catch (std::exception e)
    {
        printf("%s in thurst n ", e.what());
    }
    printf("the gpu float result is %fn", result);
    cudaFree(dev_a);
}

问题是struct FuncEvalF中的f_obj是const FunctionObj<T>&。

它在主机FunctionObj<float>()上作为临时实例化，但是对它的引用以后将不再有效。

解决这个问题的一种方法是创建它的副本，而不是保存它的引用:

template <typename T>
struct FuncEvalF
{
    FunctionObj<T> f_obj;
    ....
}