
Unexplained errors in namespace thrust::system::cuda::thrust specifically in "system_error" and "cuda_category"


I am trying to use thrust::raw_pointer_cast to cast a raw pointer so that I can capture the output of a functor. I have tried passing a pointer to a float in several ways, but I keep getting a memory access violation and two IntelliSense errors: thrust::system::cuda::thrust has no member "system_error" and has no member "cuda_category". Strangely, the errors appear to come from throw_on_error.hpp, which seems to be part of the BULK library, even though I never reference BULK directly. I am new to C++, so perhaps I am misunderstanding pointers, or I am missing some kind of include.

Below is the version of the code I have been working on. Any help would be greatly appreciated.

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>

using namespace std;
const int num_segs = 1;  // number of segments to sort
const int num_vals = 5;  // number of values in each segment

template <typename T> 
struct sort_vector
{
    T *Ndata;
    T *Ddata;
    T *answer;
    sort_vector(T *_Ndata, T *_Ddata, float *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {};

    __host__ __device__ void operator()(int idx)
    {
        thrust::sort(thrust::seq, Ndata + idx*num_vals, Ndata + ((idx + 1)*num_vals));
        thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
        *answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
    }
};
int main() {
    thrust::device_vector<float> d_Ndata(num_segs*num_vals);
    d_Ndata[0] = 30;
    d_Ndata[1] = 5.5;
    d_Ndata[2] = 60;
    d_Ndata[3] = 21;
    d_Ndata[4] = 2;
    thrust::device_vector<float> d_Ddata(num_segs*num_vals);
    d_Ddata[0] = 50;
    d_Ddata[1] = 9.5;
    d_Ddata[2] = 30;
    d_Ddata[3] = 8.1;
    d_Ddata[4] = 1;
    cout << "original norm" << endl;
    int f = 0;
    while (f < num_segs*num_vals){
        cout << d_Ndata[f] << endl;
        f++;
    }
    cout << "original dut" << endl;
    int g = 0;
    while (g < num_segs*num_vals){
        cout << d_Ddata[g] << endl;
        g++;
    }
    thrust::device_vector<int> d_idxs(num_segs);
    thrust::sequence(d_idxs.begin(), d_idxs.end());
    float *answer = (float*)malloc(sizeof(float));
    cudaStream_t s1;
    cudaStreamCreate(&s1);

    clock_t start;
    double duration;
    start = clock();
    thrust::for_each(thrust::cuda::par.on(s1),
        d_idxs.begin(),
    d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()), thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(answer)));
    cudaStreamSynchronize(s1);
    cout << "sum" << endl;
    cout << answer << endl;
    //free(answer);
    cudaStreamDestroy(s1);

    duration = (clock() - start) / (double)CLOCKS_PER_SEC;
    cout << "time " << duration << endl;
    cin.get();
    return 0;
}

The main problem is here:

float *answer = (float*)malloc(sizeof(float));

This creates an allocation in host memory. When you then pass the pointer to the functor:

 thrust::raw_pointer_cast(answer)

you are passing a pointer to host memory into a functor that will run in device code. If the functor attempts to access that location, it is an illegal access. In CUDA, device code is not allowed to access a host pointer location directly, and vice versa (setting aside various concepts, such as unified memory, that are not in play here).
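
As a minimal illustration of that rule (not part of the question's code; d_answer and h_answer are placeholder names, and the plain CUDA runtime API is assumed), a result slot that device code writes must be allocated in device memory and then copied back to the host:

// Minimal sketch using the CUDA runtime API; d_answer/h_answer are illustrative names.
float *d_answer = nullptr;
cudaMalloc((void **)&d_answer, sizeof(float));   // device allocation: legal for device code to write
// ... run device code that writes *d_answer ...
float h_answer = 0.0f;
cudaMemcpy(&h_answer, d_answer, sizeof(float), cudaMemcpyDeviceToHost);  // copy the result back to the host
cudaFree(d_answer);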

So when your functor code does this:

*answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));

it will trigger an illegal access when it attempts to write to answer.

A simple solution is to make answer point to a properly allocated location in device memory. The following code demonstrates that change, and runs without error for me:

$ cat t1190.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <memory.h>
#include <cstdio>
#include <thread>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>

using namespace std;
const int num_segs = 1;  // number of segments to sort
const int num_vals = 5;  // number of values in each segment

template <typename T>
struct sort_vector
{
    T *Ndata;
    T *Ddata;
    T *answer;
    sort_vector(T *_Ndata, T *_Ddata, float *a) : Ndata(_Ndata), Ddata(_Ddata), answer(a) {};

    __host__ __device__ void operator()(int idx)
    {
        thrust::sort(thrust::seq, Ndata + idx*num_vals, Ndata + ((idx + 1)*num_vals));
        thrust::sort(thrust::seq, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
        *answer = thrust::reduce(thrust::device, Ddata + idx*num_vals, Ddata + ((idx + 1)*num_vals));
    }
};
int main() {
    thrust::device_vector<float> d_Ndata(num_segs*num_vals);
    d_Ndata[0] = 30;
    d_Ndata[1] = 5.5;
    d_Ndata[2] = 60;
    d_Ndata[3] = 21;
    d_Ndata[4] = 2;
    thrust::device_vector<float> d_Ddata(num_segs*num_vals);
    d_Ddata[0] = 50;
    d_Ddata[1] = 9.5;
    d_Ddata[2] = 30;
    d_Ddata[3] = 8.1;
    d_Ddata[4] = 1;
    cout << "original norm" << endl;
    int f = 0;
    while (f < num_segs*num_vals){
        cout << d_Ndata[f] << endl;
        f++;
    }
    cout << "original dut" << endl;
    int g = 0;
    while (g < num_segs*num_vals){
        cout << d_Ddata[g] << endl;
        g++;
    }
    thrust::device_vector<int> d_idxs(num_segs);
    thrust::sequence(d_idxs.begin(), d_idxs.end());
    thrust::device_vector<float> dv_answer(1);
    //float *answer = (float*)malloc(sizeof(float));
    cudaStream_t s1;
    cudaStreamCreate(&s1);

    clock_t start;
    double duration;
    start = clock();
    thrust::for_each(thrust::cuda::par.on(s1),
        d_idxs.begin(),
    d_idxs.end(), sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()), thrust::raw_pointer_cast(d_Ddata.data()), thrust::raw_pointer_cast(dv_answer.data())));
    cudaStreamSynchronize(s1);
    cout << "sum" << endl;
    cout << dv_answer[0] << endl;
    //free(answer);
    cudaStreamDestroy(s1);

    duration = (clock() - start) / (double)CLOCKS_PER_SEC;
    cout << "time " << duration << endl;
    return 0;
}
$ nvcc -std=c++11  t1190.cu -o t1190
$ ./t1190
original norm
30
5.5
60
21
2
original dut
50
9.5
30
8.1
1
sum
98.6
time 0.000919
$

I won't try to explain the IntelliSense errors. IntelliSense generally does not handle CUDA well, and as you can see, IntelliSense may flag things that actually compile just fine (such as the code in this question). If the CUDA code compiles correctly, there is a good chance that the issues IntelliSense reports can be safely ignored.

A couple of additional comments:

  1. For a thrust beginner, you seem to be going down a strange path by running thrust algorithms from within a functor. There is nothing technically wrong with what you are doing, but this type of code is usually reserved for specific situations rather than general thrust usage. Since your num_segs is 1 in this example, you will be running a single CUDA thread to do all of this work, which certainly will not be performant. If you intend to scale this up later, fine. I have made similar comments elsewhere, so I won't belabor the point here.

  2. This functor writes its result to a single location (*answer). If you scale this up to multiple threads, you will have to provide the functor with multiple write locations (one per thread, i.e. one per element of the vector passed to for_each), or the threads will overwrite each other's results; a minimal sketch of that change follows this list.
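
For illustration only, here is a hedged sketch of that change, adapted from the answer's code rather than taken from it: dv_answer is sized to num_segs so that each thread has its own slot, and the functor writes to answer[idx] instead of *answer.

// Sketch under the assumption that num_segs is raised above 1; adapted from the code
// above, so it reuses d_Ndata, d_Ddata, d_idxs and the stream s1 defined there.
thrust::device_vector<float> dv_answer(num_segs);   // one result slot per segment/thread
thrust::for_each(thrust::cuda::par.on(s1),
    d_idxs.begin(), d_idxs.end(),
    sort_vector<float>(thrust::raw_pointer_cast(d_Ndata.data()),
                       thrust::raw_pointer_cast(d_Ddata.data()),
                       thrust::raw_pointer_cast(dv_answer.data())));
// and in operator(), write to the per-thread slot instead of a single location:
//     answer[idx] = thrust::reduce(thrust::seq, Ddata + idx*num_vals, Ddata + (idx + 1)*num_vals);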