CUDA：如何将 char** 从内核复制到主机

Cuda how to copy char** from kernel to host

本文关键字：内核、主机、复制、字符　　更新时间：2023-10-16

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <fstream>
#include <algorithm>
#include <time.h>
using namespace std;
/**
 * Copies one string pointer per thread from `desc` into `merge_char`.
 *
 * Only the pointer values are copied; the characters they point to are not
 * touched. Threads whose flat x-index is 10000 or larger do nothing.
 *
 * @param desc        array of char* that the kernel reads from
 * @param merge_char  array of char* that the kernel writes to
 */
__global__ void kern_2D(char **desc, char** merge_char) {
    const int slot = threadIdx.x + blockDim.x*blockIdx.x;
    // The y-index is computed but never used by this kernel.
    int idy = threadIdx.y + blockDim.y*blockIdx.y;

    if (slot >= 10000)
        return;

    merge_char[slot] = desc[slot];
    // Debug aid (disabled): printf("From key = %s\n", merge_char[slot]);
}

/*
 * Program from the question: reads up to 10000 lines from "desc.txt", copies
 * each line into its own device buffer, stores the device pointers in d_desc,
 * runs kern_2D to copy those pointers into the managed array merge_char, and
 * then tries to print the strings on the host.
 *
 * The escape sequences ('\n', '\0', "\n") that were lost from the listing are
 * restored, the program stops if the input file cannot be opened, and the
 * file is closed exactly once.
 *
 * NOTE(review): the copy-back section at the end is intentionally left as
 * posted, because it is the defect the question is about; see the NOTE there.
 *
 * Returns 0 on completion, 1 if "desc.txt" cannot be opened.
 */
int main() {
    cudaError_t err = cudaSuccess;
    size_t max_line_len = 255;
    char line[255];
    size_t line_len;
    size_t max_lines_desc = 10000;
    //---------------------------------------------------------------------------------//
    // d_desc: device array with one device string pointer per line.
    char **d_desc;
    cudaMalloc(&d_desc, max_lines_desc * sizeof(char *));
    // m_desc: host array; every used entry ends up pointing at `line`.
    char **m_desc = NULL;
    m_desc = (char**)malloc(max_lines_desc * sizeof(char*));
    // d_temp_desc: host array that remembers each per-line device pointer.
    char **d_temp_desc = NULL;
    d_temp_desc = (char **)malloc(max_lines_desc * sizeof(char *));
    FILE *f_desc = fopen("desc.txt", "r");
    if (!f_desc)
    {
        fprintf(stderr, "Error opening file!\n");
        return 1;  // fgets on a NULL stream would be undefined behaviour
    }
    int idesc = 0;
    do
    {
        if (!fgets(line, max_line_len, f_desc))
        {
            if (ferror(f_desc) && !feof(f_desc))
            {
                // The stream is closed once, after the loop.
                fprintf(stderr, "Error reading from file!\n");
            }
            break;
        }
        // Strip the trailing newline, if any.
        line_len = strlen(line);
        if ((line_len > 0) && (line[line_len - 1] == '\n'))
        {
            line[line_len - 1] = '\0';
            --line_len;
        }
        m_desc[idesc] = line;
        // Deep copy host -> device: the line's bytes first, then its device
        // pointer into slot idesc of d_desc.
        cudaMalloc(&(d_temp_desc[idesc]), sizeof(line) * sizeof(char));
        cudaMemcpy(d_temp_desc[idesc], m_desc[idesc], sizeof(line) * sizeof(char), cudaMemcpyHostToDevice);
        cudaMemcpy(d_desc + idesc, &(d_temp_desc[idesc]), sizeof(char *), cudaMemcpyHostToDevice);
        ++idesc;
    } while (idesc < max_lines_desc);
    fclose(f_desc);
    //---------------------------------------------------------------------------------//

    char **merge_char;
    cudaMallocManaged(&merge_char, max_lines_desc * sizeof(char *));

    // NOTE(review): one block of 1000 threads only fills merge_char[0..999].
    kern_2D << < 1, 1000 >> > (d_desc , merge_char);
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %s after launching addKernel!\n", cudaGetErrorString(err));
    }

    //---------------------------------------------------------------------------------//
    // NOTE(review): this is the problem being asked about. h_dev is allocated
    // with cudaMalloc, so it is device memory, yet it is used as the
    // destination of a device-to-host copy and then dereferenced on the host
    // in the printf loop below. The entries of merge_char are device pointers
    // as well, so each string needs its own device-to-host copy before it can
    // be printed.
    char** h_dev;
    cudaMalloc((void**)(&h_dev), max_lines_desc * sizeof(char*));
    err = cudaMemcpy(h_dev, merge_char, max_lines_desc * sizeof(char*), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) printf("2: Okay \n");

    for (int i = 0; i < max_lines_desc; i++)
    {
        printf("%s\n", h_dev[i]);
    }

    return 0;

}
//nvcc -arch=sm_30 -o kernel kernel.cu
//cuda-memcheck ./kernel

很抱歉之前有误。我已经更新了代码，现在是完整的。

desc.txt 文件共有 10000 行，内容如下所示。从设备复制到主机之后，我检查了返回状态，但结果还是不对：我无法打印 char** h_dev 的内容。

motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125

我不得不说，我并不太明白你这样做的意图，因为你的内核所做的唯一一件事就是复制指针。如果这就是你想做的全部，那么到处使用双重指针无疑是在给自己增加难度，只管理索引要简单得多。

但是就您的问题而言，据我所知，您的“复制回主机”确实不正确。您从主机到设备所做的实际上是深拷贝，因此在相反方向上同样需要进行深拷贝（两阶段拷贝）。

为此，对于要复制回主机的目标缓冲区，我们不应使用 cudaMalloc。cudaMalloc 分配的是设备内存；如果要将某些内容复制到主机，复制的目标必须是主机内存。因此，我们需要一组以主机缓冲区为目标的 cudaMemcpy 操作，把数据深拷贝回主机。

下面的代码是在您所给代码的基础上，为实现这一点所做的最简单修改；在我的简单测试用例中它似乎可以正常工作：

$ cat desc.txt
1motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
2motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
3motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
4motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
5motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
6motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap1
$ cat t301.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <fstream>
#include <algorithm>
#include <time.h>
using namespace std;
/**
 * Copies one string pointer per thread from `desc` into `merge_char`.
 *
 * Only the pointer values are copied; the characters they point to are not
 * touched. Threads whose flat x-index is 10000 or larger do nothing.
 *
 * @param desc        array of char* that the kernel reads from
 * @param merge_char  array of char* that the kernel writes to
 */
__global__ void kern_2D(char **desc, char** merge_char) {
    const int slot = threadIdx.x + blockDim.x*blockIdx.x;
    // The y-index is computed but never used; nvcc reports it as unreferenced.
    int idy = threadIdx.y + blockDim.y*blockIdx.y;

    if (slot >= 10000)
        return;

    merge_char[slot] = desc[slot];
    // Debug aid (disabled): printf("From key = %s\n", merge_char[slot]);
}

// Reports a failed CUDA call on stderr; returns true when `status` is an error.
static bool cuda_failed(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

/*
 * Reads up to 10000 lines from "desc.txt", deep-copies them to the device,
 * runs kern_2D to copy the per-line device pointers into a managed array, and
 * then deep-copies every line back to the host and prints it.
 *
 * The copy back is done in two stages: first the table of pointers is copied
 * into host memory, then each string is fetched with its own cudaMemcpy,
 * because the table entries are device pointers that the host must not
 * dereference.
 *
 * Returns 0 on success, and 1 if the file cannot be opened, host memory
 * cannot be allocated, or any CUDA call fails.
 */
int main() {
    const size_t max_lines_desc = 10000;  // same bound as the one hard-coded in kern_2D
    const unsigned int threads = 256;
    // Enough blocks to cover every slot (ceiling division).
    const unsigned int blocks = (unsigned int)((max_lines_desc + threads - 1) / threads);
    char line[255];
    size_t idesc = 0;            // number of lines read and allocated on the device
    int exit_code = 1;
    char **d_desc = NULL;        // device array: one device string pointer per line
    char **d_temp_desc = NULL;   // host array holding the same device pointers
    char **merge_char = NULL;    // managed array written by kern_2D
    char **h_dev = NULL;         // host copy of merge_char (entries are device pointers)

    FILE *f_desc = fopen("desc.txt", "r");
    if (!f_desc) {
        fprintf(stderr, "Error opening file!\n");
        return 1;  // fgets on a NULL stream would be undefined behaviour
    }

    d_temp_desc = (char **)calloc(max_lines_desc, sizeof(char *));
    h_dev = (char **)calloc(max_lines_desc, sizeof(char *));
    if (!d_temp_desc || !h_dev) {
        fprintf(stderr, "Out of host memory!\n");
        goto cleanup;
    }

    if (cuda_failed(cudaMalloc(&d_desc, max_lines_desc * sizeof(char *)), "cudaMalloc(d_desc)"))
        goto cleanup;
    // Zero the table so that slots past the last line hold NULL, not garbage.
    if (cuda_failed(cudaMemset(d_desc, 0, max_lines_desc * sizeof(char *)), "cudaMemset(d_desc)"))
        goto cleanup;
    if (cuda_failed(cudaMallocManaged(&merge_char, max_lines_desc * sizeof(char *)), "cudaMallocManaged(merge_char)"))
        goto cleanup;

    //---------------------------------------------------------------------------------//
    // Host -> device deep copy, stage 1: the bytes of every line.
    while (idesc < max_lines_desc && fgets(line, (int)sizeof(line), f_desc)) {
        size_t line_len = strlen(line);
        if ((line_len > 0) && (line[line_len - 1] == '\n'))
            line[line_len - 1] = '\0';

        if (cuda_failed(cudaMalloc(&d_temp_desc[idesc], sizeof(line)), "cudaMalloc(line)"))
            goto cleanup;
        ++idesc;  // counted as soon as it is allocated so that cleanup frees it
        if (cuda_failed(cudaMemcpy(d_temp_desc[idesc - 1], line, sizeof(line), cudaMemcpyHostToDevice), "cudaMemcpy(line, H2D)"))
            goto cleanup;
    }
    if (ferror(f_desc))
        fprintf(stderr, "Error reading from file!\n");  // keep the lines read so far
    fclose(f_desc);
    f_desc = NULL;

    // Host -> device deep copy, stage 2: the whole pointer table in one transfer.
    if (cuda_failed(cudaMemcpy(d_desc, d_temp_desc, idesc * sizeof(char *), cudaMemcpyHostToDevice), "cudaMemcpy(d_desc, H2D)"))
        goto cleanup;

    //---------------------------------------------------------------------------------//
    kern_2D<<<blocks, threads>>>(d_desc, merge_char);
    if (cuda_failed(cudaGetLastError(), "kern_2D launch"))
        goto cleanup;
    if (cuda_failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        goto cleanup;

    //---------------------------------------------------------------------------------//
    // Device -> host deep copy, stage 1: the pointer table, into host memory.
    if (cuda_failed(cudaMemcpy(h_dev, merge_char, max_lines_desc * sizeof(char *), cudaMemcpyDeviceToHost), "cudaMemcpy(h_dev, D2H)"))
        goto cleanup;
    printf("2: Okay \n");

    // Device -> host deep copy, stage 2: one string at a time, for the lines actually read.
    for (size_t i = 0; i < idesc; i++) {
        if (cuda_failed(cudaMemcpy(line, h_dev[i], sizeof(line), cudaMemcpyDeviceToHost), "cudaMemcpy(line, D2H)"))
            goto cleanup;
        printf("%s\n", line);
    }
    exit_code = 0;

cleanup:
    if (f_desc)
        fclose(f_desc);
    // idesc is only non-zero once d_temp_desc has been allocated.
    for (size_t i = 0; i < idesc; i++)
        cudaFree(d_temp_desc[i]);
    cudaFree(merge_char);
    cudaFree(d_desc);
    free(h_dev);
    free(d_temp_desc);
    return exit_code;
}
$ nvcc -o t301 t301.cu
t301.cu(15): warning: variable "idy" was declared but never referenced
$ cuda-memcheck ./t301
========= CUDA-MEMCHECK
2: Okay
1motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
2motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
3motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
4motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
5motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap125
6motorcycle ckd new apsonic ckd 2017 ckd 2018 motorcycle apsoni new motorcycle apsonic no 125 motorcycle apsonic ap125 new motorcycle apsonic ap1
========= ERROR SUMMARY: 0 errors
$