CUDA - cudaMallocPitch和cudaMemcpy2D使用,错误:InvalidValue, Inval
CUDA - cudaMallocPitch and cudaMemcpy2D use, Error: InvalidValue, InvalidPitchValue
好的,所以我试图得到一个二维数组cuda的工作,但它成为一个痛苦。错误在标题中,发生在cudaMemcpy2D。我认为这个问题对于训练有素的人来说是显而易见的。提前感谢您的任何帮助,我已经走在了我的班级前面,这是目前的学习指针。
#include <cuda_runtime.h>
#include <iostream>
#pragma comment (lib, "cudart")
/* Program purpose: pass a 10 x 10 matrix and multiply it by another 10x10 matrix */
float matrix1_host[100][100];
float matrix2_host[100][100];
float* matrix1_device;
float* matrix2_device;
size_t pitch;
cudaError_t err;
__global__ void addMatrix(float* matrix1_device,float* matrix2_device, size_t pitch){
// How this works
// first we start to cycle through the rows by using the thread's ID
// then we calculate an address from the address of a point in the row, by adding the pitch (size of each row) and * it by
// the amount of rows we've already completed, then we can use that address of somewhere at a start of a row to get the colums
// in the row with a normal array grab.
int r = threadIdx.x;
float* rowofMat1 = (float*)((char*)matrix1_device + r * pitch);
float* rowofMat2 = (float*)((char*)matrix2_device + r * pitch);
for (int c = 0; c < 100; ++c) {
rowofMat1[c] += rowofMat2[c];
}
}
void initCuda(){
err = cudaMallocPitch((void**)matrix1_device, &pitch, 100 * sizeof(float), 100);
err = cudaMallocPitch((void**)matrix2_device, &pitch, 100 * sizeof(float), 100);
//err = cudaMemcpy(matrix1_device, matrix1_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
//err = cudaMemcpy(matrix2_device, matrix2_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
err = cudaMemcpy2D(matrix2_device, 100*sizeof(float), matrix2_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
}
void populateArrays(){
for(int x = 0; x < 100; x++){
for(int y = 0; y < 100; y++){
matrix1_host[x][y] = (float) x + y;
matrix2_host[y][x] = (float) x + y;
}
}
}
void runCuda(){
dim3 dimBlock ( 100 );
dim3 dimGrid ( 1 );
addMatrix<<<dimGrid, dimBlock>>>(matrix1_device, matrix2_device, 100*sizeof(float));
//err = cudaMemcpy(matrix1_host, matrix1_device, 100*100*sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy2D(matrix1_host, 100*sizeof(float), matrix1_device, pitch, 100*sizeof(float),100, cudaMemcpyDeviceToHost);
//cudaMemcpy(matrix1_host, matrix1_device, 100*100*sizeof(float), cudaMemcpyDeviceToHost);
}
void cleanCuda(){
err = cudaFree(matrix1_device);
err = cudaFree(matrix2_device);
err = cudaDeviceReset();
}
int main(){
populateArrays();
initCuda();
runCuda();
cleanCuda();
std::cout << cudaGetErrorString(cudaGetLastError());
system("pause");
return 0;
}
首先,通常您应该为matrix1和matrix2设置单独的间距变量。在这种情况下,它们将是从API调用cudaMallocPitch
返回的相同值,但在一般情况下,它们可能不是。
在cudaMemcpy2D
行中,呼叫的第二个参数是目标音高。这只是当你做cudaMallocPitch
调用这个特定的目标矩阵(即。(第一个参数)
第四个参数是源音高。由于这是用普通的主机分配来分配的,所以除了以字节为单位的宽度之外,它没有间距。
所以你交换了第二个和第四个参数。
所以不用这个:
err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
试试这个:
err = cudaMemcpy2D(matrix1_device, pitch, matrix1_host, 100*sizeof(float), 100*sizeof(float), 100, cudaMemcpyHostToDevice);
,对cudaMemcpy2D
的第二次调用也是如此。第三次调用实际上是OK的,因为它在相反的方向上,源和目标矩阵被交换,所以它们与你的音调参数正确对齐。
相关文章:
- 警告处理为错误这里有什么问题
- "error: no matching function for call to"构造函数错误
- boost::进程间消息队列引发错误
- C++,OpenCV,尝试显示图像时"OpenCV(4.3.0) Error: Assertion failed (size.width>0 && size.height>0)"此错误
- 有关插入适配器的错误。[错误]请求从 'back_insert_iterator<vector<>>' 类型转换为非标量类型
- QT在错误的班级中寻找空位
- vector.resize()中的分配错误
- 代码在main()中运行,但在函数中出现错误
- 释放错误后堆使用
- (C++)分析树以计算返回错误值的简单算术表达式
- Project Euler问题4的错误解决方案
- 我的字符计数代码计算错误.为什么
- 从"int*"强制转换为"unsigned int"会丢失精度错误
- 尝试导入pybind-opencv模块时出现libgtk错误
- CMake项目Boost库错误:Boost/config/compiler/gcc.hpp:165:10:致命错误:cs
- 在某些循环内使用vector.push_back时出现分段错误
- MSVC多行宏编译器错误
- 静态数据成员的问题-修复链接错误会导致编译器错误
- 为什么在运行时没有向我们提供有关分段错误的更多信息?
- CUDA - cudaMallocPitch和cudaMemcpy2D使用,错误:InvalidValue, Inval