自定义内核GpuMat与float

Custom Kernel GpuMat with float

本文关键字：float GpuMat 内核自定义更新时间：2023-10-16

我试图使用GpuMat数据编写自定义内核来查找图像像素的弧余弦。当GPU具有CV_8UC1数据时，我可以上传，下载和更改值，但字符不能用于计算弧余弦。然而，当我试图将我的GPU转换为CV_32FC1类型(浮动)时，我在下载部分获得非法内存访问错误。下面是我的代码:

//.cu code 
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <stdio.h>
__global__ void funcKernel(const float* srcptr, float* dstptr, size_t srcstep, const     size_t dststep, int cols, int rows){
    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
    if(rowInd >= rows || colInd >= cols)
            return;
    const float* rowsrcptr=srcptr+rowInd*srcstep;
    float* rowdstPtr=  dstptr+rowInd*dststep;
    float val = rowsrcptr[colInd];
    if((int) val % 90 == 0)
            rowdstPtr[colInd] = -1 ;
    else{
            float acos_val = acos(val);
            rowdstPtr[colInd] = acos_val;
    }
}
int divUp(int a, int b){
    return (a+b-1)/b;
}
extern "C"
{
void func(const float* srcptr, float* dstptr, size_t srcstep, const size_t dststep, int cols, int rows){
    dim3 blDim(32,8);
    dim3 grDim(divUp(cols, blDim.x), divUp(rows,blDim.y));
    std::cout << "calling kernel from funcn";
    funcKernel<<<grDim,blDim>>>(srcptr,dstptr,srcstep,dststep,cols,rows);
    std::cout << "done with kernel calln";
     cudaDeviceSynchronize();
}
//.cpp code
void callKernel(const GpuMat &src, GpuMat &dst){
    float* p = (float*)src.data;
    float* p2 =(float*) dst.data;
    func(p,p2,src.step,dst.step,src.cols,src.rows);
}
int main(){
    Mat input = imread("cat.jpg",0);
    Mat float_input;
    input.convertTo(float_input,CV_32FC1);
    GpuMat d_frame,d_output;
    Size size = float_input.size();
    d_frame.upload(float_input);
    d_output.create(size,CV_32FC1);
    callKernel(d_frame,d_output);
    Mat output(d_output);
    return 0;
}

当我运行程序时，我的编译器告诉我:

OpenCV错误:Gpu API调用(遇到非法内存访问)在copy, file中/home/mobile/opencv-2.4.9/模块/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp,的实例抛出后，第882行终止调用简历:异常的():/home/mobile/opencv-2.4.9/模块/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp: 882:错误:(-217)在函数中遇到非法内存访问复制

您可以使用cv::cuda::PtrStp<>或cv::cuda::PtrStpSz<>来编写您自己的内核(因此您不必使用GpuMat的step-Parameter，它可以稍微简化您的代码:D):

内核:

    __global__ void myKernel(const cv::cuda::PtrStepSzf input,
                             cv::cuda::PtrStepSzf output)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x <= input.cols - 1 && y <= input.rows - 1 && y >= 0 && x >= 0)
        {
           output(y, x) = input(y, x);
        }
    }

注意:
cv::cuda::PtrStep<>:无尺寸信息
cv::cuda::PtrStepSz<>: with size information
cv::cuda::PtrStepSzb: for unsigned char Mats (CV_8U)
cv::cuda::PtrStepSzf: for float Mats (CV_32F)
cv::cuda::PtrStep<cv::Point2f>:其他字体示例

内核调用:

    void callKernel(cv::InputArray _input,
                    cv::OutputArray _output,
                    cv::cuda::Stream _stream)
    {
        const cv::cuda::GpuMat input = _input.getGpuMat();
        _output.create(input.size(), input.type()); 
        cv::cuda::GpuMat output = _output.getGpuMat();
        dim3 cthreads(16, 16);
        dim3 cblocks(
            static_cast<int>(std::ceil(input1.size().width /
                static_cast<double>(cthreads.x))),
            static_cast<int>(std::ceil(input1.size().height / 
                static_cast<double>(cthreads.y))));
        cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
        myKernel<<<cblocks, cthreads, 0, stream>>>(input, output);
        cudaSafeCall(cudaGetLastError());
    }

你可以使用cv::cuda::GpuMat:

调用这个函数

   callKernel(d_frame, d_output, cv::cuda::Stream());

您将图像step视为float偏移量。它是从一行到下一行的字节偏移量。

试试这样:

const float* rowsrcptr= (const float *)(((char *)srcptr)+rowInd*srcstep);
float* rowdstPtr=  (float *) (((char *)dstptr)+rowInd*dststep);

从文档:

step -每个矩阵行占用的字节数。

这也是一个好主意，添加适当的cuda错误检查到你的代码(例如func)。您可以使用cuda-memcheck运行代码，以查看实际的内核故障如何生成无效的读/写操作。