Multithreading for image processing at GPU using CUDA

Keywords: image processing, multithreading, CUDA, GPU      Updated: 2023-10-16

Problem statement: I have to continuously process 8-megapixel images captured from a camera. Several image-processing algorithms have to run on them, such as color interpolation, color transformation, etc. These operations take a long time on the CPU, so I decided to do them on the GPU using CUDA kernels. I have already written a CUDA kernel for color transformation, but I still need more of a boost in performance.

Basically there are two computation times:

  1. copying the source image from CPU to GPU and vice versa
  2. processing the source image on the GPU

While the image is being copied from CPU to GPU... nothing else happens. Likewise, while the GPU is processing the image... nothing else happens.

My idea: I want to do multithreading so that I can save some time. I want to capture the next image while the previous one is being processed on the GPU. And, when the GPU finishes processing the previous image, the next one should already be there, ready to be transferred from CPU to GPU.

What I need: I am completely new to the world of multithreading. I am watching some tutorials and other material to learn more about it. So, I am looking for some suggestions about the proper steps and proper logic.

I'm not sure you really need threads for this. CUDA has the ability to allow for asynchronous concurrent execution between host and device (without the use of multiple CPU threads). What you're asking for is a pretty standard "pipelined" algorithm. It would look like this:

$ cat t832.cu
#include <stdio.h>
#define IMGSZ 8000000
// for this example, NUM_FRAMES must be less than 255
#define NUM_FRAMES 128
#define nTPB 256
#define nBLK 64

unsigned char cur_frame = 0;
unsigned char validated_frame = 0;

bool validate_image(unsigned char *img) {
  validated_frame++;
  for (int i = 0; i < IMGSZ; i++) if (img[i] != validated_frame) {printf("image validation failed at %d, was: %d, should be: %d\n", i, img[i], validated_frame); return false;}
  return true;
}
void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void* data) {
    validate_image((unsigned char *)data);
}

bool capture_image(unsigned char *img){
  for (int i = 0; i < IMGSZ; i++) img[i] = cur_frame;
  if (++cur_frame == NUM_FRAMES) {cur_frame--; return true;}
  return false;
}
__global__ void img_proc_kernel(unsigned char *img){
  int idx = threadIdx.x + blockDim.x*blockIdx.x;
  while(idx < IMGSZ){
    img[idx]++;
    idx += gridDim.x*blockDim.x;}
}
int main(){
  // setup
  bool done = false;
  unsigned char *h_imgA, *h_imgB, *d_imgA, *d_imgB;
  size_t dsize = IMGSZ*sizeof(unsigned char);
  cudaHostAlloc(&h_imgA, dsize, cudaHostAllocDefault);
  cudaHostAlloc(&h_imgB, dsize, cudaHostAllocDefault);
  cudaMalloc(&d_imgA, dsize);
  cudaMalloc(&d_imgB, dsize);
  cudaStream_t st1, st2;
  cudaStreamCreate(&st1); cudaStreamCreate(&st2);
  unsigned char *cur = h_imgA;
  unsigned char *d_cur = d_imgA;
  unsigned char *nxt = h_imgB;
  unsigned char *d_nxt = d_imgB;
  cudaStream_t *curst = &st1;
  cudaStream_t *nxtst = &st2;

  done = capture_image(cur); // grabs a frame and puts it in cur
  // enter main loop
  while (!done){
    cudaMemcpyAsync(d_cur, cur, dsize, cudaMemcpyHostToDevice, *curst); // send frame to device
    img_proc_kernel<<<nBLK, nTPB, 0, *curst>>>(d_cur); // process frame
    cudaMemcpyAsync(cur, d_cur, dsize, cudaMemcpyDeviceToHost, *curst);
  // insert a cuda stream callback here to copy the cur frame to output
    cudaStreamAddCallback(*curst, &my_callback, (void *)cur, 0);
    cudaStreamSynchronize(*nxtst); // prevent overrun
    done = capture_image(nxt); // capture nxt image while GPU is processing cur
    unsigned char *tmp = cur;
    cur = nxt;
    nxt = tmp;   // ping - pong
    tmp = d_cur;
    d_cur = d_nxt;
    d_nxt = tmp;
    cudaStream_t *st_tmp = curst;
    curst = nxtst;
    nxtst = st_tmp;
    }
  cudaDeviceSynchronize(); // let in-flight copies, kernels and callbacks finish before exit
}
$ nvcc -o t832 t832.cu
$ cuda-memcheck ./t832
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

There are many CUDA sample codes which may be instructive as well, such as simpleStreams, asyncAPI, and simpleCallbacks.

Since your question is very broad, I can only think of the following suggestions:

1) Use CUDA streams

When using multiple CUDA streams, the memory transfer CPU->GPU, the GPU processing, and the memory transfer GPU->CPU can overlap. This way the image processing of the next image can already begin while the results of the previous one are being transferred back.

You can also decompose each frame: use n streams per frame and launch the image-processing kernels n times, once per chunk, each with an offset.
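A sketch of that per-frame decomposition is shown below. This is not from the original answer; `process_chunk`, `NSTREAMS`, and the buffer names are illustrative assumptions, and `h_img` is assumed to be pinned host memory (allocated with cudaHostAlloc) so that cudaMemcpyAsync can actually overlap with kernel execution:

```cuda
#include <cuda_runtime.h>

#define IMGSZ    8000000
#define NSTREAMS 4        // number of chunks/streams per frame (illustrative)
#define nTPB     256

// Placeholder per-pixel operation on one chunk of the frame.
__global__ void process_chunk(unsigned char *chunk, int len) {
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx < len) chunk[idx]++;
}

// Split one frame into NSTREAMS chunks and overlap the H2D copy,
// kernel, and D2H copy of different chunks across streams.
void process_frame(unsigned char *h_img, unsigned char *d_img,
                   cudaStream_t streams[NSTREAMS]) {
  int chunk = (IMGSZ + NSTREAMS - 1) / NSTREAMS;
  for (int i = 0; i < NSTREAMS; i++) {
    int off = i * chunk;
    int len = (off + chunk > IMGSZ) ? (IMGSZ - off) : chunk;
    cudaMemcpyAsync(d_img + off, h_img + off, len,
                    cudaMemcpyHostToDevice, streams[i]);
    process_chunk<<<(len + nTPB - 1) / nTPB, nTPB, 0, streams[i]>>>(d_img + off, len);
    cudaMemcpyAsync(h_img + off, d_img + off, len,
                    cudaMemcpyDeviceToHost, streams[i]);
  }
  for (int i = 0; i < NSTREAMS; i++) cudaStreamSynchronize(streams[i]);
}
```

While chunk i's kernel runs, chunk i+1's host-to-device copy can proceed in its own stream, so the copy engines and the compute engine stay busy at the same time.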

2) Apply a producer-consumer scheme

The producer thread captures frames from the camera and stores them in a thread-safe container. A consumer thread fetches a frame from that source container, uploads it to the GPU using its own CUDA stream, launches the kernel, and copies the result back to the host. Each consumer thread synchronizes with its stream before it tries to fetch a new image from the source container.

A simple implementation could look like this:

#include <vector>
#include <thread>
#include <memory>
#include <cuda_runtime.h>
struct ThreadSafeContainer{ /*...*/ };
struct Producer
{
    Producer(std::shared_ptr<ThreadSafeContainer> c) : container(c)
    {
    }
    void run()
    {
        while(true)
        {
            // grab image from camera
            // store image in container
        }
    }
    std::shared_ptr<ThreadSafeContainer> container;
};
struct Consumer
{
    Consumer(std::shared_ptr<ThreadSafeContainer> c) : container(c)
    {
        cudaStreamCreate(&stream);
    }
    ~Consumer()
    {
        cudaStreamDestroy(stream);
    }
    void run()
    {
        while(true)
        {
            // read next image from container
            // upload to GPU
            cudaMemcpyAsync(...,...,...,stream);
            // run kernel
            kernel<<<..., ..., ..., stream>>>(...);
            // copy results back
            cudaMemcpyAsync(...,...,...,stream);
            // wait for results 
            cudaStreamSynchronize(stream);
            // do something with the results
        }
    }
    std::shared_ptr<ThreadSafeContainer> container;
    cudaStream_t stream; // or multiple streams per consumer
};

int main()
{
    // create an instance of ThreadSafeContainer which will be shared between Producer and Consumer instances
    auto container = std::make_shared<ThreadSafeContainer>();
    // create one instance of Producer, pass the shared container as an argument to the constructor
    auto p = std::make_shared<Producer>(container);
    // create a separate thread which executes Producer::run  
    std::thread producer_thread(&Producer::run, p);
    const int consumer_count = 2;
    std::vector<std::thread> consumer_threads;
    std::vector<std::shared_ptr<Consumer>> consumers;
    // create as many consumers as specified
    for (int i=0; i<consumer_count;++i)
    {
        // create one instance of Consumer, pass the shared container as an argument to the constructor
        auto c = std::make_shared<Consumer>(container);
        consumers.push_back(c);
        // create a separate thread which executes Consumer::run
        consumer_threads.push_back(std::thread(&Consumer::run, c));
    }
    // wait for the threads to finish, otherwise the program will just exit here and the threads will be killed
    // in this example, the program will never exit since the infinite loop in the run() methods never end
    producer_thread.join();
    for (auto& t : consumer_threads)
    {
        t.join();
    }
    return 0;
}
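The ThreadSafeContainer above is left as a stub. One way to fill it in is a bounded queue guarded by a mutex and two condition variables; the sketch below (class and method names are my own, not from the answer) blocks the producer when the queue is full, which applies backpressure if the camera produces frames faster than the GPU consumes them:

```cpp
#include <condition_variable>
#include <mutex>
#include <queue>

// Bounded thread-safe queue: push() blocks while full, pop() blocks while empty.
template <typename T>
class BoundedQueue {
public:
    explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

    void push(T item) {
        std::unique_lock<std::mutex> lock(m_);
        not_full_.wait(lock, [this] { return q_.size() < capacity_; });
        q_.push(std::move(item));
        not_empty_.notify_one();
    }

    T pop() {
        std::unique_lock<std::mutex> lock(m_);
        not_empty_.wait(lock, [this] { return !q_.empty(); });
        T item = std::move(q_.front());
        q_.pop();
        not_full_.notify_one();
        return item;
    }

private:
    std::mutex m_;
    std::condition_variable not_full_, not_empty_;
    std::queue<T> q_;
    size_t capacity_;
};
```

The Producer's run() loop would then call push() with each captured frame, and each Consumer's run() loop would call pop() before its upload/kernel/download sequence.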