CUDA:有没有办法在继续之前强制每行完成

CUDA: Is there a way to force each line to complete before moving on?

本文关键字：有没有继续 CUDA 更新时间：2023-10-16

我是并行编程的新手，感谢您帮助理解它是如何工作的。这是一个人为的例子，我希望矩阵的每个单元格中的运算结果为 50。

结果取决于数组中 [index+1] 处的值。这在并行编程中效果不佳，因为值不是按顺序计算的，而且我每隔几个单元格就会得到不正确的结果。我拥有的创可贴是将功能拆分为多个功能，但我认为应该有更好的解决方案，尽管我不确定要搜索什么。谢谢。

库达代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#define TILE_WIDTH 16
using namespace std;
__global__ void cuda_arithmetic(int height, int width, float *B, float *C, float *initial_array, float *result_array){
    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;
    if ((w < width) && h < (height))                    //initial=20, B=2, C=10, result = 17;
        initial_array[index] = powf(C[index],2);
    if ((w < (width-1)) && h < (height))
        result_array[index] = initial_array[index+1] / B[index];
}
__global__ void cuda_arithmetic_step_1(int height, int width, float *B, float *C, float *initial_array, float *result_array){
    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;
    if ((w < width) && h < (height))
        initial_array[index] = powf(C[index],2);
}
__global__ void cuda_arithmetic_step_2(int height, int width, float *B, float *C, float *initial_array, float *result_array){
    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;
    if ((w < (width-1)) && h < (height))
        result_array[index] = initial_array[index+1] / B[index];
}
int main(){
    int             height              =   800;
    int             width               =   8192;
    float           *A                  =   new float[height * width];
    float           *B                  =   new float[height * width];
    float           *C                  =   new float[height * width];
    float           *result             =   new float[height * width];
    for (int i = 0; i < height; i++){
        for (int j = 0; j < width; j++){
            A[i*width+j] = 20;
            B[i*width+j] = 2;
            C[i*width+j] = 10;
            result[i*width+j] = 17;
        }
    }
    float           *gpu_A;
    float           *gpu_B;
    float           *gpu_C;
    float           *gpu_result;
    cudaMalloc((void **)&gpu_A,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_B,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_C,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_result,    (height * width * sizeof(float)));
    cudaMemcpy(gpu_A,       A,          (height * width * sizeof(float)), cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_B,       B,          (height * width * sizeof(float)), cudaMemcpyHostToDevice); 
    cudaMemcpy(gpu_C,       C,          (height * width * sizeof(float)), cudaMemcpyHostToDevice); 
    cudaMemcpy(gpu_result,  result,     (height * width * sizeof(float)), cudaMemcpyHostToDevice);
    dim3            dimGrid((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
    dim3            dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    // CODE OPTION
    // incorrect result
    cuda_arithmetic<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
    // correct result
    //cuda_arithmetic_step_1<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
    //cuda_arithmetic_step_2<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
    cudaMemcpy(result, gpu_result, (height * width * sizeof(float)), cudaMemcpyDeviceToHost);
    for (int i = 0; i < height; i++){
        for (int j = 0; j < (width-1); j++){
            if (abs((result[i*(width-1)+j] - 50)) > 0.001){
                cout << "error: ";
                cout << i << " * " << width-1 << " + " << j << ": " << result[i*(width-1)+j] << endl;
                system("pause");
            }
        }
        cout << endl;
    }
    cout << endl;
    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);
    cudaFree(gpu_result);
    delete[] A;
    delete[] B;
    delete[] C;
    delete[] result;
    system("pause");
}

由于你的例子是人为的，我的回答会有些笼统。

通常，您正在处理全局同步的问题。

正如您所发现的，唯一干净的全局同步点是内核启动，因此，由于内核启动，在必要的同步点之前和之后将代码分解为多个部分将插入全局同步。
另一种方法是考虑是否可以本地化必要的同步。如果是这样，您可以考虑安排算法/数据，以便可以在线程块中处理必要的同步(其中共享内存和__syncthreads()为我们提供内置的协调/同步功能。这可能会在数据边界(例如线程块间边界(方面遇到一些挑战。处理边界数据的一种方法是让相邻线程块在边界区域中执行冗余计算，以便保证每个线程块在计算任何最终结果之前生成所有必要的中间结果。在这种情况下，您可以使用 __syncthreads() 安全地将中间结果的计算与最终结果分开，这是一个线程内块屏障。
在某些情况下，可以将依赖项减少到单个线程。例如，在代码中，可以使单个线程执行必要的计算：
```
initial_array[index+1] = powf(C[index+1],2);
```
和依赖结果计算：
```
result_array[index] = initial_array[index+1] / B[index];
```
由于保证在计算必要的
中间结果后执行相关计算，因此不需要其他同步。可能您的实际代码可能不适合进行如此微不足道的重写。

顺便说一句，请注意，您对 index+1 的使用将超出内核中最后一个线程块的范围(w = 宽度 -1，h = height-1(。另外，我不认为这种索引是您想要的：

        if (abs((result[i*(width-1)+j] - 50)) > 0.001){

我想你可能的意思是：

        if (abs((result[i*(width)+j] - 50)) > 0.001){

通过这些更改，您的cuda_arithmetic内核为我正常运行(即使它有轻微的越界问题(。

使用 __syncthreads(); .__syncthreads之前的所有代码都将在此内核启动的所有线程中执行，然后再执行之后的任何代码。

另请记住，最好将读取和写入操作分开以避免冲突(当多个线程从同一地址读取和写入时(。例如

array[i] = array2[i]

应返工为

   temp = array2[i]; 
   __syncthreads();
   array[i] = temp;
   __syncthreads();