Cuda矩阵示例块大小

Cuda Matrix Example Block Size

本文关键字：Cuda 更新时间：2023-10-16

我刚开始学习CUDA，我一直在NVIDIA的网站上查看示例。具体来说，我已经实现了矩阵乘法的非共享版本（第一个样本是非共享版本，尽管它在共享内存部分）：

http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-存储器

当我更改块大小时，输出出现问题。NVIDIA的代码默认块大小为16，当我将两个矩阵相乘时，这会给我正确的输出。然而，如果我将块大小更改为大于16的任何值（同时仍然是16的倍数），则矩阵中所有元素的输出都为零。我也在我的笔记本电脑上测试了这一点，发现超过32岁而不是16岁的人都有同样的结果。有人能解释一下发生了什么吗？我在SLI中有两个9800GTX+视频卡，所以我的最大块大小应该是（512512,1）。为什么我只能做16个？

此外，我注意到在共享版本的矩阵乘法中也有同样的行为（也在NVIDIA页面上）。

我没有发布我的代码，因为如果我直接从NVIDIA网站复制代码，我会遇到同样的问题。

我真的很感激在这方面的任何帮助，或者提供资源来了解更多关于CUDA的详细信息。

谢谢！

我已按要求附上代码：

    #include "stdio.h"
    #include <cuda.h>
    #include <assert.h>
    #include <time.h>
    #include <math.h>
    // This is an example CUDA program that compares the timings of a matrix multiplication.
    // The comparisons are between the CPU, GPU, and the GPU with shared memory.
    #define BLOCK_SIZE 32
    typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
    } Matrix;
    typedef void (*FuncPtr)(Matrix& A, Matrix& B, Matrix& C);
    void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C);
    // Helper declarations
    void initializeMatrix(Matrix& A, int rows, int cols, float val);
    void copyMatrix(Matrix& dest, Matrix& src);
    void freeMatrix(Matrix& A);
    void printError(cudaError_t err);
    void printMat(Matrix& A);
    void setVal(Matrix& A, float val);
    double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters);
    // CUDA declarations
    __global__ void cudaMultMat(Matrix A, Matrix B, Matrix C);

   int main() {
       printf("Beginning Matrix Multiplication Comparisonn");
       // Initialize matrix
       Matrix A, B, C;
       int rowsA = 32;
       int colsA = 32;
       int colsB = 32;
       initializeMatrix(A, rowsA, colsA, 5.0f);
       initializeMatrix(B, colsA, colsB, 2.0f);
       initializeMatrix(C, rowsA, colsB, 0.0f);
       // C = A * B using CPU, GPU, and GPU with shared memory
       FuncPtr gpuMatMult = &multiplyMatrix;
       int numOfIterations = 100;
       double multTime = applyMultFunc(gpuMatMult, A, B, C, numOfIterations);
       printMat(C); 
       // Update user
       printf("Normal Mat Mult Time: %fn", multTime);

       // Cleanup
       freeMatrix(A);
       freeMatrix(B);
       freeMatrix(C);
       printf("nPress Enter to continue...n");
       getchar();
       return 0;
  }
  void multiplyMatrix(Matrix& A, Matrix& B, Matrix& C) {
    // Initialize device matrices
    Matrix deviceA, deviceB, deviceC;
    copyMatrix(deviceA, A);
    copyMatrix(deviceB, B);
    copyMatrix(deviceC, C);
    // Initialize number of blocks and threads
    dim3 numOfThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    int xSize = (C.width + numOfThreadsPerBlock.x - 1) / numOfThreadsPerBlock.x;
    int ySize = (C.height + numOfThreadsPerBlock.y - 1) / numOfThreadsPerBlock.y;
    dim3 numOfBlocks(xSize, ySize);
    // Call CUDA kernel
    cudaMultMat<<<numOfBlocks, numOfThreadsPerBlock>>>(deviceA, deviceB, deviceC);
    printError(cudaThreadSynchronize());
    printError(cudaMemcpy(C.elements, deviceC.elements, C.height * C.width * sizeof(float), cudaMemcpyDeviceToHost));
    // Free cuda memory
    printError(cudaFree(deviceA.elements));
    printError(cudaFree(deviceB.elements));
    printError(cudaFree(deviceC.elements));
  }

 // CUDA definitions
 // GPU matrix multiplication (non-shared memory)
 __global__ void cudaMultMat(Matrix A, Matrix B, Matrix C) {
    // If the matrices are of the wrong size then return
    if(A.width != B.height) {
        return;
    }
    // Initialize the indexes into the grid
    int col = (blockDim.x * blockIdx.x) + threadIdx.x;
    int row = (blockDim.y * blockIdx.y) + threadIdx.y;
    // Initialize the result
    float cVal = 0.0f;
    // Find the result for the dot product of a row of A and a column of B
    for(int i = 0; i < A.width; i++) {
        cVal += A.elements[row * A.width + i] * B.elements[i * B.width + col];
     }
     // If we are in bounds then save the result
     if(row < C.height && col < C.width) {
        C.elements[row * C.width + col] = cVal;
     }
  } 
  // Helper functions
  void initializeMatrix(Matrix& A, int rows, int cols, float val) {
    A.width = cols;
    A.height = rows;
    A.stride = A.width;
    int numOfElements = A.width * A.height;
    A.elements = (float*) malloc(numOfElements * sizeof(float));
    for(int i = 0; i < numOfElements; i++) {
        A.elements[i] = val;
    }
   }
   void copyMatrix(Matrix& dest, Matrix& src) {
    dest.width = src.width;
    dest.height = src.height;
    dest.stride = src.stride;
    int size = src.width * src.height * sizeof(float);
    printError(cudaMalloc(&dest.elements, size)); 
    printError(cudaMemcpy(dest.elements, src.elements, size, cudaMemcpyHostToDevice));
   }
   void freeMatrix(Matrix& A) {
    free(A.elements);
   }
   void printError(cudaError_t err) {
    if(err != 0) {
        printf("CUDA ERROR: %sn", cudaGetErrorString(err));
        getchar();
    }
    }
    void printMat(Matrix& A) {
    printf("*********************************n");
    for(int i = 0; i < A.height; i++) {
         for(int j = 0; j < A.width; j++) {
             int index = i * A.width + j;
             printf("%2.1f, ", A.elements[index]); 
         }
         printf("n");
     }
  }
  void setVal(Matrix& A, float val) {
     for(int i = 0; i < A.width * A.height; i++) {
          A.elements[i] = val;
     }
  }
  double applyMultFunc(FuncPtr func, Matrix& A, Matrix& B, Matrix& C, int numOfIters) {
    clock_t startTime = clock();
    for(int i = 0; i < numOfIters; i++) {
        func(A, B, C);
     } 
     clock_t endTime = clock();
     return (double) (endTime - startTime) / CLOCKS_PER_SEC;
   }

当您增加块大小时，您将超过GPU的每个块规格的线程数。

9800GTX的每个块限制为512个线程，无论您如何创建块。16*16=256可以。32x32=1024不可以。在这种情况下，内核无法运行，因此输出不正确。

你的笔记本电脑可能有一个更新的GPU，每个块支持1024个线程，所以32 x 32可以，但任何更大的都不行。

如果你在代码中添加了正确的cuda错误检查，你就可以确认这一点。请注意，这段代码似乎有cuda错误检查，但在内核调用上实现的检查是不完整的。仔细研究我给出的链接，你就会发现其中的区别。如果您修改代码并进行完整的错误检查，您将看到错误。

如果GPU的计算能力为1.0/1.1，则每个块最多可以有512个线程。但在新的GPU设备中，每个块最多可以有1024个线程。