Matrix Multiplication with CUDA, long execution time

I am new to CUDA, and I have been trying to figure out what I am doing wrong here. CUDA is taking longer than just using the CPU to multiply a matrix. If I am doing something wrong, please let me know. Here is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <assert.h>
#include <time.h>
#define size 100   // Matrix size
#define cols size   // Matrix width
#define rows size   // Matrix height
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {
        fprintf(stderr, "Cuda error: %s: %s.n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }                         
}
__global__ void matrixMul( int *A, int *B, int *C)
{   
    int bx = blockIdx.x; // Block index
    int tx = threadIdx.x; // Thread index
    int ts = blockDim.x; // number of threads   
    // Declaration of the shared memory C element
    extern __shared__ int c_element_sum[];
    c_element_sum[tx] = A[tx+((bx/ts)*ts)] * B[(bx%ts)+(tx*ts)];
    //Block until all threads in the block have written their data to shared mem
    __syncthreads();
    int sum;
    for(int i=0; i<ts; i++){
        if(i==0){
            sum=c_element_sum[i];
        }
        else{
            sum+=c_element_sum[i];
        }
    }
    C[bx] = sum;
}

/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
   //create timer.
   clock_t t1, t2;
   //start timer
   t1=clock();
   //allocate host memory for matrices
   unsigned int size_A = cols * rows;
   unsigned int mem_size_A = sizeof(int) * size_A;
   int* mA = (int*) malloc(mem_size_A);
   unsigned int size_B = cols * rows;
   unsigned int mem_size_B = sizeof(int) * size_B;
   int* mB = (int*) malloc(mem_size_B);
   unsigned int size_C = cols * rows;
   unsigned int mem_size_C = sizeof(int) * size_C;
   int* mC = (int*) malloc(mem_size_C);
   //initialize host memory
   for (int i = 0; i < size_A; ++i){
       mA[i] = 1;
       mB[i] = 1;
       mC[i] = 0;
   }
   // allocate device memory
   int* d_mA;
   int* d_mB;
   int* d_mC;
   cudaMalloc((void**) &d_mA, mem_size_A);
   cudaMalloc((void**) &d_mB, mem_size_B);
   cudaMalloc((void**) &d_mC, mem_size_C);
   //copy host memory to device (A and B)
   cudaMemcpy(d_mA, mA, mem_size_A, cudaMemcpyHostToDevice);
   cudaMemcpy(d_mB, mB, mem_size_B, cudaMemcpyHostToDevice);
   cudaMemcpy(d_mC, mC, mem_size_C, cudaMemcpyHostToDevice);
   // setup execution parameters
   int numThreadsPerBlock = cols;
   int numBlocks = (cols * rows);
   int sharedMemSize = numThreadsPerBlock * sizeof(int);
   dim3 dimGrid(numBlocks);
   dim3 dimBlock(numThreadsPerBlock);
   // execute the kernel
   matrixMul <<< dimGrid, dimBlock, sharedMemSize >>>(d_mA, d_mB, d_mC);
   //Block until device has completed
   cudaDeviceSynchronize();
   // check if kernel execution generated an error
   // Check for any CUDA errors
   checkCUDAError("kernel invocation");
   //copy result from device to host
   cudaMemcpy(mC, d_mC, mem_size_C, cudaMemcpyDeviceToHost);
   // Check for any CUDA errors
   checkCUDAError("memcpy");
   //stop timer
   t2 = clock();
   //check results
   for (int i = 0; i < size_C; ++i){
       assert(mC[i] == cols);
   }
   //clean up memory
   free(mA);
   free(mB);
   free(mC);
   cudaFree(d_mA);
   cudaFree(d_mB);
   cudaFree(d_mC);
   printf("WITH CUDA - clocks: %d nn", t2-t1);
   //////////////////////////////
   ///////// CPU ONLY //////////
   /////////////////////////////
   //create timer.
   clock_t cpu_t1, cpu_t2;
   //start timer
   cpu_t1=clock();
   //allocate host memory for matrices
   unsigned int cpu_size_A = cols * rows;
   unsigned int cpu_mem_size_A = sizeof(int) * cpu_size_A;
   int* cpu_mA = (int*) malloc(cpu_mem_size_A);
   unsigned int cpu_size_B = cols * rows;
   unsigned int cpu_mem_size_B = sizeof(int) * cpu_size_B;
   int* cpu_mB = (int*) malloc(cpu_mem_size_B);
   unsigned int cpu_size_C = cols * rows;
   unsigned int cpu_mem_size_C = sizeof(int) * cpu_size_C;
   int* cpu_mC = (int*) malloc(cpu_mem_size_C);
   //initialize host memory
   for (int i = 0; i < cpu_size_A; ++i){
       cpu_mA[i] = 1;
       cpu_mB[i] = 1;
       cpu_mC[i] = 0;
   }
   int ts = cols;
   for(int bx=0; bx<(cols*rows);bx++){
       int sum = 0;
       for(int tx=0; tx<cols; tx++){
          sum += cpu_mA[tx+((bx/ts)*ts)] * cpu_mB[(bx%ts)+(tx*ts)];
       }
       cpu_mC[bx]=sum;
   }
   //stop timer
   cpu_t2 = clock();
   //check results
   for (int i = 0; i < cpu_size_C; ++i){
       assert(cpu_mC[i] == cols);
   }
   //clean up memory
   free(cpu_mA);
   free(cpu_mB);
   free(cpu_mC);
   printf("CPU ONLY - clocks: %d nn", cpu_t2-cpu_t1);
   return 0;
}

This is expected, given your program. Your timer looks like it covers the entire execution of the program, which includes copying to the device, the compute time, and copying the results back. Given the relatively small workload you're giving the program (100x100 matrices), the overhead of the memory copies far outweighs any computational benefit you get from using the kernel. Your kernel itself is also not the most efficient implementation.
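To see where the time actually goes, time the kernel separately from the host-device transfers. Here is a minimal sketch using CUDA events, reusing the variable names from your code around the existing kernel launch:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matrixMul <<< dimGrid, dimBlock, sharedMemSize >>>(d_mA, d_mB, d_mC);
cudaEventRecord(stop);
cudaEventSynchronize(stop);                    // block until the kernel has finished
float kernel_ms = 0.0f;
cudaEventElapsedTime(&kernel_ms, start, stop); // elapsed time in milliseconds
printf("kernel only: %f ms\n", kernel_ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);

Timing the cudaMemcpy calls the same way should confirm that, at this problem size, the transfers dominate.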

I don't think you're doing anything wrong; it's just that you haven't given the GPU a big enough chunk of work, and you could probably optimize your kernel further. Note that simply scaling up the size of the matrices may not significantly improve the performance relative to the CPU, since you would also be scaling up the memory-management time. While it's relatively simple to write a first implementation of a program in CUDA, it's significantly harder to get good performance out of it. The most effective way to use CUDA is to have a high ratio of compute to memory transactions, for example a pipeline of several compute-intensive kernels that operate on a chunk of data in succession, with host-device copies needed only at the beginning and the end.
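To give a concrete idea of what optimizing the kernel could look like: a standard first step is a tiled kernel, where each block computes one TILE x TILE sub-matrix of C and stages tiles of A and B through shared memory, so each global-memory element is read far fewer times. A minimal sketch, assuming row-major n x n int matrices and a tile width TILE that divides n evenly (TILE = 10 would fit your 100x100 case):

#define TILE 10   // assumed tile width; must divide n evenly in this sketch
__global__ void matrixMulTiled(const int *A, const int *B, int *C, int n)
{
    __shared__ int sA[TILE][TILE];
    __shared__ int sB[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    int sum = 0;
    for (int t = 0; t < n / TILE; ++t) {
        // Each thread loads one element of the current A tile and B tile.
        sA[threadIdx.y][threadIdx.x] = A[row * n + t * TILE + threadIdx.x];
        sB[threadIdx.y][threadIdx.x] = B[(t * TILE + threadIdx.y) * n + col];
        __syncthreads();
        for (int k = 0; k < TILE; ++k)
            sum += sA[threadIdx.y][k] * sB[k][threadIdx.x];
        __syncthreads();   // keep tiles intact until every thread is done
    }
    C[row * n + col] = sum;
}
// Launch: matrixMulTiled<<<dim3(n/TILE, n/TILE), dim3(TILE, TILE)>>>(d_mA, d_mB, d_mC, size);

This also launches one block per output tile instead of one block per output element, which avoids the redundant per-thread reduction in your version.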

If this is just a program to help you learn to code with CUDA, this is a great step, and getting a deep understanding of how to optimize a matrix multiplication kernel will serve you well in many other cases. If you are writing this kernel for use in production software, I would recommend you use the highly-optimized linear algebra library CUBLAS: http://developer.nvidia.com/cublas (or some other library where the hard work has been done for you).
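For a sense of scale, a GEMM call through the cuBLAS v2 API looks roughly like the sketch below. It uses float matrices, since cublasSgemm operates on floating-point data (your int matrices would need converting), assumes you link with -lcublas, and note that cuBLAS expects column-major storage:

#include <cublas_v2.h>
#include <cuda_runtime.h>
int main(void)
{
    const int n = 100;                  // matrix dimension, as in the question
    const float alpha = 1.0f, beta = 0.0f;
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**) &d_A, n * n * sizeof(float));
    cudaMalloc((void**) &d_B, n * n * sizeof(float));
    cudaMalloc((void**) &d_C, n * n * sizeof(float));
    // ... fill d_A and d_B, e.g. with cudaMemcpy from host buffers ...
    cublasHandle_t handle;
    cublasCreate(&handle);
    // C = alpha * A * B + beta * C, all matrices column-major n x n.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, n, n, &alpha, d_A, n, d_B, n, &beta, d_C, n);
    cublasDestroy(handle);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}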