使用CUDA对矩阵执行行/列操作

Row-wise/column-wise operations on matrices with CUDA

本文关键字：操作执行 CUDA 使用更新时间：2023-10-16

我对CUDA编程相对陌生。我已经理解了编程模型，并且已经编写了一些基本的内核。我知道如何将内核应用于矩阵(存储为1D数组)的每个元素，但现在我正试图找出如何将相同的操作应用于输入矩阵的同一行/列。

假设我有一个MxN矩阵和一个长度为N的向量。我想把向量和矩阵的每一行求和(但可以是任何其他数学运算)。此类操作的序列号为：

for (int c = 0; c < columns; c++) 
{
for (int r = 0; r < rows; r++)
{
M[r * rows + c] += V[c];
}
}

现在，用于执行此操作的CUDA代码应该非常简单：我应该生成与元素一样多的CUDA线程，并应用此内核：

__global__ void kernel(const unsigned int size, float* matrix, const float* vector)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// sum the current element with the 
matrix[idx] += vector[threadIdx.x];
}
}

它运行，但结果不正确。事实上，如果我在内核完成它的工作后转置矩阵，这是正确的。不幸的是，我不知道为什么它会以这种方式工作。你能帮我解决这个问题吗？提前谢谢。

编辑#1

我使用启动内核

int block_size = 64;
int grid_size = (M * N + block_size - 1) / block_size;
kernel<<<grid_size, block_size>>>(M * N, matrix, vector);

编辑#2

我按照@RobertCrovella:的建议修复了CPU代码，解决了这个问题

M[r * columns + c] += V[c];

它应该匹配外部for，也就是说，在列上。

问题中显示的内核可以在不做修改的情况下使用，以将向量与矩阵的每一行求和(假设为c样式的行主存储)，但受某些限制。演示在这里。

该方法的主要限制是，可以处理的最大矢量长度以及矩阵宽度等于每个块的最大线程数，在当前CUDA 7支持的GPU上，该线程数为1024。

我们可以通过稍微修改向量索引来消除这种限制，并将行宽度(列数)作为参数传递给矩阵。通过这种修改，我们应该能够处理任意的矩阵(和向量)大小。

编辑：根据讨论/评论，OP想知道如何处理行主或列主底层存储。以下示例使用模板化内核来选择底层存储的行主或列主，还显示了一种可能的CUBLAS方法，用于使用秩-1更新函数向每个矩阵行操作添加向量：

$ cat t712.cu
#include <iostream>
#include <cublas_v2.h>
#define ROWS 20
#define COLS 10
#define nTPB 64
#define ROW_MAJOR 0
#define COL_MAJOR 1
template <int select, typename T>
__global__ void vec_mat_row_add(const unsigned int height, const unsigned int width, T* matrix, const T* vector)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < height*width)
{
// sum the current element with the
if (select == ROW_MAJOR)
matrix[idx] += vector[idx%width];
else // COL_MAJOR
matrix[idx] += vector[idx/height];
}
}
int main(){
float *h_mat, *d_mat, *h_vec, *d_vec;
const unsigned int msz = ROWS*COLS*sizeof(float);
const unsigned int vsz = COLS*sizeof(float);
h_mat = (float *)malloc(msz);
h_vec = (float *)malloc(vsz);
cudaMalloc(&d_mat, msz);
cudaMalloc(&d_vec, vsz);
for (int i=0; i<COLS; i++) h_vec[i] = i; // set vector to 0,1,2, ...
cudaMemcpy(d_vec, h_vec, vsz, cudaMemcpyHostToDevice);
// test row-major case
cudaMemset(d_mat, 0, msz); // set matrix to zero
vec_mat_row_add<ROW_MAJOR><<<(ROWS*COLS + nTPB -1)/nTPB, nTPB>>>(ROWS, COLS, d_mat, d_vec);
cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
std::cout << "Row-major result: " << std::endl;
for (int i = 0; i < ROWS; i++){
for (int j = 0; j < COLS; j++) std::cout << h_mat[i*COLS+j] << " ";
std::cout << std::endl;}
// test column-major case
cudaMemset(d_mat, 0, msz); // set matrix to zero
vec_mat_row_add<COL_MAJOR><<<(ROWS*COLS + nTPB -1)/nTPB, nTPB>>>(ROWS, COLS, d_mat, d_vec);
cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
std::cout << "Column-major result: " << std::endl;
for (int i = 0; i < ROWS; i++){
for (int j = 0; j < COLS; j++) std::cout << h_mat[j*ROWS+i] << " ";
std::cout << std::endl;}
// test CUBLAS, doing matrix-vector add using <T>ger
cudaMemset(d_mat, 0, msz); // set matrix to zero
float *d_ones, *h_ones;
h_ones = (float *)malloc(ROWS*sizeof(float));
for (int i =0; i<ROWS; i++) h_ones[i] = 1.0f;
cudaMalloc(&d_ones, ROWS*sizeof(float));
cudaMemcpy(d_ones, h_ones, ROWS*sizeof(float), cudaMemcpyHostToDevice);
cublasHandle_t ch;
cublasCreate(&ch);
float alpha = 1.0f;
cublasStatus_t stat = cublasSger(ch, ROWS, COLS, &alpha, d_ones, 1, d_vec, 1, d_mat, ROWS);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
std::cout << "CUBLAS Column-major result: " << std::endl;
for (int i = 0; i < ROWS; i++){
for (int j = 0; j < COLS; j++) std::cout << h_mat[j*ROWS+i] << " ";
std::cout << std::endl;}
return 0;
}
$ nvcc -o t712 t712.cu -lcublas
$ ./t712
Row-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
CUBLAS Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
$

为了简洁起见，我没有包括正确的cuda错误检查，但当您在使用cuda代码时，这总是一个好主意。作为代理/快捷方式，您可以使用cuda-memcheck运行代码，以快速检查是否存在任何CUDA错误。

请注意，我们希望所有3个打印输出都是相同的，因为这实际上是显示矩阵的正确方式，无论底层存储是行主存储还是列主存储。底层存储的差异在处理显示输出的for循环中得到了解释。

Robert Crovella已经回答了这个问题，提供了使用显式CUDA内核和cuBLAS的示例。

我发现展示一个关于如何使用CUDA Thrust执行行或列操作的示例对于将来的参考非常有用。我特别关注两个问题：

将列向量与所有矩阵列求和
将行向量与所有矩阵行相加

thrust::transform的通用性使下面的例子能够推广到除和之外的元素运算(例如，乘法、除法、减法等)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/random.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/equal.h>
using namespace thrust::placeholders;
/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
T Ncols; // --- Number of columns
__host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
__host__ __device__ T operator()(T i) { return i / Ncols; }
};
/********/
/* MAIN */
/********/
int main()
{
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int Nrows = 10;           // --- Number of rows
const int Ncols =  3;           // --- Number of columns  
// --- Random uniform integer distribution between 0 and 100
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist1(0, 100);
// --- Random uniform integer distribution between 1 and 4
thrust::uniform_int_distribution<int> dist2(1, 4);
// --- Matrix allocation and initialization
thrust::device_vector<float> d_matrix(Nrows * Ncols);
for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist1(rng);
// --- Column vector allocation and initialization
thrust::device_vector<float> d_column(Nrows);
for (size_t i = 0; i < d_column.size(); i++) d_column[i] = (float)dist2(rng);
// --- Row vector allocation and initialization
thrust::device_vector<float> d_row(Ncols);
for (size_t i = 0; i < d_row.size(); i++) d_row[i] = (float)dist2(rng);
printf("nnOriginal matrixn");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "]n";
}
printf("nnColumn vectorn");
for(int i = 0; i < Nrows; i++) std::cout << d_column[i] << "n";
printf("nnRow vectorn");
for(int i = 0; i < Ncols; i++) std::cout << d_row[i] << " ";
/*******************************************************/
/* ADDING THE SAME COLUMN VECTOR TO ALL MATRIX COLUMNS */
/*******************************************************/
thrust::device_vector<float> d_matrix2(d_matrix);
thrust::transform(d_matrix.begin(), d_matrix.end(),
thrust::make_permutation_iterator(
d_column.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols))),
d_matrix2.begin(),
thrust::plus<float>());
printf("nnColumn + Matrix -> Result matrixn");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix2[i * Ncols + j] << " ";
std::cout << "]n";
}
/*************************************************/
/* ADDING THE SAME ROW VECTOR TO ALL MATRIX ROWS */
/*************************************************/
thrust::device_vector<float> d_matrix3(d_matrix);
thrust::transform(thrust::make_permutation_iterator(
d_matrix.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)), 
thrust::make_permutation_iterator(
d_matrix.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)) + Nrows * Ncols,                    
thrust::make_permutation_iterator(
d_row.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows))),
thrust::make_permutation_iterator(
d_matrix3.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)), 
thrust::plus<float>());

printf("nnRow + Matrix -> Result matrixn");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix3[i * Ncols + j] << " ";
std::cout << "]n";
}
return 0; 
}