
What is the fastest way to perform vector-by-vector dot products for two MxN matrices with small M in CUDA?

我有两个矩阵,每个矩阵的大小都是 M×N,其中 M = 16,N 较大(例如 N = 262144)。我的目标是得到一个长度为 N 的向量,其中第 n 个元素等于两个矩阵中第 n 个列向量的点积。

我尝试了以下方法,其中 cIdx 是每个矩阵中列向量的列索引。不出所料,NVIDIA Visual Profiler 告诉我,这种方法主要受内存带宽限制。

    /// <summary>
    /// GPU kernel: for every vector index c in [0, matrixDimensions.y), writes
    /// the dot product of the c-th vector of <paramref name="matrix1"/> and the
    /// c-th vector of <paramref name="matrix2"/> to dotProduct[c].
    /// </summary>
    /// <remarks>
    /// Each vector is read as matrixDimensions.x consecutive floats starting at
    /// offset c * matrixDimensions.x. Threads walk the vectors in a grid-stride
    /// loop, so any 1-D launch configuration covers all of them.
    /// NOTE(review): the surrounding class and the C#-to-GPU framework that
    /// supplies blockIdx/blockDim/gridDim/threadIdx and int2 are not visible
    /// here — confirm the required kernel attribute and unsafe context.
    /// </remarks>
    /// <param name="matrix1">First input; matrixDimensions.x * matrixDimensions.y floats are read.</param>
    /// <param name="matrix2">Second input, indexed exactly like <paramref name="matrix1"/>.</param>
    /// <param name="dotProduct">Output; matrixDimensions.y floats are written.</param>
    /// <param name="matrixDimensions">x = elements per vector, y = number of vectors.</param>
    public static void MatrixDotProduct(
        float* matrix1,
        float* matrix2,
        float* dotProduct,
        int2 matrixDimensions)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = gridDim.x * blockDim.x;

        for (int cIdx = i; cIdx < matrixDimensions.y; cIdx += stride)
        {
            // Offset of the first element of vector cIdx.
            int ci = cIdx * matrixDimensions.x;

            // Accumulator is private to this vector, so it is reset per iteration.
            float sum = 0f;
            for (int j = 0; j < matrixDimensions.x; j++)
            {
                sum += matrix1[ci + j] * matrix2[ci + j];
            }

            dotProduct[cIdx] = sum;
        }
    }

我还找到了一个向量与向量点积的版本,并同样尝试将其并行化。不幸的是,它比上述实现慢 20%(也许是因为我的 M = 16?)。有没有我在这里遗漏的、更好的方法来解决这个问题?



// Dot product of two 2-component float vectors: v1.x*v2.x + v1.y*v2.y.
__device__ float vdot(float2 v1, float2 v2) {
    return (v1.x * v2.x) + (v1.y * v2.y);
}
// Dot product of two 4-component float vectors, summed in x, y, z, w order.
__device__ float vdot(float4 v1, float4 v2) {
    return (v1.x * v2.x) + (v1.y * v2.y) + (v1.z * v2.z) + (v1.w * v2.w);
}
// Dot product of two arrays of NT packed vectors.
//
// VT is a vector type for which a vdot(VT, VT) overload exists; NT is the
// number of VT elements to read from each of v1 and v2. The trip count is a
// compile-time constant, so the loop is marked for full unrolling.
// Returns the sum of vdot(v1[j], v2[j]) for j in [0, NT).
template<typename VT, int NT>
__device__ float vector_dotprod(const VT* v1, const VT* v2) {
    float sum = 0.f;
#pragma unroll
    for (int j = 0; j < NT; j++) {
        sum += vdot(v1[j], v2[j]);
    }
    return sum;
}
// Kernel: writes to dotProduct[c] the dot product of the c-th vectors of
// matrix1 and matrix2, for every c in [0, matrixDimensions.y).
//
// Both inputs are reinterpreted as arrays of VT, and each vector occupies
// Nrows consecutive VT elements (e.g. VT = float4, Nrows = 4 covers 16 floats).
// matrixDimensions.x is not read; the per-vector length comes from the
// template arguments only.
//
// Launch: 1-D grid and 1-D block of any size. Threads stride over the vectors
// by gridDim.x * blockDim.x, so the launch need not cover them exactly.
// Precondition: matrix1 and matrix2 must satisfy VT's alignment requirement,
// since they are accessed through VT pointers.
template<typename VT, int Nrows>
__global__
void MatrixDotProductPlus(float* matrix1, float* matrix2, float* dotProduct, int2 matrixDimensions)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    // Pointer advance, in VT elements, matching one step of `stride` vectors.
    int stride2 = stride * Nrows;

    const VT* m1 = reinterpret_cast<const VT*>(matrix1) + i * Nrows;
    const VT* m2 = reinterpret_cast<const VT*>(matrix2) + i * Nrows;
    for (; i < matrixDimensions.y; i += stride, m1 += stride2, m2 += stride2) {
        dotProduct[i] = vector_dotprod<VT, Nrows>(m1, m2);
    }
}

[警告:仅经过非常简单的测试——使用风险自负]

