用两种不同的方法在 C++ 中将两个矩阵相加

adding two matrices in c++ in two different ways

本文关键字：两个添加两种方法 c++ 更新时间：2023-10-16

我只是想知道这两种场景之间是否存在差异

第一段代码中，外层 for 循环遍历行，内层 for 循环遍历列

第二段代码中，外层 for 循环遍历列，内层 for 循环遍历行

我将两者应用并得到相同的结果

您会得到相同的结果，但可能会得到不同的性能。

矩阵很可能按行主序（row-major order）存储，按行访问可能会获得更好的内存带宽和缓存（cache）利用率。可以用非常大的矩阵做同样的实验，并测量实际运行时间（wall time）。

http://en.wikipedia.org/wiki/Row-major_order

下面是如何计时。首先是我的结果。报告的数字是10000次试验的平均CPU时钟周期。

$ clang++ -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided:    60149
sequential: 27137
$ g++-4.9 -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided:    90517
sequential: 33407

顺序访问速度更快。原因是缓存行为，特别是缓存行。这是关于这个主题的有趣读物。

http://igoro.com/archive/gallery-of-processor-cache-effects/

我区分跨步和顺序，而不是行和列，因为行和列是任意的。通常在C++中，我们认为顺序元素在同一行中，但这纯粹是惯例，不是语言固有的。不同的库遵循不同的约定。

测试代码。

// timing
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
#include <stdlib.h>
#include <cstdint>
#include <utility>
// Shared state for the cycle counter. start_count() fills cycles_high /
// cycles_low, stop_count() fills cycles_high1 / cycles_low1 and then derives
// start, stop and ellapsed_cycles from the four halves.
uint64_t start, stop;    // assembled 64-bit counter values (begin / end)
unsigned cycles_high;    // EDX half captured by start_count()
unsigned cycles_low;     // EAX half captured by start_count()
unsigned cycles_high1;   // EDX half captured by stop_count()
unsigned cycles_low1;    // EAX half captured by stop_count()
// stop - start. Held in 64 bits so the difference of two uint64_t values is
// stored without being truncated to the width of unsigned.
uint64_t ellapsed_cycles;
// Records the starting time-stamp counter into cycles_high / cycles_low.
//
// Sequence: CPUID, then RDTSC, then EDX -> cycles_high and EAX -> cycles_low.
// NOTE(review): CPUID is placed first as a serializing instruction, following
// Intel's "How to Benchmark Code Execution Times" white paper; confirm against
// that document if the sequence is changed.
//
// Each instruction in the template must end with "\n\t" so the assembler sees
// one instruction per line.
static inline void start_count()
{
    asm volatile(
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        // CPUID and RDTSC overwrite these registers; declaring them keeps the
        // compiler from holding live values (or the outputs) in them.
        : "%rax", "%rbx", "%rcx", "%rdx");
}
// Records the ending time-stamp counter into cycles_high1 / cycles_low1, then
// assembles the 64-bit start and stop values from the four captured halves and
// stores their difference in ellapsed_cycles.
//
// Sequence: RDTSCP, then EDX -> cycles_high1 and EAX -> cycles_low1, then CPUID.
// NOTE(review): the trailing CPUID is there as a serializing instruction,
// following Intel's "How to Benchmark Code Execution Times" white paper;
// confirm against that document if the sequence is changed.
//
// Each instruction in the template must end with "\n\t" so the assembler sees
// one instruction per line.
static inline void stop_count()
{
    asm volatile(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t"
        : "=r" (cycles_high1), "=r" (cycles_low1)
        :
        // RDTSCP and CPUID overwrite these registers.
        : "%rax", "%rbx", "%rcx", "%rdx");
    // Widen the high half before shifting so the shift happens in 64 bits.
    start = ( ((uint64_t)cycles_high << 32) | cycles_low );
    stop = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );
    ellapsed_cycles = stop - start;
}

// matrix addition
#include <cstddef>
#include <memory>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>
using std::size_t;
// Dense n-by-m matrix of T held in one contiguous buffer.
// Element (i, j) is stored at offset i * m + j.
//
// The buffer is owned by a std::unique_ptr, so it is released automatically.
// Copy construction duplicates the buffer; move construction transfers it and
// leaves the source without storage (only destroy a moved-from Matrix).
// Assignment is not available because the dimensions are const.
template<class T>
class Matrix
{
public:
    // Allocates storage for n * m elements. Elements are default-initialized,
    // so for built-in types their values are indeterminate until written.
    Matrix(const size_t n, const size_t m)
    : elems_(new T[n*m]), n_(n), m_(m)
    {}

    // Builds an n-by-m matrix from nested vectors, where elems[i][j] becomes
    // element (i, j). elems must provide at least n rows of at least m values.
    // Every copied element is echoed to std::cout as a trace.
    Matrix(const size_t n, const size_t m, const std::vector< std::vector<T> >& elems)
    : elems_(new T[n*m]), n_(n), m_(m)
    {
        assert(n != 0 && m != 0);
        assert(elems.size() >= n_);
        for (size_t i = 0; i != n_; ++i)
        {
            assert(elems[i].size() >= m_);
            for (size_t j = 0; j != m_; ++j)
            {
                std::cout << "elems[" << i << ", " << j << "] = " << elems[i][j] << std::endl;
                // The row stride is the column count m_, matching operator().
                elems_[i*m_ + j] = elems[i][j];
            }
        }
    }

    // Deep copy: the new matrix gets its own buffer with the same contents.
    Matrix(const Matrix& other)
    : elems_(new T[other.n_ * other.m_]), n_(other.n_), m_(other.m_)
    {
        const size_t count = n_ * m_;
        for (size_t k = 0; k != count; ++k)
        {
            elems_[k] = other.elems_[k];
        }
    }

    // Transfers the buffer from the source instead of copying it.
    Matrix(Matrix&&) noexcept = default;

    Matrix& operator=(const Matrix&) = delete;

    // Mutable access to element (i, j); bounds are checked with assert only.
    T& operator()(const size_t i, size_t j)
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    // Read-only access to element (i, j); bounds are checked with assert only.
    const T& operator()(const size_t i, size_t j) const
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    // Writes the matrix as "[ row0 <newline>  row1 ... ]", each element in
    // fixed notation, width 6, precision 2, followed by one space.
    // The fixed/precision settings are left on the stream afterwards.
    // A matrix with no rows prints as "[  ]".
    friend std::ostream& operator<<(std::ostream& os, const Matrix& mat)
    {
        os << "[ ";
        os << std::fixed;
        os.precision(2);
        for (size_t i = 0; i != mat.n_; ++i)
        {
            // Rows after the first are indented to line up under the "[ ".
            if (i != 0)
                os << "  ";
            for (size_t j = 0; j != mat.m_; ++j)
            {
                // width() applies to the next insertion only, so set it each time.
                os.width(6);
                os << mat(i, j) << " ";
            }
            if (i + 1 != mat.n_)
                os << "\n";
        }
        os << " ]";
        return os;
    }

    // Number of rows.
    size_t n() const { return n_; }
    // Number of columns.
    size_t m() const { return m_; }
private:
    std::unique_ptr<T[]> elems_;
    const size_t n_;
    const size_t m_;
};

// Element-wise sum of two equally sized matrices.
// The row index drives the outer loop and the column index the inner loop,
// so successive inner iterations visit (r, c) and then (r, c + 1).
// Both operands must have the same dimensions (checked with assert).
template<class T>
Matrix<T> add_sequential(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());
    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> total(rows, cols);
    size_t r = 0;
    while (r != rows)
    {
        size_t c = 0;
        while (c != cols)
        {
            total(r, c) = mat1(r, c) + mat2(r, c);
            ++c;
        }
        ++r;
    }
    return total;
}
// Element-wise sum of two equally sized matrices.
// The column index drives the outer loop and the row index the inner loop,
// so successive inner iterations visit (r, c) and then (r + 1, c).
// Both operands must have the same dimensions (checked with assert).
template<class T>
Matrix<T> add_strided(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());
    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> total(rows, cols);
    size_t c = 0;
    while (c != cols)
    {
        size_t r = 0;
        while (r != rows)
        {
            total(r, c) = mat1(r, c) + mat2(r, c);
            ++r;
        }
        ++c;
    }
    return total;
}

// misc: making random matrices, flushing cache, running timing tests
// Returns an n-by-m matrix whose elements are drawn from a uniform real
// distribution constructed with the bounds -100.0 and 100.0.
// The engine and distribution are function-local statics, so successive calls
// (for the same T) continue one random sequence rather than restarting it.
// Elements are filled column by column: all rows of column 0, then column 1, ...
template<class T>
Matrix<T> rand_real_mat(const size_t n, const size_t m)
{
    static std::default_random_engine engine;
    static std::uniform_real_distribution<T> uniform(-100.0, 100.0);
    Matrix<T> result(n, m);
    size_t col = 0;
    while (col != m)
    {
        size_t row = 0;
        while (row != n)
        {
            result(row, col) = uniform(engine);
            ++row;
        }
        ++col;
    }
    return result;
}

#include <fstream>
// Pulls 30 KiB of random bytes through a scratch buffer and writes them to
// /dev/null. It is called before each timed run.
// NOTE(review): the intent appears to be displacing the benchmark operands
// from the data cache; 30 KiB is the original size and is smaller than many
// L2/L3 caches, so confirm it is large enough for the target machine.
//
// Unformatted read()/write() are used because formatted extraction (>>) stops
// at the first byte that is not part of a decimal number, which on a random
// byte stream means almost nothing is transferred.
// /dev/urandom is read rather than /dev/random so the call cannot stall
// waiting for entropy.
// If either device cannot be opened the function silently does nothing.
void flush_cache()
{
    std::ifstream rand("/dev/urandom", std::ifstream::binary);
    std::ofstream devnull("/dev/null", std::ofstream::binary);
    char scratch[30 * 1024];
    rand.read(scratch, sizeof scratch);
    // gcount() is the number of bytes actually read (0 if the read failed).
    devnull.write(scratch, rand.gcount());
}
// Times fnc on freshly generated 100x100 operands and returns one cycle count
// per trial.
//
// fnc      binary matrix function to measure (e.g. an addition routine).
// n_times  number of trials to run; the returned vector has n_times entries.
//
// Each trial builds two random matrices, calls flush_cache(), brackets the
// call to fnc with start_count()/stop_count(), and records ellapsed_cycles.
template<class R, class ElemType>
static inline std::vector<uint64_t> time_mat_fnc(
    R (fnc)(const Matrix<ElemType>&, const Matrix<ElemType>&),
    const size_t n_times)
{
    std::ofstream devnull("/dev/null", std::ofstream::binary);
    std::vector<uint64_t> times;
    times.reserve(n_times);
    static const size_t n = 100;
    static const size_t m = 100;
    // Run exactly the number of trials the caller asked for.
    for (size_t i = 0; i != n_times; ++i)
    {
        // create 2 random n x m matrices
        const auto m1 = rand_real_mat<ElemType>(n, m);
        const auto m2 = rand_real_mat<ElemType>(n, m);
        flush_cache();
        // addition
        start_count();
        const auto sum = fnc(m1, m2);
        stop_count();
        times.push_back(ellapsed_cycles);
        // prevent optimizing away unused result
        devnull << sum;
    }
    return times;
}
// Arithmetic mean of the elements of cntnr.
//
// The elements are accumulated in C::value_type and the sum is divided by the
// element count. The result type is whatever value_type / value_type yields,
// so integer containers use integer division.
// An empty container yields a value-initialized result (zero for arithmetic
// types) instead of dividing by zero.
template<typename C>
decltype(std::declval<typename C::value_type>()/std::declval<typename C::value_type>())
average(const C& cntnr)
{
    typedef typename C::value_type value_type;
    typedef decltype(std::declval<value_type>() / std::declval<value_type>()) result_type;

    value_type sum = 0;
    size_t size = 0;
    for (const value_type& item : cntnr)
    {
        sum += item;
        ++size;
    }
    if (size == 0)
    {
        return result_type();
    }
    // Convert the count to the result type before dividing. Dividing a signed
    // sum by an unsigned size_t would first convert the sum to unsigned and
    // give a wrong answer for negative sums.
    return sum / static_cast<result_type>(size);
}
// Benchmark driver: times add_strided and then add_sequential on double
// matrices, passing the trial count to time_mat_fnc, and prints the average
// cycle count of each.
int main()
{
    using Elem = double;
    const size_t n_trials = 10000;

    // The strided variant is measured first, then the sequential one.
    const std::vector<uint64_t> strided_samples = time_mat_fnc(add_strided<Elem>, n_trials);
    const std::vector<uint64_t> sequential_samples = time_mat_fnc(add_sequential<Elem>, n_trials);

    const auto strided_mean = average(strided_samples);
    const auto sequential_mean = average(sequential_samples);

    std::cout << "avg. cycles 100x100 doubles matrix addition" << std::endl;
    std::cout << "strided:    " << strided_mean << std::endl;
    std::cout << "sequential: " << sequential_mean << std::endl;
}