用两种不同的方法在 C++ 中将两个矩阵相加
adding two matrices in c++ in two different ways
我只是想知道这两种场景之间是否存在差异
第一段代码中,外层 for 循环遍历行,内层 for 循环遍历列
第二段代码中,外层 for 循环遍历列,内层 for 循环遍历行
我将两者应用并得到相同的结果
您会得到相同的结果,但可能会得到不同的性能。
矩阵很可能按行主序(row-major)存储,按行访问可能会获得更好的内存带宽和缓存利用率。可以试着对很大的矩阵做同样的操作,并测量实际耗时(wall time)。
http://en.wikipedia.org/wiki/Row-major_order
下面是如何计时。首先是我的结果。报告的数字是10000次试验的平均CPU时钟周期。
$ clang++ -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided: 60149
sequential: 27137
$ g++-4.9 -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided: 90517
sequential: 33407
顺序访问速度更快。原因是缓存行为,特别是缓存行。这是关于这个主题的有趣读物。
http://igoro.com/archive/gallery-of-processor-cache-effects/
我区分跨步和顺序,而不是行和列,因为行和列是任意的。通常在C++中,我们认为顺序元素在同一行中,但这纯粹是惯例,不是语言固有的。不同的库遵循不同的约定。
测试代码。
// timing
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
#include <stdlib.h>
#include <cstdint>
// Scratch state shared by start_count() and stop_count().
// Full 64-bit timestamp readings, assembled from the high/low halves in stop_count().
uint64_t start;
uint64_t stop;
// Upper / lower 32 bits of the timestamp captured by start_count().
unsigned cycles_high;
unsigned cycles_low;
// Upper / lower 32 bits of the timestamp captured by stop_count().
unsigned cycles_high1;
unsigned cycles_low1;
// stop - start, written by stop_count(). Kept 64 bits wide so that the
// difference of two 64-bit timestamps is not truncated to its low 32 bits.
uint64_t ellapsed_cycles;
/// Captures the starting timestamp into cycles_high / cycles_low.
///
/// Follows the sequence from Intel's "How to Benchmark Code Execution Times
/// on Intel IA-32 and IA-64" white paper: CPUID is issued first as a
/// serializing instruction, then RDTSC reads the time-stamp counter into
/// EDX:EAX, and the two halves are moved into the output operands.
/// x86-64 only (GCC/Clang extended inline assembly).
static inline void start_count()
{
    // Each instruction must be terminated with "\n\t"; without the separator
    // the assembler sees a single bogus mnemonic.
    asm volatile(
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        : "=r" (cycles_high), "=r" (cycles_low)
        :
        // CPUID overwrites EAX, EBX, ECX and EDX.
        : "%rax", "%rbx", "%rcx", "%rdx");
}
/// Captures the ending timestamp and publishes the elapsed cycle count.
///
/// Reads the time-stamp counter with RDTSCP, stores EDX:EAX into
/// cycles_high1 / cycles_low1, then issues CPUID (sequence taken from Intel's
/// "How to Benchmark Code Execution Times on Intel IA-32 and IA-64" white
/// paper). Afterwards the globals `start`, `stop` and `ellapsed_cycles` are
/// recomputed from the halves recorded here and by start_count().
/// x86-64 only (GCC/Clang extended inline assembly).
static inline void stop_count()
{
    // Each instruction must be terminated with "\n\t"; without the separator
    // the assembler sees a single bogus mnemonic.
    asm volatile(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t"
        : "=r" (cycles_high1), "=r" (cycles_low1)
        :
        // RDTSCP writes EAX/EDX/ECX; CPUID overwrites EAX, EBX, ECX and EDX.
        : "%rax", "%rbx", "%rcx", "%rdx");

    // Reassemble the two 64-bit timestamps from their 32-bit halves.
    start = (static_cast<uint64_t>(cycles_high) << 32) | cycles_low;
    stop = (static_cast<uint64_t>(cycles_high1) << 32) | cycles_low1;
    ellapsed_cycles = stop - start;
}
// matrix addition
#include <cstddef>
#include <memory>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>
using std::size_t;
/// Dense n x m matrix stored contiguously in row-major order: element (i, j)
/// lives at offset i * m + j of a single heap array.
///
/// Ownership of the array is held by a std::unique_ptr, copy construction
/// performs a deep copy and move construction transfers the buffer, so
/// returning or copying a Matrix never leads to a double delete. Assignment
/// is unavailable because the dimensions are const.
template<class T>
class Matrix
{
public:
    /// Allocates an n x m matrix. Elements are default-initialized, which for
    /// arithmetic T means they hold indeterminate values until written.
    Matrix(const size_t n, const size_t m)
        : elems_(new T[n*m]), n_(n), m_(m)
    {}

    /// Builds an n x m matrix from nested vectors, where elems[i][j] becomes
    /// element (i, j). Requires n != 0, m != 0, at least n rows and at least
    /// m entries per row (checked with assert).
    Matrix(const size_t n, const size_t m, const std::vector< std::vector<T> >& elems)
        : elems_(new T[n*m]), n_(n), m_(m)
    {
        assert(n != 0 && m != 0);
        assert(elems.size() >= n);
        for (size_t i = 0; i != n_; ++i)
        {
            assert(elems[i].size() >= m);
            for (size_t j = 0; j != m_; ++j)
            {
                // Trace of every copied element, labelled with its (row, column).
                std::cout << "elems[" << i << ", " << j << "] = " << elems[i][j] << std::endl;
                // The row stride is the column count m_, matching operator().
                elems_[i*m_ + j] = elems[i][j];
            }
        }
    }

    /// Deep copy: allocates a fresh array and copies every element.
    Matrix(const Matrix& other)
        : elems_(new T[other.n_*other.m_]), n_(other.n_), m_(other.m_)
    {
        const size_t count = n_ * m_;
        for (size_t k = 0; k != count; ++k)
        {
            elems_[k] = other.elems_[k];
        }
    }

    /// Transfers the buffer; the moved-from matrix may only be destroyed.
    Matrix(Matrix&& other) noexcept = default;

    /// Mutable access to element (i, j); indices are checked with assert.
    T& operator()(const size_t i, size_t j)
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    /// Read-only access to element (i, j); indices are checked with assert.
    const T& operator()(const size_t i, size_t j) const
    {
        assert(i < n_ && j < m_);
        return elems_[i*m_ + j];
    }

    /// Writes the matrix as "[ ... ]", one row per line, every element in
    /// fixed notation with width 6 and precision 2 followed by a space.
    /// Rows after the first are prefixed with a single space. The stream's
    /// format flags and precision are restored before returning.
    friend std::ostream& operator<<(std::ostream& os, const Matrix& mat)
    {
        const std::ios_base::fmtflags saved_flags = os.flags();
        const std::streamsize saved_precision = os.precision();

        os << "[ ";
        for (size_t i = 0; i != mat.n_; ++i)
        {
            if (i != 0)
                os << " ";
            for (size_t j = 0; j != mat.m_; ++j)
            {
                // width() resets after each insertion, so set it per element.
                os << std::fixed;
                os.width(6);
                os.precision(2);
                os << mat(i, j) << " ";
            }
            // Newline between rows, but not after the last one.
            if (i + 1 != mat.n_)
                os << "\n";
        }
        os << " ]";

        os.flags(saved_flags);
        os.precision(saved_precision);
        return os;
    }

    /// Number of rows.
    size_t n() const { return n_; }
    /// Number of columns.
    size_t m() const { return m_; }

private:
    std::unique_ptr<T[]> elems_;  // n_ * m_ elements, row-major
    const size_t n_;              // rows
    const size_t m_;              // columns
};
/// Element-wise sum of two matrices of identical dimensions (checked with
/// assert). The outer loop walks rows and the inner loop walks columns, so
/// successive inner iterations address (i, j), (i, j + 1), ...
template<class T>
Matrix<T> add_sequential(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());

    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);

    for (size_t r = 0; r < rows; ++r)
        for (size_t c = 0; c < cols; ++c)
            result(r, c) = mat1(r, c) + mat2(r, c);

    return result;
}
/// Element-wise sum of two matrices of identical dimensions (checked with
/// assert). The outer loop walks columns and the inner loop walks rows, so
/// successive inner iterations address (i, j), (i + 1, j), ...
template<class T>
Matrix<T> add_strided(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());

    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);

    for (size_t c = 0; c < cols; ++c)
        for (size_t r = 0; r < rows; ++r)
            result(r, c) = mat1(r, c) + mat2(r, c);

    return result;
}
// misc: making random matrices, flushing cache, running timing tests
/// Returns an n x m matrix whose elements are drawn from a uniform real
/// distribution constructed with bounds (-100.0, 100.0). The engine and
/// distribution are function-local statics, so successive calls continue the
/// same random sequence. Elements are assigned column by column.
template<class T>
Matrix<T> rand_real_mat(const size_t n, const size_t m)
{
    static std::default_random_engine engine;
    static std::uniform_real_distribution<T> uniform(-100.0, 100.0);

    Matrix<T> result(n, m);
    for (size_t col = 0; col < m; ++col)
        for (size_t row = 0; row < n; ++row)
            result(row, col) = uniform(engine);

    return result;
}
#include <fstream>
/// Best-effort cache disturbance between timed runs: pulls 30 KiB of raw
/// bytes from /dev/random through a local buffer and writes them to
/// /dev/null.
///
/// Unformatted read()/write() is used on purpose: formatted extraction
/// (`stream >> int`) parses text, fails on the first non-numeric byte and
/// turns every later extraction into a no-op.
///
/// NOTE(review): 30 KiB is presumably meant to match an L1 data cache; larger
/// cache levels are not evicted by this — confirm that is sufficient.
/// NOTE(review): reading /dev/random can block on some systems — confirm, or
/// switch the source to /dev/urandom.
void flush_cache()
{
    std::ifstream random_source("/dev/random", std::ifstream::binary);
    std::ofstream devnull("/dev/null", std::ofstream::binary);

    // Same byte count as before: (30 * 1024 / sizeof(int)) ints.
    char buffer[30 * 1024] = {};
    random_source.read(buffer, sizeof(buffer));
    // Forward only what was actually read (0 if the source could not be opened).
    devnull.write(buffer, random_source.gcount());
}
/// Times `fnc` on freshly generated 100 x 100 random matrices.
///
/// @param fnc      Binary matrix function to measure (e.g. add_strided).
/// @param n_times  Number of trials to run.
/// @return One elapsed-cycle measurement per trial, in trial order.
///
/// Each trial builds two random operands, calls flush_cache(), then brackets
/// the single call to `fnc` with start_count() / stop_count().
template<class R, class ElemType>
static inline std::vector<uint64_t> time_mat_fnc(
    R (fnc)(const Matrix<ElemType>&, const Matrix<ElemType>&),
    const size_t n_times)
{
    std::ofstream devnull("/dev/null", std::ofstream::binary);
    std::vector<uint64_t> times;
    times.reserve(n_times);
    static const size_t n = 100;
    static const size_t m = 100;
    // Run exactly the requested number of trials (matches the reserve above).
    for (size_t i = 0; i != n_times; ++i)
    {
        // create 2 random n x m matrices
        const auto m1 = rand_real_mat<ElemType>(n, m);
        const auto m2 = rand_real_mat<ElemType>(n, m);
        flush_cache();
        // addition
        start_count();
        const auto sum = fnc(m1, m2);
        stop_count();
        times.push_back(ellapsed_cycles);
        // prevent optimizing away unused result
        devnull << sum;
    }
    return times;
}
/// Arithmetic mean of the elements of a container.
///
/// @param cntnr  Any container exposing value_type and begin()/end().
/// @return sum / count, computed in value_type (so integer element types use
///         integer division); a value-initialized result (zero for
///         arithmetic types) when the container is empty.
template<typename C>
decltype(std::declval<typename C::value_type>()/std::declval<typename C::value_type>())
average(const C& cntnr)
{
    typedef typename C::value_type Value;
    typedef decltype(std::declval<Value>()/std::declval<Value>()) Result;

    Value sum = 0;
    size_t size = 0;
    for (const auto& element : cntnr)
    {
        sum += element;
        ++size;
    }

    // Avoid dividing by zero for an empty container.
    if (size == 0)
        return Result();

    // Divide in the element type: mixing a signed sum with the unsigned
    // size_t count would convert a negative sum to a huge unsigned value.
    return sum / static_cast<Value>(size);
}
/// Benchmarks add_strided, then add_sequential, on double matrices and prints
/// the average cycle count of each.
int main()
{
    using ElemType = double;
    const size_t trials = 10000;

    // Strided runs first, sequential second.
    const auto strided_times = time_mat_fnc(add_strided<ElemType>, trials);
    const auto sequential_times = time_mat_fnc(add_sequential<ElemType>, trials);

    std::cout << "avg. cycles 100x100 doubles matrix addition" << std::endl;
    std::cout << "strided: " << average(strided_times) << std::endl;
    std::cout << "sequential: " << average(sequential_times) << std::endl;
}
相关文章:
- 如何在C++中从两个单独的for循环中添加两个数组
- 为什么Mat类的两个对象可以在不重载运算符+的情况下添加
- 运算符重载 (+),用于添加两个具有 C++ 的数组
- oStream 不打印添加两个 valarray 的结果(使用运算符重载)
- 在 char* 数组中添加两个索引
- 是否可以在 for 循环中添加两个浮点数?
- 如何在Qt中合并/追加/添加两个用于线程的模型?
- 将两个数字添加为链表
- 试图添加两个矢量,但添加顺序错误
- 如何在 c++ 中添加两个大的双精度数字
- 如何将值添加到嵌套结构中,该结构在C++中有两个指针
- 有没有更好的方法来添加两个智能指针?
- 如何调用运算符函数添加两个对象?
- C++添加检查以避免读取两个参数
- 添加两个二进制数(整数数组)
- 为什么我在尝试添加两个链表时出现此错误?
- 是否可以在C++中将两个函数一起添加
- 错误 :"+" 无法添加两个指针
- 添加一个节点,并在通用树中的两个给定节点之间找到路径成本,其中c 中的儿童列表
- 链接列表C 类,这两个添加节点实现之间的差异是什么?