行列式计算函数的优化

Optimization of determinant calculation function

本文关键字：优化函数计算行列式更新时间：2023-10-16

在搜索最佳算法时，我发现有一个折衷方案：一方面是实现的复杂性和大常数，另一方面是运行时复杂性。我选择了基于LU分解的算法，因为它实现起来很简单，并且具有足够好的性能。

#include <valarray>
#include <vector>
#include <utility>
#include <cmath>
#include <cstddef>
#include <cassert>
template< typename value_type >
struct math
{
    using size_type = std::size_t;
    size_type const dimension_;
    value_type const & eps;
    value_type const zero = value_type(0);
    value_type const one = value_type(1);
private :
    using vector = std::valarray< value_type >;
    using matrix = std::vector< vector >;
    matrix matrix_;
    matrix minor_;
public :
    math(size_type const _dimension,
         value_type const & _eps)
        : dimension_(_dimension)
        , eps(_eps)
        , matrix_(dimension_)
        , minor_(dimension_ - 1)
    { 
        assert(1 < dimension_);
        assert(!(eps < zero));
        for (size_type r = 0; r < dimension_; ++r) {
            matrix_[r].resize(dimension_);
        }
        size_type const minor_size = dimension_ - 1;
        for (size_type r = 0; r < minor_size; ++r) {
            minor_[r].resize(minor_size);
        }
    }
    template< typename rhs = matrix >
    void
    operator = (rhs const & _matrix)
    {
        auto irow = std::begin(matrix_);
        for (auto const & row_ : _matrix) {
            auto icol = std::begin(*irow);
            for (auto const & v : row_) {
                *icol = v;
                ++icol;
            }
            ++irow;
        }
    }
    value_type
    det(matrix & _matrix,
        size_type const _dimension)
    { // calculates lower unit triangular matrix and upper triangular
        assert(0 < _dimension);
        value_type det_ = one;
        for (size_type i = 0; i < _dimension; ++i) {
            vector & ri_ = _matrix[i];
            using std::abs;
            value_type max_ = abs(ri_[i]);
            size_type pivot = i;
            {
                size_type p = i;
                while (++p < _dimension) {
                    value_type y_ = abs(_matrix[p][i]);
                    if (max_ < y_) {
                        max_ = std::move(y_);
                        pivot = p;
                    }
                }
            }
            if (!(eps < max_)) { // regular?
                return zero; // singular
            }
            if (pivot != i) {
                det_ = -det_; // each permutation flips sign of det
                ri_.swap(_matrix[pivot]);
            }
            value_type & dia_ = ri_[i];
            det_ *= dia_; // det is multiple of diagonal elements
            for (size_type j = 1 + i; j < _dimension; ++j) {
                _matrix[j][i] /= dia_;
            }
            for (size_type a = 1 + i; a < _dimension; ++a) {
                vector & a_ = minor_[a - 1];
                value_type const & ai_ = _matrix[a][i];
                for (size_type b = 1 + i; b < _dimension; ++b) {
                    a_[b - 1] = ai_ * ri_[b];
                }
            }
            for (size_type a = 1 + i; a < _dimension; ++a) {
                vector const & a_ = minor_[a - 1];
                vector & ra_ = _matrix[a];
                for (size_type b = 1 + i; b < _dimension; ++b) {
                    ra_[b] -= a_[b - 1];
                }
            }
        }
        return det_;
    }
    value_type
    det(size_type const _dimension)
    {
        return det(matrix_, _dimension);
    }
    value_type
    det()
    {
        return det(dimension_);
    }
};
// main.cpp
#include <iostream>
#include <cstdlib>
int
main()
{
    using value_type = double;
    value_type const eps = std::numeric_limits< value_type >::epsilon();
    std::size_t const dimension_ = 3;
    math< value_type > m(dimension_, eps);
    m = { // example from https://en.wikipedia.org/wiki/Determinant#Laplace.27s_formula_and_the_adjugate_matrix
            {-2.0, 2.0, -3.0},
            {-1.0, 1.0,  3.0},
            { 2.0, 0.0, -1.0}
        };
    std::cout << m.det() << std::endl; // 18
    return EXIT_SUCCESS;
}

现场演示

det()函数是算法中最热门的函数，将其作为算法的一部分。我确信det()没有它能做到的那么快，因为运行时性能比较（使用google pprof）与整个算法的参考实现显示出对det()的不均衡。

如何提高det()函数的性能？哪些明显的优化可以立即应用？我应该更改索引和内存访问顺序还是其他什么？容器类型？预取？

dimension_的典型值在3到10的范围内（但如果value_type是mpfr或其他值，则可以是100）。

您的（来自det()的片段）不是吗

       for (size_type a = 1 + i; a < _dimension; ++a) {
            vector & a_ = minor_[a - 1];
            value_type const & ai_ = _matrix[a][i];
            for (size_type b = 1 + i; b < _dimension; ++b) {
                a_[b - 1] = ai_ * ri_[b];
            }
        }
        for (size_type a = 1 + i; a < _dimension; ++a) {
            vector const & a_ = minor_[a - 1];
            vector & ra_ = _matrix[a];
            for (size_type b = 1 + i; b < _dimension; ++b) {
                ra_[b] -= a_[b - 1];
            }
        }

做与相同的事情

       for (size_type a = 1 + i; a < _dimension; ++a) {
            vector & ra_ = _matrix[a];
            value_type ai_ = ra_[i];
            for (size_type b = 1 + i; b < _dimension; ++b) {
                ra_[b] -= ai_ * ri_[b];
            }
        }

而不需要minor_？此外，现在可以很容易地对内部循环进行矢量化。