矢量填充比创建新矢量慢

Vector refilling is slower than creating a new one?

本文关键字：新矢量创建填充更新时间：2023-10-16

我试图使用Simplex算法实现一个优化器。原始代码在线创建一个新的向量，在每次迭代中具有0个初始值。我尝试在循环之外创建一个公共，然后在每次迭代中使用std::fill重置值。我很惊讶第一个比第二个快。在我看来，声明无论如何都需要请求内存并初始化值，速度再快不过了。

有人能帮忙解释一下吗？如果这是真的，那么第一种方法有什么缺点吗？或者我们可以进一步改进吗？

这是代码。

新产品：

void Simplex(std::vector<double>& result, std::function<double(std::vector<double>)> func,
std::vector<double> init, std::vector<std::vector<double>> x = std::vector<std::vector<double>>(),
double EPS = 1E8 * std::numeric_limits<double>::epsilon(), int MAXIT = 1000000)
{
int N = init.size();                                //  Space Dimension
//  Coefficients for the new points.
const double a = 1.0;       //  a: Reflection
const double b = 1.0;       //  b: Expansion
const double g = 0.5;       //  g: Contraction
const double h = 0.5;       //  h: Multi-Contraction
std::vector<double> xcentroid_old(N, 0);    //  Old Simplex Centroid * (N + 1)
std::vector<double> xcentroid_new(N, 0);    //  New Simplex Centroid * (N + 1)
std::vector<double> vf(N + 1, 0);           //  Values at Simplex Vertexes       
int x1 = 0;                 //  Index of smallest vertex.
int xn = 0;                 //  Index of second greatest vertex.
int xnp1 = 0;               //  Index of greatest vertex.
int countIT = 0;                //  Iteration Count
//  If no initial simplex is specified, construct the trial simplex.
if (x.size() == 0)
{
std::vector<double> del(init);
//  del = init / 20
std::transform(del.begin(), del.end(), del.begin(),
std::bind2nd(std::divides<double>(), 20));
for (int i = 0; i < N; i++)
{
std::vector<double> tmp(init);
tmp[i] += del[i];
x.push_back(tmp);
}
x.push_back(init);
// Calculate the xcentriod.
std::transform(init.begin(), init.end(), xcentroid_old.begin(),
std::bind2nd(std::multiplies<double>(), N + 1));
}
std::vector<double> xg(N);
std::vector<double> xr(N);
std::vector<double> xe(N);
std::vector<double> xc(N);
//  Optimization starts.
for (countIT = 0; countIT < MAXIT; countIT++)
{
for (int i = 0; i < N + 1; i++)
vf[i] = func(x[i]);
// Find index of max, second max, min of vf.
x1 = 0; xn = 0; xnp1 = 0;
for (int i = 0; i < vf.size(); i++)
{
if (vf[i] < vf[x1])
x1 = i;
if (vf[i] > vf[xnp1])
xnp1 = i;
}
xn = x1;
for (int i = 0; i < vf.size(); i++)
{
if (vf[i] < vf[xnp1] && vf[i] > vf[xn])
xn = i;
}
//  xg: Centroid of the N best vertexes.
std::fill(xg.begin(), xg.end(), 0);
for (int i = 0; i < x.size(); i++)
{
if (i != xnp1)
std::transform(xg.begin(), xg.end(), x[i].begin(), xg.begin(), std::plus<double>());
}
std::transform(xg.begin(), xg.end(),
x[xnp1].begin(), xcentroid_new.begin(), std::plus<double>());
std::transform(xg.begin(), xg.end(), xg.begin(),
std::bind2nd(std::divides<double>(), N));
//  Termination condition: change (sum of absolute differences on all dimensions)
//  of simplex centroid is less than EPS.
double diff = 0;
for (int i = 0; i < N; i++)
diff += fabs(xcentroid_old[i] - xcentroid_new[i]);
if (diff / N < EPS)
break;
else
xcentroid_old.swap(xcentroid_new);
//  Reflection
std::fill(xr.begin(), xr.end(), 0);
for (int i = 0; i < N; i++)
xr[i] = xg[i] + a * (xg[i] - x[xnp1][i]);
double fxr = func(xr);
if (vf[x1] <= fxr && fxr <= vf[xn])
//  If f(x1) <= f(xr) <= f(xn), update xnp1 to xr.
std::copy(xr.begin(), xr.end(), x[xnp1].begin());
else if (fxr < vf[x1])
{
//  If f(xr) < f(x1), expansion.
std::fill(xe.begin(), xe.end(), 0);
for (int i = 0; i<N; i++)
xe[i] = xr[i] + b * (xr[i] - xg[i]);
//  Update xnp1 to the better one of xr or xe.
if (func(xe) < fxr)
std::copy(xe.begin(), xe.end(), x[xnp1].begin());
else
std::copy(xr.begin(), xr.end(), x[xnp1].begin());
}
else if (fxr > vf[xn])
{
//  If f(xr) > f(xn), contraction.
std::fill(xc.begin(), xc.end(), 0);
for (int i = 0; i < N; i++)
xc[i] = xg[i] + g * (x[xnp1][i] - xg[i]);
if (func(xc) < vf[xnp1])
//  If f(xc) < f(xnp1), update xnp1 to xc.
std::copy(xc.begin(), xc.end(), x[xnp1].begin());
else
{
//  If f(xc) >= f(xnp1), multi-contraction.
for (int i = 0; i < x.size(); i++)
{
if (i != x1)
{
for (int j = 0; j < N; j++)
x[i][j] = x[x1][j] + h * (x[i][j] - x[x1][j]);
}
}
}
}
}
if (countIT == MAXIT)
throw std::invalid_argument("Iteration limit achieves, result may not be optimal.");
result = x[x1];
}

原件：

void Simplex_Original(std::vector<double>& result, std::function<double(std::vector<double>)> func,
std::vector<double> init, std::vector<std::vector<double>> x = std::vector<std::vector<double>>(),
double EPS = 1E8 * std::numeric_limits<double>::epsilon(), int MAXIT = 1000000)
{
int N = init.size();                                //  Space Dimension
//  Coefficients for the new points.
const double a = 1.0;       //  a: Reflection
const double b = 1.0;       //  b: Expansion
const double g = 0.5;       //  g: Contraction
const double h = 0.5;       //  h: Multi-Contraction
std::vector<double> xcentroid_old(N, 0);    //  Old Simplex Centroid * (N + 1)
std::vector<double> xcentroid_new(N, 0);    //  New Simplex Centroid * (N + 1)
std::vector<double> vf(N + 1, 0);           //  Values at Simplex Vertexes       
int x1 = 0;                 //  Index of smallest vertex.
int xn = 0;                 //  Index of second greatest vertex.
int xnp1 = 0;               //  Index of greatest vertex.
int countIT = 0;                //  Iteration Count
//  If no initial simplex is specified, construct the trial simplex.
if (x.size() == 0)
{
std::vector<double> del(init);
//  del = init / 20
std::transform(del.begin(), del.end(), del.begin(),
std::bind2nd(std::divides<double>(), 20));
for (int i = 0; i < N; i++)
{
std::vector<double> tmp(init);
tmp[i] += del[i];
x.push_back(tmp);
}
x.push_back(init);
// Calculate the xcentriod.
std::transform(init.begin(), init.end(), xcentroid_old.begin(),
std::bind2nd(std::multiplies<double>(), N + 1));
}
//  Optimization starts.
for (countIT = 0; countIT < MAXIT; countIT++)
{
for (int i = 0; i < N + 1; i++)
vf[i] = func(x[i]);
// Find index of max, second max, min of vf.
x1 = 0; xn = 0; xnp1 = 0;
for (int i = 0; i < vf.size(); i++)
{
if (vf[i] < vf[x1])
x1 = i;
if (vf[i] > vf[xnp1])
xnp1 = i;
}
xn = x1;
for (int i = 0; i < vf.size(); i++)
{
if (vf[i] < vf[xnp1] && vf[i] > vf[xn])
xn = i;
}
//  xg: Centroid of the N best vertexes.
std::vector<double> xg(N, 0);
for (int i = 0; i < x.size(); i++)
{
if (i != xnp1)
std::transform(xg.begin(), xg.end(), x[i].begin(), xg.begin(), std::plus<double>());
}
std::transform(xg.begin(), xg.end(),
x[xnp1].begin(), xcentroid_new.begin(), std::plus<double>());
std::transform(xg.begin(), xg.end(), xg.begin(),
std::bind2nd(std::divides<double>(), N));
//  Termination condition: change (sum of absolute differences on all dimensions)
//  of simplex centroid is less than EPS.
double diff = 0;
for (int i = 0; i < N; i++)
diff += fabs(xcentroid_old[i] - xcentroid_new[i]);
if (diff / N < EPS)
break;
else
xcentroid_old.swap(xcentroid_new);
//  Reflection
std::vector<double> xr(N, 0);
for (int i = 0; i < N; i++)
xr[i] = xg[i] + a * (xg[i] - x[xnp1][i]);
double fxr = func(xr);
if (vf[x1] <= fxr && fxr <= vf[xn])
//  If f(x1) <= f(xr) <= f(xn), update xnp1 to xr.
std::copy(xr.begin(), xr.end(), x[xnp1].begin());
else if (fxr < vf[x1])
{
//  If f(xr) < f(x1), expansion.
std::vector<double> xe(N, 0);
for (int i = 0; i<N; i++)
xe[i] = xr[i] + b * (xr[i] - xg[i]);
//  Update xnp1 to the better one of xr or xe.
if (func(xe) < fxr)
std::copy(xe.begin(), xe.end(), x[xnp1].begin());
else
std::copy(xr.begin(), xr.end(), x[xnp1].begin());
}
else if (fxr > vf[xn])
{
//  If f(xr) > f(xn), contraction.
std::vector<double> xc(N, 0);
for (int i = 0; i < N; i++)
xc[i] = xg[i] + g * (x[xnp1][i] - xg[i]);
if (func(xc) < vf[xnp1])
//  If f(xc) < f(xnp1), update xnp1 to xc.
std::copy(xc.begin(), xc.end(), x[xnp1].begin());
else
{
//  If f(xc) >= f(xnp1), multi-contraction.
for (int i = 0; i < x.size(); i++)
{
if (i != x1)
{
for (int j = 0; j < N; j++)
x[i][j] = x[x1][j] + h * (x[i][j] - x[x1][j]);
}
}
}
}
}
if (countIT == MAXIT)
throw std::invalid_argument("Iteration limit achieves, result may not be optimal.");
result = x[x1];
}

测试功能：

double func(vector<double> x)
{
return (x[0] * x[0] + x[1] * x[1]) * (x[0] * x[0] + x[1] * x[1]) - (x[0] - 3 * x[1]) * (x[0] - 3 * x[1]);
}
void main()
{
int m = 1000, n = 10;
double dz = 0.1 / m / n;
vector<double> init(2), result(2);
init[0] = 3;    init[1] = 3;
clock_t t1;
t1 = clock();
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
init[0] += dz;
Optimizer::Simplex_Original(result, func, init);
}
}
cout << "Old:" << 't' << float(clock() - t1) / CLOCKS_PER_SEC << endl;
cout << result[0] << 't' << result[1] << endl;
init[0] = 3;    init[1] = 3;
t1 = clock();
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
init[0] += dz;
Optimizer::Simplex(result, func, init);
}
}
cout << "New:" << 't' << float(clock() - t1) / CLOCKS_PER_SEC << endl;
cout << result[0] << 't' << result[1] << endl;
}

我使用VS 2013发布模式，O2打开。

对于原始版本，10000次重复的成本约为9秒，但对于新版本，成本为13秒。

我认为向量构造函数知道内存是连续的，将能够更好地优化设置其内容。它可能只是对整个区域进行memset，或者类似的操作，而std:：fill不知道它正在访问什么类型的容器，所以必须通过为每个元素增加迭代器并单独写入每个元素来迭代所有元素。。。

对大型复杂函数的微小更改可以触发优化代码中各种令人惊讶的性能变化(好的或坏的)。处理器很复杂，很难预测什么会更快或更慢。

也就是说，分配零填充内存可能比将已经分配的块设置为零更快。

考虑：

std::vector<double> xg(N, 0);

如果0.0在您的平台上用所有零位表示(很可能)，那么这归结为分配一个充满零的内存块。事实证明，大多数虚拟内存操作系统都可以很容易地分配已经清零的内存。事实上，操作系统可能有一个线程在另一个内核上运行，其唯一目的是将未使用的内存块清零，以便它们可用。当分配内存时，它已经是一堆零了，所以在分配器中没有额外的工作要做。

比较：

std::fill(xg.begin(), xg.end(), 0);

这就是使用你核心上的线程来清空内存块。您实际上已经失去了让操作系统提前在自己的线程上为您完成这项工作的并发性。

虽然分配会带来性能成本，而且如果你经常这样做，通常应该将其视为潜在的瓶颈，但在衡量之前，你无法真正知道。