因为一行不相关的代码，速度相差那么大

Why does one line of unrelated code make such a big difference in speed?

本文关键字：速度、代码、不相关、一行、因为　　更新时间：2023-10-16

为什么仅仅通过预处理器宏多编译进一行代码，速度就会相差这么大？

#include <stdio.h>
#include <time.h>
#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <list>
#include <set>
#include <vector>

#ifndef LINUX
#include <windows.h>
#else
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
#endif
using namespace std;
const int SIX_MILLION = 6000000;
const int ONE_MILLION = 1000000;
/// Returns a pseudo-random value assembled from two successive rand() draws.
///
/// The low 16 bits of the first draw become bits 16..31 of the result and the
/// low 16 bits of the second draw become bits 0..15, so the result never
/// exceeds 0xFFFFFFFF.
unsigned long long randint()
{
    // Draw in two separate statements: as operands of a single `|` expression
    // the two rand() calls have no defined relative order, so the value
    // produced for a given seed could differ between compilers.
    const unsigned long long high = static_cast<unsigned long long>(rand() & 0xFFFF);
    const unsigned long long low = static_cast<unsigned long long>(rand() & 0xFFFF);
    return (high << 16) | low;
}
#ifdef LINUX
/// Stores `end - begin` in `result`, normalised so that tv_usec is not negative.
///
/// @param result receives the difference; left untouched when an error is returned.
/// @param begin  earlier timestamp.
/// @param end    later timestamp.
/// @return 0 on success, -1 if begin's seconds exceed end's, -2 if the seconds
///         are equal but begin's microseconds exceed end's.
int time_substract(struct timeval *result, struct timeval *begin, struct timeval *end)
{
    if (begin->tv_sec > end->tv_sec) {
        return -1;
    }
    if (begin->tv_sec == end->tv_sec && begin->tv_usec > end->tv_usec) {
        return -2;
    }

    // A smaller microsecond field in `end` means one second has to be borrowed.
    const bool borrow = end->tv_usec < begin->tv_usec;
    result->tv_sec = end->tv_sec - begin->tv_sec - (borrow ? 1 : 0);
    result->tv_usec = end->tv_usec - begin->tv_usec + (borrow ? ONE_MILLION : 0);
    return 0;
}
#endif
/// Runs `func` `times` times and returns the median duration of one run, in
/// microseconds.
///
/// For an even `times` the upper of the two middle samples is returned.
///
/// @param func  callable to measure; invoked exactly `times` times.
/// @param times number of measured runs (default 3).
/// @return median run time in microseconds, or 0.0 when `times` <= 0, in which
///         case `func` is never invoked.
double time_it(std::function<void()> func, int times = 3)
{
    // Without any run there is no sample to take a median of.
    if (times <= 0) {
        return 0.0;
    }

    // steady_clock is monotonic, so a wall-clock adjustment during a run cannot
    // produce a negative or stale sample, and it is available on every platform.
    using bench_clock = std::chrono::steady_clock;

    std::vector<double> samples;
    samples.reserve(static_cast<std::vector<double>::size_type>(times));
    for (int run = 0; run < times; ++run) {
        const bench_clock::time_point started = bench_clock::now();
        func();
        const bench_clock::time_point finished = bench_clock::now();
        samples.push_back(
            std::chrono::duration<double, std::micro>(finished - started).count());
    }

    // Only the middle element is needed, so a partial ordering is enough.
    const std::vector<double>::iterator median = samples.begin() + (times / 2);
    std::nth_element(samples.begin(), median, samples.end());
    return *median;
}
size_t prepare_data(set<unsigned long long> &data, unsigned int size, bool strict = false) {
data.clear();
for (unsigned int i = 0; i < size; i++) {
data.insert(strict ? i * 3 : randint());
}
return data.size();
}
/// Benchmark driver: builds two sets of SIX_MILLION / 2 insertions each, times
/// their intersection with time_it() and prints the measured duration together
/// with the three container sizes.
///
/// When TEST is defined, the first set is filled with SIX_MILLION random values
/// beforehand; that content is discarded again by the next prepare_data() call,
/// which clears the set before refilling it.
int main()
{
    srand(static_cast<unsigned int>(time(nullptr)));

    std::set<unsigned long long> first_set;
    std::set<unsigned long long> second_set;
    // Output buffer for the intersection, allocated before either set is filled.
    std::vector<unsigned long long> common(SIX_MILLION);

#ifdef TEST
    prepare_data(first_set, SIX_MILLION);
#endif
    prepare_data(first_set, SIX_MILLION / 2, true);
    prepare_data(second_set, SIX_MILLION / 2);

    const double elapsed = time_it([&]() {
        const auto last = std::set_intersection(first_set.begin(), first_set.end(),
                                                second_set.begin(), second_set.end(),
                                                common.begin());
        // Drop the unused tail so that size() reports the intersection size.
        common.erase(last, common.end());
    });

    std::cout << "duration: " << elapsed << " microseconds,set a size: "
              << first_set.size() << " set b size: " << second_set.size()
              << " set result size: " << common.size() << std::endl;
    return 0;
}

ubuntu@host:~/test_intersection$ g++ -std=c++11 -O3 -DLINUX main1.cpp -o main1
ubuntu@host:~/test_intersection$ ./main1
duration: 62080 microseconds,set a size: 2998917 set b size: 3000000 set result size: 2087
ubuntu@host:~/test_intersection$ g++ -std=c++11 -O3 -DLINUX -DTEST main1.cpp -o main1
ubuntu@host:~/test_intersection$ ./main1
duration: 362546 microseconds,set a size: 2998985 set b size: 3000000 set result size: 2149

我在 DigitalOcean 托管的 Ubuntu droplet 上得到了与您相同的结果。这台机器的内存相当有限：在运行测试之前，它大约有 220MB 的空闲内存（以下是 /usr/bin/free -tm 的输出）：

# free -tm
total        used        free      shared  buff/cache   available
Mem:            493         153         220          14         119         298
Swap:             0           0           0
Total:          493         153         220

运行较慢的那个测试时，可以看到空闲内存几乎被完全耗尽：

# free -tm
total        used        free      shared  buff/cache   available
Mem:            493         383          10          14          99          69
Swap:             0           0           0
Total:          493         383          10

以防万一clear()方法在内部保留所有内存，我尝试了：

data = std::move( std::set<unsigned long long>() );

但这几乎没有区别。

所以我最初的怀疑之一是：std::set 这类数据结构在构建树时会执行大量的小块内存分配，随后又以不确定的顺序（取决于节点在树中的排列）释放它们，从而把内存搅乱了。

为了模拟这一点，我将TEST部分替换为执行大量分配的代码，然后以不同的顺序释放它们(通过使用质数步幅跳过列表)。

#ifdef TEST
// Stand-in for the warm-up call that is commented out below: allocate many
// small blocks, then free all of them in a scattered order.
//prepare_data(a, SIX_MILLION);
{
// One slot per allocation so that every block can be freed afterwards.
std::vector<void*> mem(SIX_MILLION);
// 24-byte blocks; the malloc() result is not checked for failure.
for( auto & val : mem ) val = malloc(24);
// Walk the slots with a fixed stride modulo SIX_MILLION. The stride shares no
// factor with 6,000,000 (= 2^7 * 3 * 5^6), so p takes every index exactly once
// and each block is freed exactly once, in a non-sequential order.
for( int i=0, p=0, step=499739; i < SIX_MILLION; i++) {
p = (p + step ) % SIX_MILLION;
free(mem[p]);
}
}
#endif

分配24 个字节足以对系统上的内存分配器施加压力，从而导致与您描述的结果类似的结果。我发现，如果我以更可预测的顺序释放值(即从第一个到最后一个遍历)，这对性能没有同样的影响。

所以我想说，对此的最终解释是，你是内存碎片的受害者。你用大量的小分配填满了你的内存，然后以随机顺序释放它们。然后，您构建了新的数据集，由于分配系统捉襟见肘，这些数据集的缓存局部性较差。在计算这两个数据集的昂贵交集时，这对性能产生了可衡量的严重影响。