数组指针C++简单的技巧可以提高性能
C++ simple trick with an array pointer improves performance
>我在堆排序例程中发现了一种奇怪的行为(见下文)。
void hpsort(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0) { data[0] = rra; break; }
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
如果我像这样调用此例程进行基准测试
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);
它需要 X 秒。 但是如果我只添加一行
void hpsort(unsigned int n, double *data)
{
++data;
并做基准测试
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);
大约需要 0.96 倍秒(即快 4%)。此性能差异在每次运行之间是稳定的。
感觉就像编译器在第一种情况下g++
进行边界检查,而在第二种情况下,我可以以某种方式欺骗它。但我从未听说过对 C 数组进行边界检查......
知道为什么我会在性能上出现这种奇怪的差异吗?
附言编译是用g++ -O2
完成的。 顺便说一下,将unsigned long
更改为long int
也会使性能降低约 3% 到 4%。
p.p.s. "定义的行为"版本也显示了性能改进
void hpsort(unsigned int n, double *data)
{
--data;
和基准作为
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);
性能比较
Size of array Faster Slower
10 1.46 1.60
100 1.41 1.62
1000 1.84 1.96
10000 1.78 1.87
100000 1.72 1.80
1000000 1.76 1.83
10000000 1.98 2.03
这是我的hpsort.cpp
代码
void hpsort1(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
void hpsort2(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
--data;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
这是我的基准测试代码heapsort-benchmark.cpp
#include <vector>
#include <alloca.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
using namespace std;
void hpsort1(unsigned long n, double *data);
void hpsort2(unsigned long n, double *data);
typedef double element_t;
typedef void(*Test)(element_t*, element_t*, int);
const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
const int largest_size = sizes[sizeof(sizes)/sizeof(int)-1];
vector<double> result_times; // results are pushed into this vector
clock_t start_time;
void do_row(int size) // print results for given size of processed array
{
printf("%10d t", size);
for (int i=0; i<result_times.size(); ++i) printf("%.2ft", result_times[i]);
printf("n");
result_times.clear();
}
inline void start_timer() { start_time = clock(); }
inline double timer()
{
clock_t end_time = clock();
return (end_time - start_time)/double(CLOCKS_PER_SEC);
}
void run(Test f, element_t* first, element_t* last, int number_of_times)
{
start_timer();
while (number_of_times-- > 0) f(first,last,number_of_times);
result_times.push_back(timer());
}
void random_shuffle(double *first, double *last)
{
size_t i, j, n;
double tmp;
n = last-first;
srand((unsigned int)0);
for (i=n-1; i>0; --i)
{
j = rand() % (i+1);
tmp = first[i];
first[i] = first[j];
first[j] = tmp;
}
return;
}
void hpsort1_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort1(num_elements, array);
free(array);
}
void hpsort2_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort2(num_elements, array+1);
free(array);
}
void initialize(element_t* first, element_t* last)
{
element_t x = 0.;
while (first != last) { *first++ = x; x += 1.; }
}
double logtwo(double x) { return log(x)/log((double) 2.0); }
int number_of_tests(int size)
{
double n = (double)size;
double largest_n = (double)largest_size;
return int(floor((largest_n * logtwo(largest_n)) / (n * logtwo(n))));
}
void run_tests(int size)
{
const int n = number_of_tests(size);
element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
element_t* buffer_end = &buffer[size];
initialize(buffer, buffer + size); // fill in the elements
for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
//random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array
run(hpsort2_test, buffer, buffer_end, n);
run(hpsort1_test, buffer, buffer_end, n);
do_row(size);
free(buffer);
}
int main()
{
const int n = sizeof(sizes)/sizeof(int);
for (int i = 0; i < n; ++i) run_tests(sizes[i]);
}
我编译并运行它
g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o
./heapsort-benchmark
第一列将是更快的版本
无法
像OP那样获得一致的结果。
IMO OP的微小差异不是代码差异的一部分,而是测试的一部分。
void hpsort(unsigned long n, double *data) {
unsigned long i, ir, j, l;
double rra;
...
}
void hpsort1(unsigned long n, double *data) {
--data;
unsigned long i, ir, j, l;
double rra;
...
}
测试代码
#include <time.h>
#include <stdlib.h>
void test(const char *s, int code, size_t n) {
srand(0);
double* array = (double*) malloc(sizeof(double) * n * 2);
// make 2 copies of same random data
for (size_t i = 0; i < n; i++) {
array[i] = rand();
array[i+n] = array[i];
}
double dt0;
double dt1;
clock_t c0 = clock();
clock_t c1,c2;
if (code) {
hpsort1(n, array + 1);
c1 = clock();
hpsort(n, &array[n]);
c2 = clock();
dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
} else {
hpsort(n, array);
c1 = clock();
hpsort1(n, &array[n]+1);
c2 = clock();
dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
}
free(array);
const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
printf("%s %f %2s %f Diff:% f%%n", s, dt0, cmp, dt1, 100*(dt1-dt0)/dt0);
}
int main() {
//srand((unsigned) time(0));
size_t n = 3000000;
for (int i=0; i<10; i++) {
test("heap first", 0, n);
test("heap1 first", 1, n);
fflush(stdout);
}
}
输出
heap first 1.263000 > 1.201000 Diff:-4.908947%
heap1 first 1.295000 < 1.326000 Diff: 2.393822%
heap first 1.342000 > 1.295000 Diff:-3.502235%
heap1 first 1.279000 < 1.295000 Diff: 1.250977%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.294000 Diff:-0.077220%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.295000 > 1.279000 Diff:-1.235521%
heap first 1.263000 < 1.295000 Diff: 2.533650%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.263000 Diff:-2.471042%
heap1 first 1.295000 < 1.310000 Diff: 1.158301%
heap first 1.310000 < 1.326000 Diff: 1.221374%
heap1 first 1.326000 < 1.342000 Diff: 1.206637%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.264000 < 1.295000 Diff: 2.452532%
heap first 1.279000 > 1.264000 Diff:-1.172791%
heap1 first 1.279000 > 1.264000 Diff:-1.172791%
相关文章:
- 1d 智能指针不适用于语法 (*)++
- 在C#中处理C++指针而不使用unsafe的最佳方法
- 为什么使用 "this" 指针调用派生成员函数?
- 函数向量_指针有不同的原型,我可以构建一个吗
- 使用指针从C++中的数组中获取最大值
- 助记符和指向成员语法的指针
- 嵌入方指针压缩已禁用
- 数组的指针从不分段故障
- C++ 指针的内存地址和指向数组的内存地址如何相同?
- 何时在引用或唯一指针上使用移动语义
- QMetaObject invokeMethod的基于函数指针的语法
- 如何从 std::atomic 中提取指针 T<T>?
- 如何在 C# 中映射双 C 结构指针?
- C++将浮点指针值舍入为小数位数
- 为什么++(*p)更改指针值
- 调整大小后指向元素值的指针unordered_map有效?
- 正在将指针转换为范围
- 使用指向成员的指针将成员函数作为参数传递
- 将OpenCV C++重写为EmguCV C#-如何使用指针
- C++ 中的高性能代码(继承、指向函数的指针、if)