数组指针C++简单的技巧可以提高性能

C++ simple trick with an array pointer improves performance

本文关键字：高性能指针 C++ 简单数组更新时间：2023-10-16

>我在堆排序例程中发现了一种奇怪的行为（见下文）。

void hpsort(unsigned long n, double *data)
{
  unsigned long i, ir, j, l;
  double rra;
  if (n < 2) return;
  l = (n - 2) / 2 + 1;
  ir = n - 1;
  for (;;)
    {
      if (l > 0) rra = data[--l];
      else
        {
          rra = data[ir];
          data[ir] = data[0];
          if (--ir == 0) { data[0] = rra; break; }
        }
      i = l;
      j = l + l + 1;
      while (j <= ir)
        {
          if (j < ir && data[j] < data[j+1]) ++j;
          if (rra < data[j])
            {
              data[i] = data[j];
              i = j;
              j += j + 1;
            }
          else break;
        }
      data[i] = rra;
    }
  return;
}

如果我像这样调用此例程进行基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);

它需要 X 秒。但是如果我只添加一行

void hpsort(unsigned int n, double *data)
{
   ++data;

并做基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);

大约需要 0.96 倍秒（即快 4%）。此性能差异在每次运行之间是稳定的。

感觉就像编译器在第一种情况下g++进行边界检查，而在第二种情况下，我可以以某种方式欺骗它。但我从未听说过对 C 数组进行边界检查......

知道为什么我会在性能上出现这种奇怪的差异吗？

附言编译是用g++ -O2完成的。顺便说一下，将unsigned long更改为long int也会使性能降低约 3% 到 4%。

p.p.s. "定义的行为"版本也显示了性能改进

void hpsort(unsigned int n, double *data)
{
   --data;

和基准作为

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);

性能比较

Size of array   Faster  Slower
        10      1.46    1.60
       100      1.41    1.62
      1000      1.84    1.96
     10000      1.78    1.87
    100000      1.72    1.80
   1000000      1.76    1.83
  10000000      1.98    2.03

这是我的hpsort.cpp代码

 void hpsort1(unsigned long n, double *data)
 {
   unsigned long i, ir, j, l;
   double rra;
   if (n < 2) return;
   l = (n - 2) / 2 + 1;
   ir = n - 1;
   for (;;)
     {
       if (l > 0) rra = data[--l];
       else
         {
           rra = data[ir];
           data[ir] = data[0];
           if (--ir == 0)
             {
               data[0] = rra;
               break;
             }
         }
       i = l;
       j = l + l + 1;
       while (j <= ir)
         {
           if (j < ir && data[j] < data[j+1]) ++j;
           if (rra < data[j])
             {
               data[i] = data[j];
               i = j;
               j += j + 1;
             }
           else break;
         }
       data[i] = rra;
     }
   return;
 }
 void hpsort2(unsigned long n, double *data)
 {
   unsigned long i, ir, j, l;
   double rra;
   --data;
   if (n < 2) return;
   l = (n - 2) / 2 + 1;
   ir = n - 1;
   for (;;)
     {
       if (l > 0) rra = data[--l];
       else
         {
           rra = data[ir];
           data[ir] = data[0];
           if (--ir == 0)
             {
               data[0] = rra;
               break;
             }
         }
       i = l;
       j = l + l + 1;
       while (j <= ir)
         {
           if (j < ir && data[j] < data[j+1]) ++j;
           if (rra < data[j])
             {
               data[i] = data[j];
               i = j;
               j += j + 1;
             }
           else break;
         }
       data[i] = rra;
     }
   return;
 }

这是我的基准测试代码heapsort-benchmark.cpp

 #include <vector>
 #include <alloca.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
 #include <math.h>
 using namespace std;
 void hpsort1(unsigned long n, double *data);
 void hpsort2(unsigned long n, double *data);
 typedef double element_t;
 typedef void(*Test)(element_t*, element_t*, int);
 const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
 const int largest_size = sizes[sizeof(sizes)/sizeof(int)-1];
 vector<double> result_times; // results are pushed into this vector
 clock_t start_time;
 void do_row(int size) // print results for given size of processed array
 {
   printf("%10d  t", size);
   for (int i=0; i<result_times.size(); ++i) printf("%.2ft", result_times[i]);
   printf("n");
   result_times.clear();
 }
 inline void start_timer() { start_time = clock(); }
 inline double timer()
 {
   clock_t end_time = clock();
   return (end_time - start_time)/double(CLOCKS_PER_SEC);
 }
 void run(Test f, element_t* first, element_t* last, int number_of_times)
 {
   start_timer();
   while (number_of_times-- > 0) f(first,last,number_of_times);
   result_times.push_back(timer());
 }
 void random_shuffle(double *first, double *last)
 {
   size_t i, j, n;
   double tmp;
   n = last-first;
   srand((unsigned int)0);
   for (i=n-1; i>0; --i)
     {
       j = rand() % (i+1);
       tmp = first[i];
       first[i] = first[j];
       first[j] = tmp;
     }
   return;
 }
 void hpsort1_test(element_t* first, element_t* last, int number_of_times)
 {
   size_t num_elements = (last-first);
   element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
   memcpy(array, first, sizeof(element_t)*num_elements);
   hpsort1(num_elements, array);
   free(array);
 }
 void hpsort2_test(element_t* first, element_t* last, int number_of_times)
 {
   size_t num_elements = (last-first);
   element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
   memcpy(array, first, sizeof(element_t)*num_elements);
   hpsort2(num_elements, array+1);
   free(array);
 }
 void initialize(element_t* first, element_t* last)
 {
   element_t x = 0.;
   while (first != last) { *first++ = x; x += 1.; }
 }
 double logtwo(double x) { return log(x)/log((double) 2.0); }
 int number_of_tests(int size)
 {
   double n = (double)size;
   double largest_n = (double)largest_size;
   return int(floor((largest_n * logtwo(largest_n)) / (n * logtwo(n))));
 }
 void run_tests(int size)
 {
   const int n = number_of_tests(size);
   element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
   element_t* buffer_end = &buffer[size];
   initialize(buffer, buffer + size); // fill in the elements
   for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
   //random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array
   run(hpsort2_test, buffer, buffer_end, n);
   run(hpsort1_test, buffer, buffer_end, n);
   do_row(size);
   free(buffer);
 }

 int main()
 {
   const int n = sizeof(sizes)/sizeof(int);
   for (int i = 0; i < n; ++i) run_tests(sizes[i]);
 }

我编译并运行它

g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o 
./heapsort-benchmark

第一列将是更快的版本

无法

像OP那样获得一致的结果。

IMO OP的微小差异不是代码差异的一部分，而是测试的一部分。

void hpsort(unsigned long n, double *data) {
  unsigned long i, ir, j, l;
  double rra;
  ...
}
void hpsort1(unsigned long n, double *data) {
  --data;
  unsigned long i, ir, j, l;
  double rra;
  ...
}

测试代码

#include <time.h>
#include <stdlib.h>
void test(const char *s, int code, size_t n) {
  srand(0);
  double* array = (double*) malloc(sizeof(double) * n * 2);
  // make 2 copies of same random data
  for (size_t i = 0; i < n; i++) {
    array[i] = rand();
    array[i+n] = array[i];
  }
  double dt0;
  double dt1;
  clock_t c0 = clock();
  clock_t c1,c2;
  if (code) {
    hpsort1(n, array + 1);
    c1 = clock();
    hpsort(n, &array[n]);
    c2 = clock();
    dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
    dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
  } else {
    hpsort(n, array);
    c1 = clock();
    hpsort1(n, &array[n]+1);
    c2 = clock();
    dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
    dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
  }
  free(array);
  const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
  printf("%s %f %2s %f  Diff:% f%%n", s, dt0, cmp,  dt1, 100*(dt1-dt0)/dt0);
}
int main() {
  //srand((unsigned) time(0));
  size_t n = 3000000;
  for (int i=0; i<10; i++) {
    test("heap  first", 0, n);
    test("heap1 first", 1, n);
    fflush(stdout);
  }
}

输出

heap  first 1.263000  > 1.201000  Diff:-4.908947%
heap1 first 1.295000  < 1.326000  Diff: 2.393822%
heap  first 1.342000  > 1.295000  Diff:-3.502235%
heap1 first 1.279000  < 1.295000  Diff: 1.250977%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.295000  > 1.294000  Diff:-0.077220%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.295000  > 1.279000  Diff:-1.235521%
heap  first 1.263000  < 1.295000  Diff: 2.533650%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.295000  > 1.263000  Diff:-2.471042%
heap1 first 1.295000  < 1.310000  Diff: 1.158301%
heap  first 1.310000  < 1.326000  Diff: 1.221374%
heap1 first 1.326000  < 1.342000  Diff: 1.206637%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.264000  < 1.295000  Diff: 2.452532%
heap  first 1.279000  > 1.264000  Diff:-1.172791%
heap1 first 1.279000  > 1.264000  Diff:-1.172791%