快速叉积:函数调用开销?

Fast Cross Product. Function call overhead?

本文关键字:函数调用 开销      更新时间:2023-10-16

我是C++编程的新手。我正试图看到将我的所有MatLab软件迁移到C++的好处。我正在做一些有限元的事情,主要是非线性的,所以我需要大量执行的操作之一是两个向量的叉积。我已经在Matlab和C++中测试了两种实现,C++似乎更快。在C++中,两种不同的实现给出了不同的时序。我使用的是"英特尔MKL"。

这是代码:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <mkl.h>

// Cross product of two 3-vectors: stores vg1 x vg2 in vgr (defined after main()).
void vprod( double vgr[3], double vg1[3], double vg2[3]);

// Benchmark driver. Times LC cross products of two fixed 3-vectors twice:
// once with the arithmetic written directly in the loop body and once through
// a call to vprod(). After each run it prints the dsecnd() interval and vr[0].
//
// NOTE: v1 and v2 never change and vr is simply overwritten on every pass, so
// an optimizing compiler is free to collapse either loop; the timings are only
// meaningful when that does not happen.
int main() {
    double v1[3]={1.22, 2.65, 3.65}, v2[3]={6.98, 98.159, 54.65}, vr[3];
    const int LC=1000000;            // iterations per timed loop
    double tiempo=0.0, tinicial;
    //------------------------------------------------------------------------
    std::cout << "INLINE METHOD: " << std::endl;
    tinicial = dsecnd();
    for (int i=0; i<LC; i++){
        vr[0] = v1[1]*v2[2]-v1[2]*v2[1];
        vr[1] =-(v1[0]*v2[2]-v1[2]*v2[0]);
        vr[2] = v1[0]*v2[1]-v1[1]*v2[0];
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << std::endl;
    std::cout << "Resultado: " << vr[0] << std::endl;
    //------------------------------------------------------------------------
    //------------------------------------------------------------------------
    std::cout << "FUNCTION METHOD: " << std::endl;
    tinicial = dsecnd();
    for (int i=0; i<LC; i++){
        vprod (vr,v1,v2);
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << std::endl;
    std::cout << "Resultado: " << vr[0] << std::endl;
    //------------------------------------------------------------------------
    std::cin.ignore();               // wait for Enter so the console stays open
    return 0;
}

// Cross product of two 3-vectors: vgr = vg1 x vg2.
//   vgr  [out] receives the three components of the result
//   vg1  [in]  left operand
//   vg2  [in]  right operand
// All three components are computed before any of them is stored, so vgr may
// be the same array as vg1 or vg2 (in-place use) without corrupting the result.
inline void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double c0 =   vg1[1]*vg2[2]-vg1[2]*vg2[1];
    const double c1 = -(vg1[0]*vg2[2]-vg1[2]*vg2[0]);
    const double c2 =   vg1[0]*vg2[1]-vg1[1]*vg2[0];
    vgr[0] = c0;
    vgr[1] = c1;
    vgr[2] = c2;
}

我的问题是:为什么第一个实现比第二个快3倍?这是函数调用开销的结果吗?谢谢

EDIT:我修改了代码,以避免编译器用常量向量"猜测"循环的结果。正如@phonetagger所展示的,结果大不相同。在不使用vprod函数的情况下,我有28500微秒的时间,而使用vprod函数的时间为29000微秒。这个数字是使用Ox优化获得的。如果内联关键字处于打开状态,则更改优化不会影响比较,尽管数字会增加一点。此外,如果不使用inline关键字(并且优化关闭),则在不使用vprod函数的情况下计时为32000,而使用该函数的计时为37000。因此,函数调用开销可能在5000微秒左右。

新代码是:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <mkl.h>
//#include <mkl_lapack.h>
// Cross product of the 3-vectors starting at offset ploc in vg1 and vg2; the
// result is written to vgr[ploc..ploc+2] (defined after main()).
void vprod( double *vgr, int ploc, double *vg1, double *vg2);

// Fills both input arrays (dim entries each) with the deterministic ramp the
// benchmark uses, so that every timed run reads the same, initialised data.
static void fill_inputs( double *v1, double *v2, int dim) {
    for (int i = 0; i < dim; i++) {
        v1[i] =1.25 +  static_cast<double>(i+1);
        v2[i] =2.62+ 2*static_cast<double>(i+7);
    }
}

// Benchmark driver. Computes nv cross products over packed arrays of 3-vectors
// twice: once with the arithmetic written directly in the loop and once through
// vprod(). After each run it prints the dsecnd() interval and vr[0].
// The buffers are released and re-created between the two runs; the inputs are
// refilled afterwards so the second run does not read uninitialised memory.
int main() {
    const int nv=1000000;            // number of 3-vectors
    const int dim=3*nv;              // doubles per array
    double tiempo=0.0, tinicial;
    double *v1 = new double [dim];   //Allocate block of memory
    double *v2 = new double [dim];
    double *vr = new double [dim];
    // Fill vectors with something
    fill_inputs(v1, v2, dim);

    //------------------------------------------------------------------------
    std::cout << "RUTINA CON CODIGO INLINE: \n" ;
    tinicial = dsecnd();
    int ploc = 0; // ploc points to an intermediate location.
    for (int i=0; i<nv; i++){
        vr[ploc] = v1[ploc+1]*v2[ploc+2]-v1[ploc+2]*v2[ploc+1];
        vr[ploc+1] =-(v1[ploc]*v2[ploc+2]-v1[ploc+2]*v2[ploc]);
        vr[ploc+2] = v1[ploc]*v2[ploc+1]-v1[ploc+1]*v2[ploc];
        ploc +=3;
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << ".\n";
    std::cout << "Resultado: " << vr[0] << ".\n";
    // Each array needs its own delete[]: "delete v1,v2,vr;" is a comma
    // expression that frees only v1, and with the non-array form at that.
    delete[] v1;
    delete[] v2;
    delete[] vr;
    v1 = new double [dim];  //Allocate block of memory
    v2 = new double [dim];
    vr = new double [dim];
    fill_inputs(v1, v2, dim);
    //------------------------------------------------------------------------
    //------------------------------------------------------------------------
    std::cout << "RUTINA LLAMANDO A FUNCION: \n" ;
    ploc=0;
    tinicial = dsecnd();
    for (int i=0; i<nv; i++){
        vprod ( vr, ploc, v1, v2);
        ploc +=3;
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << ".\n";
    std::cout << "Resultado: " << vr[0] << ".\n";
    //------------------------------------------------------------------------
    delete[] v1;
    delete[] v2;
    delete[] vr;
    std::cin.ignore();               // wait for Enter so the console stays open
    return 0;
}

// Cross product of one 3-vector pair stored inside packed arrays:
//   vgr[ploc..ploc+2] = vg1[ploc..ploc+2] x vg2[ploc..ploc+2]
//   vgr   [out] destination array
//   ploc  [in]  index of the first component of the triple in all three arrays
//   vg1   [in]  array holding the left operand
//   vg2   [in]  array holding the right operand
// All three components are computed before any of them is stored, so vgr may
// be the same array as vg1 or vg2 (in-place use) without corrupting the result.
inline void vprod( double *vgr, int ploc, double *vg1, double *vg2) {
    const double c0 =   vg1[ploc+1]*vg2[ploc+2]-vg1[ploc+2]*vg2[ploc+1];
    const double c1 = -(vg1[ploc]*vg2[ploc+2]-vg1[ploc+2]*vg2[ploc]);
    const double c2 =   vg1[ploc]*vg2[ploc+1]-vg1[ploc+1]*vg2[ploc];
    vgr[ploc]    = c0;
    vgr[ploc+1]  = c1;
    vgr[ploc+2]  = c2;
}

我不知道你在使用什么编译器("MKL"是一个编译器套件吗?),也不知道你是否试图"耍花招"让你的代码运行得更快。通常(尽管并非总是如此)最好让编译器为您表演技巧,您只需专注于编写高效的算法,而不是表演编码技巧。

无论如何,我以各种方式在我的系统上运行了您的代码,结果显示在下面的代码注释中。。。

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
//#include <mkl.h>
// My standin for dsecnd() since I don't have "mkl.h"...
#include <sys/time.h>
// Stand-in for MKL's dsecnd(), built on gettimeofday().
// Returns the current gettimeofday() reading expressed in MICROSECONDS (not
// seconds), so differences between two calls are elapsed microseconds.
// If gettimeofday() fails, prints a message to stderr and exits with status 1.
double dsecnd()
{
    struct timeval tv;
    if (gettimeofday(&tv,NULL))
    {
        fprintf(stderr,"\ngettimeofday() error\n\n");
        exit(1);
    }
    // Convert to double BEFORE scaling: tv.tv_sec*1000000 evaluated in integer
    // arithmetic overflows wherever the seconds field is only 32 bits wide.
    return static_cast<double>(tv.tv_sec)*1000000.0 + static_cast<double>(tv.tv_usec); // ...returns MICROSECONDS
    //return tv.tv_sec + ((double)tv.tv_usec)/1000000; // ...returns SECONDS
}
//---------------------------------
// Uncomment one or both of these to test variations....
//#define USE_INLINE_KEYWORD
//#define DEFINE_vprod_AT_TOP
//
// Using g++ (GCC) 4.1.2 20080704 (Red Hat 4.1.2-52) on an x86 machine...
//
//                                 microseconds          microseconds
//                               "hardcoded inline"   "via vprod() function"
//                                                     [i]=inlined, [-]=not
//                               ------------------   ----------------------
// inline keyword, at top
//      no optimization                 9501               17797 [-]
//      optimization -O1                   2   (see NOTE)      1 [i]
//      optimization -O2                   1                   1 [i]
//      optimization -O3                   0                   0 [i]
//
// no inline keyword, at top
//      no optimization                 9630               18203 [-]
//      optimization -O1                1257               10681 [-]
//      optimization -O2                1272               10694 [-]
//      optimization -O3                   0                   1 [i]
//
// inline keyword, at bottom
//      no optimization                 9763               18333 [-]
//      optimization -O1                   1                   0 [i]
//      optimization -O2                   2                   1 [i]
//      optimization -O3                   0                   0 [i]
//
// no inline keyword, at bottom
//      no optimization                 9900               18387 [-]
//      optimization -O1                1289               10714 [-]
//      optimization -O2                 795                6740 [-]
//      optimization -O3                   1                   0 [i]
//
// Note that in all cases, both results were reported as -213.458.
//
// NOTE: Especially since I'm using gettimeofday() instead of something
//       that returns process (CPU) time, all results may include some
//       time that the CPU spent processing other stuff, but even if
//       that weren't the case (i.e. even if I used a function that
//       returned only CPU time spent on this particular process), there
//       would still be the quantization error of +/-1 microsecond on
//       each end of the interval, meaning +/-2 microseconds overall.
//
/* My cut & paste "build & test script" to run on the Linux command prompt...
echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""
rm -f a.out; g++ so.cpp
echo ""; echo "No optimization:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O1 so.cpp
echo ""; echo "Optimization -O1:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O2 so.cpp
echo ""; echo "Optimization -O2:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O3 so.cpp
echo ""; echo "Optimization -O3:---------------"; objdump -d a.out | grep call | grep vprod; a.out
...if the "objdump -d a.out | grep call | grep vprod" command returns something
like "call   8048754 <_Z5vprodPdS_S_>", then I know that the call to vprod() is
NOT inlined, whereas if it returns nothing, I know the call WAS inlined.  There
is only one caller of vprod(), so the results can't be confusing.
*/
//
//---------------------------------
#ifdef DEFINE_vprod_AT_TOP
#ifdef USE_INLINE_KEYWORD
inline
#endif
// Cross product of two 3-vectors: vgr = vg1 x vg2.
//   vgr  [out] receives the three components of the result
//   vg1  [in]  left operand
//   vg2  [in]  right operand
// All three components are computed before any of them is stored, so vgr may
// be the same array as vg1 or vg2 (in-place use) without corrupting the result.
void vprod( double vgr[3], double vg1[3], double vg2[3]){
//void vprod( double *vgr, double *vg1, double *vg2){
    const double c0 =   vg1[1]*vg2[2]-vg1[2]*vg2[1];
    const double c1 = -(vg1[0]*vg2[2]-vg1[2]*vg2[0]);
    const double c2 =   vg1[0]*vg2[1]-vg1[1]*vg2[0];
    vgr[0] = c0;
    vgr[1] = c1;
    vgr[2] = c2;
}
#else
// Declare (prototype) the function only if NOT defining it at the top...
void vprod( double vgr[3], double vg1[3], double vg2[3]);
#endif

// Benchmark driver. Times LC cross products of two fixed 3-vectors twice:
// once with the arithmetic written directly in the loop body and once through
// a call to vprod(). After each run it prints the dsecnd() interval and vr[0].
//
// NOTE: v1 and v2 never change and vr is simply overwritten on every pass, so
// at higher optimization levels the compiler can collapse either loop (see the
// near-zero entries in the timing table above).
int main() {
    double v1[3]={1.22, 2.65, 3.65}, v2[3]={6.98, 98.159, 54.65}, vr[3];
    const int LC=1000000;            // iterations per timed loop
    double tiempo=0.0, tinicial;
    //------------------------------------------------------------------------
    std::cout << "INLINE METHOD: " << std::endl;
    tinicial = dsecnd();
    for (int i=0; i<LC; i++){
        vr[0] = v1[1]*v2[2]-v1[2]*v2[1];
        vr[1] =-(v1[0]*v2[2]-v1[2]*v2[0]);
        vr[2] = v1[0]*v2[1]-v1[1]*v2[0];
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total:             " << tiempo << std::endl;
    std::cout << "Resultado: " << vr[0] << std::endl;
    //------------------------------------------------------------------------
    //------------------------------------------------------------------------
    std::cout << "FUNCTION METHOD: " << std::endl;
    tinicial = dsecnd();
    for (int i=0; i<LC; i++){
        vprod (vr,v1,v2);
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total:             " << tiempo << std::endl;
    std::cout << "Resultado: " << vr[0] << std::endl;
    //------------------------------------------------------------------------
    //    std::cin.ignore();
    return 0;
}

#ifndef DEFINE_vprod_AT_TOP
#ifdef USE_INLINE_KEYWORD
inline
#endif
// Cross product of two 3-vectors: vgr = vg1 x vg2.
//   vgr  [out] receives the three components of the result
//   vg1  [in]  left operand
//   vg2  [in]  right operand
// All three components are computed before any of them is stored, so vgr may
// be the same array as vg1 or vg2 (in-place use) without corrupting the result.
void vprod( double vgr[3], double vg1[3], double vg2[3]){
//void vprod( double *vgr, double *vg1, double *vg2){
    const double c0 =   vg1[1]*vg2[2]-vg1[2]*vg2[1];
    const double c1 = -(vg1[0]*vg2[2]-vg1[2]*vg2[0]);
    const double c2 =   vg1[0]*vg2[1]-vg1[1]*vg2[0];
    vgr[0] = c0;
    vgr[1] = c1;
    vgr[2] = c2;
}
#endif

现在,随着优化级别的提高,编译器使用的编码技巧不再是线性的;编译器的技巧在不同的优化级别上发挥作用,可能取决于您是否使用"inline"关键字。除了内联函数之外,编译器还可能采用(我的结果表明确实存在)其他类型的优化。有趣的是,正如我所读到的,"inline"关键字实际上只是向编译器建议您希望函数内联,它可能只是调整某个阈值,用来决定是否内联一个在启用优化时本来就可能被内联的函数;而在未启用优化时,即使使用了"inline"关键字,函数也从未被内联。值得注意的是,vprod()是在main()之上还是在main()之下定义的,似乎对函数是否内联没有区别。

Martin,你说得完全正确(参考Martin的评论……2012年10月5日17:57我的回答下的第三条评论)。是的,在更高的优化级别上,编译器似乎让自己意识到它知道数组的传入值,这样它就可以在编译时执行整个计算、循环等等,并完全优化循环。

我将测试代码重新编码为三个独立的文件(一个头文件和两个源文件),并把计算和循环拆分到一个单独的函数中,以防止编译器在优化方面过于聪明。现在它无法将循环优化为编译时计算。以下是我的新结果。请注意,我在原来的0到1000000循环周围添加了另一个循环(0到50),然后除以50。我这样做有两个原因:它使我们能够将今天的数字与以前的数字进行比较,还可以平均掉测试过程中由于进程切换而导致的不规则情况。这对你来说可能无关紧要,因为我认为dsecnd()只报告其特定进程的CPU时间?

不管怎样,这是我的新结果。。。。。。。

(是的,"inline keyword, optimization -O1"比-O2或-O3快的奇怪结果是可重复的,"no inline keyword, optimization -O1"的奇怪结果也是可重复的。我没有深入研究汇编代码,看看为什么会这样。)

//========================================================================================
// File: so.h
// Repeats the cross product vgr = vg1 x vg2 LC times with the arithmetic
// written directly in the loop body (defined in so2.cpp).
void loop_inline( const int LC, double vgr[3], double vg1[3], double vg2[3]);
// Repeats the same cross product LC times, but each iteration goes through a
// call to vprod() (defined in so2.cpp).
void loop_func( const int LC, double vgr[3], double vg1[3], double vg2[3]);
//---------------------------------
// Comment or uncomment to test both ways...
#define USE_INLINE_KEYWORD
//
// Using g++ (GCC) 4.1.2 20080704 (Red Hat 4.1.2-52) on an x86 machine...
//
//                                 microseconds          microseconds
//                               "hardcoded inline"   "via vprod() function"
//                                                     [i]=inlined, [-]=not
//                               ------------------   ----------------------
// inline keyword
//      no optimization                11734               14598 [-]
//      optimization -O1                4617                4616 [i]
//      optimization -O2                7754                7838 [i]
//      optimization -O3                7777                7673 [i]
//
// no inline keyword
//      no optimization                11807               14602 [-]
//      optimization -O1                4651                7691 [-]
//      optimization -O2                7755                7383 [-]
//      optimization -O3                7921                7432 [-]
//
// Note that in all cases, both results were reported as -213.458.
//
/* My cut & paste "build & test script" to run on the Linux command prompt...
echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""
rm -f a.out; g++ -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "No optimization:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O1 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O1:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O2 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O2:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O3 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O3:---------------"; objdump -d a.out | grep call | grep vprod; a.out
...if the "objdump -d a.out | grep call | grep vprod" command returns something
like "call   8048754 <_Z5vprodPdS_S_>", then I know that the call to vprod() is
NOT inlined, whereas if it returns nothing, I know the call WAS inlined.
*/
//========================================================================================
// File: so.cpp
// Sorry so messy, I didn't bother to clean up the #includes.......
#include <stdint.h>
#include <inttypes.h>
#include <stddef.h> // for NULL
#include <stdlib.h> // for exit()
#include <stdio.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
//#include <mkl.h>
#include "so.h"
// My standin for dsecnd() since I don't have "mkl.h"...
#include <sys/time.h>
// Stand-in for MKL's dsecnd(), built on gettimeofday().
// Returns the current gettimeofday() reading expressed in MICROSECONDS (not
// seconds), so differences between two calls are elapsed microseconds.
// If gettimeofday() fails, prints a message to stderr and exits with status 1.
double dsecnd()
{
    struct timeval tv;
    if (gettimeofday(&tv,NULL))
    {
        fprintf(stderr,"\ngettimeofday() error\n\n");
        exit(1);
    }
    // Convert to double BEFORE scaling: tv.tv_sec*1000000 evaluated in integer
    // arithmetic overflows wherever the seconds field is only 32 bits wide.
    return static_cast<double>(tv.tv_sec)*1000000.0 + static_cast<double>(tv.tv_usec); // ...returns MICROSECONDS
    //return tv.tv_sec + ((double)tv.tv_usec)/1000000; // ...returns SECONDS
}
//---------------------------------
#ifndef USE_INLINE_KEYWORD
// We're NOT using the 'inline' keyword, so define vprod() in this
// file so it can't possibly be inlined where it's called (in the
// other source file).
// Cross product of two 3-vectors: vgr = vg1 x vg2.
//   vgr  [out] receives the three components of the result
//   vg1  [in]  left operand
//   vg2  [in]  right operand
// All three components are computed before any of them is stored, so vgr may
// be the same array as vg1 or vg2 (in-place use) without corrupting the result.
void vprod( double vgr[3], double vg1[3], double vg2[3]){
//void vprod( double *vgr, double *vg1, double *vg2){
    const double c0 =   vg1[1]*vg2[2]-vg1[2]*vg2[1];
    const double c1 = -(vg1[0]*vg2[2]-vg1[2]*vg2[0]);
    const double c2 =   vg1[0]*vg2[1]-vg1[1]*vg2[0];
    vgr[0] = c0;
    vgr[1] = c1;
    vgr[2] = c2;
}
#endif
// Benchmark driver for the multi-file variant. Each method is run N times over
// LC iterations (via loop_inline() / loop_func() from the other source file);
// the dsecnd() interval divided by N and vr[0] are printed after each method.
int main() {
    double v1[3] = {1.22, 2.65, 3.65};
    double v2[3] = {6.98, 98.159, 54.65};
    double vr[3];
    const int LC = 1000000;   // inner iterations, passed to the loop functions
    const int N  = 100;       // outer repetitions the timing is averaged over
    double tiempo = 0.0;
    double tinicial;

    //------------------------------------------------------------------------
    std::cout << "INLINE METHOD: " << std::endl;
    tinicial = dsecnd();
    for (int rep = 0; rep < N; ++rep) {
        loop_inline(LC, vr, v1, v2);
    }
    tiempo = (dsecnd() - tinicial) / N;
    std::cout << "Tiempo Total:             " << tiempo << std::endl;
    std::cout << "Resultado: " << vr[0] << std::endl;

    //------------------------------------------------------------------------
    std::cout << "FUNCTION METHOD: " << std::endl;
    tinicial = dsecnd();
    for (int rep = 0; rep < N; ++rep) {
        loop_func(LC, vr, v1, v2);
    }
    tiempo = (dsecnd() - tinicial) / N;
    std::cout << "Tiempo Total:             " << tiempo << std::endl;
    std::cout << "Resultado: " << vr[0] << std::endl;
    //------------------------------------------------------------------------
    //    std::cin.ignore();
    return 0;
}
//========================================================================================
// File: so2.cpp
#include "so.h"
#ifdef USE_INLINE_KEYWORD
// Cross product of two 3-vectors: vgr = vg1 x vg2.
//   vgr  [out] receives the three components of the result
//   vg1  [in]  left operand
//   vg2  [in]  right operand
// All three components are computed before any of them is stored, so vgr may
// be the same array as vg1 or vg2 (in-place use) without corrupting the result.
inline void vprod( double vgr[3], double vg1[3], double vg2[3]){
//void vprod( double *vgr, double *vg1, double *vg2){
    const double c0 =   vg1[1]*vg2[2]-vg1[2]*vg2[1];
    const double c1 = -(vg1[0]*vg2[2]-vg1[2]*vg2[0]);
    const double c2 =   vg1[0]*vg2[1]-vg1[1]*vg2[0];
    vgr[0] = c0;
    vgr[1] = c1;
    vgr[2] = c2;
}
#else
// Not using 'inline' keyword, so just declare (prototype) the
// function here and define it in the other source file (so it
// can't possibly be inlined).
void vprod( double vgr[3], double vg1[3], double vg2[3]);
#endif
// "Hardcoded inline" arm of the benchmark: evaluates vgr = vg1 x vg2 LC times
// with the component arithmetic written out in the loop body. Does nothing
// when LC is zero or negative.
void loop_inline( const int LC, double vgr[3], double vg1[3], double vg2[3]){
    for (int remaining = LC; remaining > 0; --remaining) {
        vgr[0] =   vg1[1]*vg2[2] - vg1[2]*vg2[1];
        vgr[1] = -(vg1[0]*vg2[2] - vg1[2]*vg2[0]);
        vgr[2] =   vg1[0]*vg2[1] - vg1[1]*vg2[0];
    }
}
// "Function call" arm of the benchmark: invokes vprod(vgr, vg1, vg2) LC times.
// Does nothing when LC is zero or negative.
void loop_func( const int LC, double vgr[3], double vg1[3], double vg2[3]){
    for (int remaining = LC; remaining > 0; --remaining) {
        vprod(vgr, vg1, vg2);
    }
}