英特尔 TBB 代码在 Windows 和 Linux 上的执行时间不同

Intel TBB Code having different execution time in windows and linux

本文关键字:执行时间 windows linux TBB 代码 英特尔      更新时间:2023-10-16

我正在尝试并行化我的代码,这段代码用于设计 FIR 滤波器。为此,我选择了 parallel_reduce。在 Windows 上执行这段代码需要 15 秒,而在 Linux 上只需要大约 2.5 秒。在 Windows 中,我使用 VS 2010 并启用了英特尔性能库 TBB;在 Linux 中,我在终端里用 g++ 编译并链接 TBB 库。两次运行使用的是同一个处理器,为什么不同的操作系统会产生这样的差异?

我使用的代码是:
#include <math.h>

#include <fstream>
#include <iostream>
#include <vector>

#include "tbb/blocked_range.h"
#include "tbb/compat/thread"
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/task_scheduler_init.h"
using namespace tbb; 
using namespace std; 
// Approximation of pi (six decimal places); used in main() to generate the sine test signal.
#define pi 3.141593
// Number of entries in the coeffs[] table below.
#define FILTER_LEN 265
// Filter coefficient table with FILTER_LEN entries.
// SumFoo::operator() reads it as coeffs[k] and multiplies each entry by an
// input sample to form one output value.
double coeffs[ FILTER_LEN ] =
{
  0.0033473431384214393,0.000032074683390218124,0.0033131082058404943,0.0024777666109278788,
  -0.0008968429179843104,-0.0031973449396977684,-0.003430943381749411,-0.0029796565504781646,
  -0.002770673157048994,-0.0022783059845596586,-0.0008531818129514857,0.001115432556294998,
  0.0026079871108133294,0.003012423848769931,0.002461420635709332,0.0014154004589753215,
  0.00025190669718400967,-0.0007608257014963959,-0.0013703600874774068,-0.0014133823230551277,
  -0.0009759556503342884,-0.00039687498737139273,-0.00007527524701314324,-0.00024181463305012626,
  -0.0008521761947454302,-0.00162618205097997,-0.002170446498273018,-0.002129903305507943,
  -0.001333859049002249,0.00010700092934983156,0.0018039564602637683,0.0032107930896349583,
  0.0038325849735515363,0.003416201274366522,0.002060848732332109,0.00017954815260431595,
  -0.0016358832300944531,-0.0028402136847527387,-0.0031256650498727384,-0.0025374271571154713,
  -0.001438370315670195,-0.00035115295209013755,0.0002606730012030533,0.0001969569787142967,
  -0.00039635535951198597,-0.0010886127490608972,-0.0013530057243606405,-0.0008123200399262436,
  0.0005730271959526784,0.0024419465938120906,0.004133717273258681,0.0049402122577746265,
  0.0043879285604252714,0.002449549610687005,-0.00040283102645093463,-0.003337730734820209,
  -0.0054508346511294775,-0.006093057767824609,-0.005117609782189977,-0.0029293645861970417,
  -0.0003251033117661085,0.0018074390555649442,0.0028351284091668164,0.002623563404428517,
  0.0015692864792199496,0.0004127664681096788,-0.00009249878881824428,0.0004690173244168184,
  0.001964334172374759,0.0037256715492873485,0.004809640399145206,0.004395274594482053,
  0.0021650921193604,-0.0014888595443799124,-0.005534807968511709,-0.008642334104607624,
  -0.009668950651149259,-0.008104732391434574,-0.004299972815463919,0.0006184612821881392,
  0.005136551428636121,0.007907786753766152,0.008241212326068366,0.00634786595941524,
  0.003235610213062744,0.00028882736660937287,-0.001320994685952108,-0.0011237433853145615,
  0.00044213409507615003,0.0022057106517524255,0.00277593527678719,0.0011909915058737617,
  -0.0025807757230413447,-0.007497632882437637,-0.011739520895818884,-0.013377018279057393,
  -0.011166543231844196,-0.005133056165990026,0.0032948631959114935,0.011673660427968408,
  0.017376415708412904,0.018548938130314566,0.014811760899506572,0.007450782505155853,
  -0.001019540069785369,-0.007805775815783898,-0.010898333714715424,-0.00985364043415772,
  -0.005988406030111452,-0.001818560524968024,0.000028552677472614846,-0.0019938756495376363,
  -0.007477684025727061,-0.013989430449615033,-0.017870518868849213,-0.015639422062597726,
  -0.005624959109456065,0.010993528170353541,0.03001263681283932,0.04527492462846608,
  0.050581340787164114,0.041949186532860346,0.019360612460662185,-0.012644336735920483,
  -0.0458782599058412,-0.07073838953156347,-0.0791205623455818,-0.06709535677423759,
  -0.03644544574795176,0.005505370370858695,0.04780486657828151,0.07898800597378192,
  0.0904453420042807,0.07898800597378192,0.04780486657828151,0.005505370370858695,
  -0.03644544574795176,-0.06709535677423759,-0.0791205623455818,-0.07073838953156347,
  -0.0458782599058412,-0.012644336735920483,0.019360612460662185,0.041949186532860346,
  0.050581340787164114,0.04527492462846608,0.03001263681283932,0.010993528170353541,
  -0.005624959109456065,-0.015639422062597726,-0.017870518868849213,-0.013989430449615033,
  -0.007477684025727061,-0.0019938756495376363,0.000028552677472614846,-0.001818560524968024,
  -0.005988406030111452,-0.00985364043415772,-0.010898333714715424,-0.007805775815783898,
  -0.001019540069785369,0.007450782505155853,0.014811760899506572,0.018548938130314566,
  0.017376415708412904,0.011673660427968408,0.0032948631959114935,-0.005133056165990026,
  -0.011166543231844196,-0.013377018279057393,-0.011739520895818884,-0.007497632882437637,
  -0.0025807757230413447,0.0011909915058737617,0.00277593527678719,0.0022057106517524255,
  0.00044213409507615003,-0.0011237433853145615,-0.001320994685952108,0.00028882736660937287,
  0.003235610213062744,0.00634786595941524,0.008241212326068366,0.007907786753766152,
  0.005136551428636121,0.0006184612821881392,-0.004299972815463919,-0.008104732391434574,
  -0.009668950651149259,-0.008642334104607624,-0.005534807968511709,-0.0014888595443799124,
  0.0021650921193604,0.004395274594482053,0.004809640399145206,0.0037256715492873485,
  0.001964334172374759,0.0004690173244168184,-0.00009249878881824428,0.0004127664681096788,
  0.0015692864792199496,0.002623563404428517,0.0028351284091668164,0.0018074390555649442,
  -0.0003251033117661085,-0.0029293645861970417,-0.005117609782189977,-0.006093057767824609,
  -0.0054508346511294775,-0.003337730734820209,-0.00040283102645093463,0.002449549610687005,
  0.0043879285604252714,0.0049402122577746265,0.004133717273258681,0.0024419465938120906,
  0.0005730271959526784,-0.0008123200399262436,-0.0013530057243606405,-0.0010886127490608972,
  -0.00039635535951198597,0.0001969569787142967,0.0002606730012030533,-0.00035115295209013755,
  -0.001438370315670195,-0.0025374271571154713,-0.0031256650498727384,-0.0028402136847527387,
  -0.0016358832300944531,0.00017954815260431595,0.002060848732332109,0.003416201274366522,
  0.0038325849735515363,0.0032107930896349583,0.0018039564602637683,0.00010700092934983156,
  -0.001333859049002249,-0.002129903305507943,-0.002170446498273018,-0.00162618205097997,
  -0.0008521761947454302,-0.00024181463305012626,-0.00007527524701314324,-0.00039687498737139273,
  -0.0009759556503342884,-0.0014133823230551277,-0.0013703600874774068,-0.0007608257014963959,
  0.00025190669718400967,0.0014154004589753215,0.002461420635709332,0.003012423848769931,
  0.0026079871108133294,0.001115432556294998,-0.0008531818129514857,-0.0022783059845596586,
  -0.002770673157048994,-0.0029796565504781646,-0.003430943381749411,-0.0031973449396977684,
  -0.0008968429179843104,0.0024777666109278788,0.0033131082058404943,0.000032074683390218124,
  0.0033473431384214393
};

/**
 * Reduction body for tbb::parallel_reduce that computes one filter output
 * sample as a dot product of coeffs[] with a window of the input signal.
 *
 * For every k in the sub-range handed to operator(), the body adds
 *     coeffs[k] * input[(FILTER_LEN - 1 + count) - k]
 * to `sum`. The static member `count` selects which output sample is being
 * computed and must be set by the caller before the reduction starts.
 *
 * `sum` is never cleared by operator(): a body may be invoked on several
 * sub-ranges and must keep accumulating. Callers that want one result per
 * reduction therefore need a fresh (or manually reset) body for each one.
 */
class SumFoo 
{ 
    double* my_a;          // input samples; not owned by this object
    public: 
    double sum;            // partial result accumulated by this body
    static int count;      // offset of the current output sample, shared by all bodies
    int ip,nip;            // scratch: index of newest sample in the window / sample paired with coeffs[k]

    /**
     * Accumulates the products for the coefficient indices in r into sum.
     * @param r half-open range of coefficient indices [r.begin(), r.end()).
     */
    void operator( )( const blocked_range<size_t>& r ) 
    { 
        const double* samples = my_a; 
        // Index of the newest input sample contributing to this output value.
        ip = FILTER_LEN - 1 + SumFoo::count;
        for( size_t k = r.begin(); k != r.end(); ++k ) 
        {           
            // coeffs[k] is paired with the sample k positions before ip.
            nip = ip - static_cast<int>(k);
            sum += coeffs[k] * samples[nip];
        }
    }  

    /**
     * Splitting constructor required by tbb::parallel_reduce: shares the input
     * pointer of x and starts from an empty partial sum.
     */
    SumFoo( SumFoo& x, split ) : my_a(x.my_a), sum(0), ip(0), nip(0) 
    { 
    } 

    /** Folds the partial sum computed by y into this body. */
    void join( const SumFoo& y ) 
    { 
        sum += y.sum; 
    } 

    /**
     * @param a input samples; operator() reads indices count .. count + FILTER_LEN - 1.
     */
    SumFoo(double a[] ) : my_a(a), sum(0), ip(0), nip(0) 
    { 
    } 
}; 
void ParallelSumFoo(double *a, size_t n ,ofstream &o) 
{ 
        SumFoo sf(a); 
        for(int j=264;j<150264;j++)
        {
                SumFoo::count=j-264;
                parallel_reduce(blocked_range<size_t>(0,265), sf,auto_partitioner() ); 
              o<<j<<","<<sf.sum<<endl;
        }
} 
// Out-of-class definition of SumFoo's static sample offset; ParallelSumFoo assigns it before each reduction.
int SumFoo::count=0;
int main() 
{ 
     ofstream o("400hzreduce.csv");
    double *buffer=new double[150264];  
    fill_n(buffer,150264,0);
    tick_count t0=tick_count::now(); 
    for(int i=264;i<150264;i++) 
    { 
        buffer[i] = sin(400 * (2 * pi) * (i / 5000.0));
        o<<i<<","<<buffer[i]<<endl;
    } 

    cout<<fixed; 

    ParallelSumFoo(buffer,150264,o);
    tick_count t1=tick_count::now(); 
    double t9=(t1-t0).seconds(); 
    cout<<"Time Taken for parallel execution is t"<<t9<<"seconds"<<endl; 
}

请帮助找到我错在哪里?

你在两个操作系统上使用的编译器优化选项相同吗?在 gcc 中,使用 -O3 与不开启任何优化就可能产生这样的差异。对于 Visual Studio,我不太确定具体的选项,但相信你可以在图形界面中搜索并找到它们。

如果没有parallel_reduce,你在两个系统上的运行时间是多少?这将降低一级复杂性。

你试过对代码做性能分析吗?在 Linux 中,我推荐使用 valgrind --tool=callgrind 进行分析,并用 kcachegrind 查看结果。这应该有助于让大家的回答更有针对性。

在这段代码中,数据被写入文件,这在执行时间上产生了巨大的差异。

在 Linux 和 Windows 中,向文件写入数据所花费的时间不同,这才是执行时间不同的原因;TBB 本身并不会造成这种差异。