嵌套for循环的奇怪性能问题

Odd performance issue with nested for loops

本文关键字：性能问题、for 循环、嵌套　　更新时间：2023-10-16

下面是完整的源代码，您只需将其复制粘贴到 Visual Studio 中即可轻松复现该问题。

#include <Windows.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <vector>
// Performance-counter frequency. Filled in by ::QueryPerformanceFrequency in
// _tmain and used as the divisor when converting counter ticks to microseconds.
LARGE_INTEGER gFreq;
struct CProfileData;
// Registry of profiling records: every CProfileData appends itself here from
// its constructor.
// Yes, we store the pointer itself, not the name string, for performance reasons.
std::vector<CProfileData*> gProfileData;
// Simulates a draw-buffer access so that CBlock::Draw is not optimized away.
float gDrawBuffer = 0;
// Stopwatch built on the Win32 high-resolution performance counter.
struct CTimer
{
    // Reference counter value captured by the last Reset().
    LARGE_INTEGER m_timer;

    // Starts timing immediately.
    CTimer()
    {
        Reset();
    }

    // Re-arms the stopwatch at the current counter value.
    inline void Reset()
    {
        ::QueryPerformanceCounter(&m_timer);
    }

    // Returns the microseconds elapsed since construction or the most recent
    // Reset(). The tick count is divided by gFreq, so gFreq must already hold
    // the counter frequency.
    size_t GetElapsedMicro()
    {
        LARGE_INTEGER current;
        ::QueryPerformanceCounter(&current);
        const LONGLONG ticks = current.QuadPart - m_timer.QuadPart;
        return (1000000 * ticks) / gFreq.QuadPart;
    }
};
// Accumulated timing statistics for one profiled location. Each instance
// appends its own address to gProfileData when it is constructed.
struct CProfileData
{
    CProfileData()
    {
        gProfileData.push_back(this);
    }

    // Sum of all recorded samples.
    size_t m_totalTime = 0;
    // Starts at the largest size_t value so that any smaller sample replaces it.
    size_t m_minTime = static_cast<size_t>(-1);
    // Largest recorded sample.
    size_t m_maxTime = 0;
    // Number of recorded samples.
    size_t m_hitCount = 0;
    // Location label; stays null until a profiler assigns it.
    const char * m_name = nullptr;
};
class CSimpleProfiler
{
public:
    CSimpleProfiler(const char * aLocationName, CProfileData * aData)
        : m_location(aLocationName), m_data(aData)
    {
        ::QueryPerformanceCounter(&m_clock);
    }
    ~CSimpleProfiler()
    {
        CProfileData & data = *m_data;
        data.m_name = m_location;
        ++data.m_hitCount;

        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);
        size_t elapsed = (1000000 * (now.QuadPart - m_clock.QuadPart)) / gFreq.QuadPart;
        data.m_totalTime += elapsed;
        elapsed < data.m_minTime ? data.m_minTime = elapsed : true;
        elapsed > data.m_maxTime ? data.m_maxTime = elapsed : true;
    }
    static void PrintAll()
    {
        std::stringstream str;
        str.width(20);
        str << "Location";
        str.width(15);
        str << "Total time";
        str.width(15);
        str << "Average time";
        str.width(15);
        str << "Hit count";
        str.width(15);
        str << "Min";
        str.width(15);
        str << "Max" << std::endl;
        ::OutputDebugStringA(str.str().c_str());
        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            CProfileData & data = **i;
            std::stringstream str;
            str.width(20);
            str << data.m_name;
            str.width(15);
            str << data.m_totalTime;
            str.width(15);
            str << data.m_totalTime / (float)data.m_hitCount;
            str.width(15);
            str << data.m_hitCount;
            str.width(15);
            str << data.m_minTime;
            str.width(15);
            str << data.m_maxTime << std::endl;
            ::OutputDebugStringA(str.str().c_str());
        }
    }
    static void Clear()
    {
        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            (*i)->m_totalTime = 0;
            (*i)->m_minTime = 0;
            (*i)->m_maxTime = 0;
            (*i)->m_hitCount = 0;
        }
    }
private:
    LARGE_INTEGER m_clock;
    const char * m_location;
    CProfileData * m_data;
};

// Profiling macros.
//   SIMPLE_PROFILE                  - profiles the enclosing scope under the
//                                     name of the current function.
//   SIMPLE_PROFILE_WITH_NAME(Name)  - same, with an explicit label.
// Each expands to a function-local static CProfileData plus a CSimpleProfiler
// bound to it, so the sample is recorded when the enclosing scope ends.
#define PROFILING_ENABLED
#ifdef PROFILING_ENABLED
// Two-level paste: the extra indirection makes the preprocessor expand
// __LINE__ to its number before ## is applied, giving each use its own names.
#define SIMPLE_PROFILE_PASTE2(a, b) a ## b
#define SIMPLE_PROFILE_PASTE(a, b) SIMPLE_PROFILE_PASTE2(a, b)
#define SIMPLE_PROFILE \
    static CProfileData SIMPLE_PROFILE_PASTE(pdata, __LINE__); \
    CSimpleProfiler SIMPLE_PROFILE_PASTE(p, __LINE__)(__FUNCTION__, & SIMPLE_PROFILE_PASTE(pdata, __LINE__))
#define SIMPLE_PROFILE_WITH_NAME(Name) \
    static CProfileData SIMPLE_PROFILE_PASTE(pdata, __LINE__); \
    CSimpleProfiler SIMPLE_PROFILE_PASTE(p, __LINE__)(Name, & SIMPLE_PROFILE_PASTE(pdata, __LINE__))
#else
#define SIMPLE_PROFILE __noop
#define SIMPLE_PROFILE_WITH_NAME(Name) __noop
#endif

// Repeatedly overwrites a 256 KiB scratch buffer, presumably to push other
// data out of the CPU cache between measurements.
// NOTE(review): 256 KiB is larger than a typical L1 data cache - confirm which
// cache level this is meant to flush.
// aPasses: number of full sweeps over the buffer; defaults to the original
//          0x0fff. Values <= 0 perform no writes.
void InvalidateL1Cache(int aPasses = 0x0fff)
{
    const int size = 256 * 1024;
    // Allocated once on the first call and intentionally never freed.
    static char * const scratch = static_cast<char *>(std::malloc(size));
    if (scratch == nullptr)
        return; // allocation failed: skip instead of writing through null
    for (int pass = 0; pass < aPasses; ++pass)
        for (int j = 0; j < size; ++j)
            // Unsigned multiply keeps the product well defined for any pass
            // count; only the low byte is stored.
            scratch[j] = static_cast<char>(static_cast<unsigned>(pass) * static_cast<unsigned>(j));
}
// Benchmark driver. Builds one layer of 109 rows holding 20-29 dummy blocks
// each, then loops forever "drawing" every row under the profiler. Whenever a
// single row takes more than 1000 microseconds, it writes "SLOWDOWN!" and the
// profile table to the debugger output. Never returns in practice.
int _tmain(int argc, _TCHAR* argv[])
{
    // Must happen before any timing result is computed: CTimer and
    // CSimpleProfiler divide by gFreq.
    ::QueryPerformanceFrequency(&gFreq);

    struct CBlock
    {
        float x;
        float y;
        // Adds aBlend to gDrawBuffer 100 times; the write to the global keeps
        // the loop from being optimized away.
        void Draw(float aBlend)
        {
            for (size_t i = 0; i < 100; ++i)
                gDrawBuffer += aBlend;
        }
    };

    typedef std::vector<std::vector<CBlock>> Layer;
    typedef std::vector<Layer> Layers;
    Layers mBlocks;
    // populate with dummy data
    mBlocks.push_back(Layer());
    Layer & layer = mBlocks.back();
    layer.resize(109);
    srand(0); // fixed seed for reproducibility (determinism)
    for (auto row = layer.begin(); row != layer.end(); ++row)
    {
        row->resize(25 + rand() % 10 - 5);
    }
    // end populating dummy data
    while (1)
    {
        CSimpleProfiler::Clear();
        // The +1 keeps the divisor in 1..100. rand() % 100 on its own can be
        // 0, which would make the blend - and from then on gDrawBuffer -
        // infinite.
        const float aBlend = 1.f / (rand() % 100 + 1);
        for (Layer & currentLayer : mBlocks)
        {
            for (std::vector<CBlock> & row : currentLayer)
            {
                CTimer t;
                {
                    // Scoped so the profiler records its sample before the
                    // elapsed time of the row is checked below.
                    SIMPLE_PROFILE_WITH_NAME("Main_Draw_3");
                    for (CBlock & block : row)
                    {
                        block.Draw(aBlend);
                    }
                }
                if (t.GetElapsedMicro() > 1000)
                {
                    ::OutputDebugStringA("SLOWDOWN!\n");
                    CSimpleProfiler::PrintAll();
                }
            }
        }
    }
    return 0;
}

我不时地得到以下分析，以微秒表示：

SLOWDOWN!
            Location     Total time   Average time      Hit count            Min            Max
         Main_Draw_3           2047        36.5536             56              0           1040

这种情况时有发生。通常，Main_Draw_3块需要100微秒才能完成，但它会不时地达到1000（Max列）。是什么原因造成的？

我知道缓存未命中可能会起到一定作用，但在这种情况下真的是这个原因吗？……这里到底发生了什么？我该如何缓解这种情况？

更多信息：

编译器VS 2013，使用Maximize Speed (/O2)编译

我认为可能有两个问题：

您是否开启了优化进行编译？使用的编译选项（标志）是什么？
也许您可以增加样本量，例如在一次分析运行中把这段代码运行十次（或一百次、一千次等）。原因是：如果样本量很小，标准偏差就会很高。