C++11 多线程比单线程慢

C++11 Multithreading slower than Single Thread

本文关键字：单线程多线程 C++11 更新时间：2023-10-16

我是多任务处理的绝对初学者，阅读了一些基础知识后，尝试将其用于我的对象可视化项目。问题是我实现的多线程解决方案比单线程解决方案慢，我不知道为什么；而且应用程序还会由于未知原因以意外的退出代码结束。下面给出两个我试图提升性能的案例。我想知道我哪里没有理解，以及我的总体思路错在哪里。我会先给出部分源代码，最后汇总所有问题。

这是我的线程工厂实现（非常基础，但这只是开始）：

threadfactory.h

#pragma once
#include <vector>
#include "ThreadInterface.h"
#include "../MemoryManagement/MemoryMgr.h"
#include "../Logging/LoggingDefines.h"
/**
*   Dispatches wave updates and vertex transformations either on the calling
*   thread alone or split between the calling thread and temporary std::thread
*   workers. The total number of threads used (workers plus the caller) is
*   stored in muiNumberofThreads.
*/
class CThreadFactory : public CThreadIntearface
{
    public:
        /** Creates a factory configured for one thread (no workers). */
        CThreadFactory();
        /** Creates a factory and forwards max_threads to Init(). */
        CThreadFactory(BYTE max_threads);
        /** Calls Clear(). */
        ~CThreadFactory();
        /** Stores max_threads as the number of threads used by the update methods. */
        void Init(BYTE max_threads);
        /** Release hook invoked by the destructor. */
        void Clear(void);
        // Wave updates: the interior rows of the wave grid are divided between
        // the worker threads and the calling thread.
        virtual void UpdateWavesInternalPoints(CWaves& waves);
        virtual void UpdateWavesNormals(CWaves& waves);
        // Vertex transformation: output is resized to input.size() and every
        // input position is transformed by matrix into the matching output slot.
        virtual void TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix);
        /** Returns the heap name passed to DEFINE_HEAP for this class. */
        static const char* GetHeapName(void) { return "Thread factory"; }
#if (defined(DEBUG) | defined(_DEBUG))
        /**
        *   Declares the accessor that returns the class name.
        *   Compiled only in debug builds.
        */
        NAME_FUNC();
#endif
    private:
        /** Joins every joinable thread in threads. */
        void Join(vector<std::thread>& threads);
        /** Empties threads; the threads must already have been joined. */
        void ReleaseThreads(vector<std::thread>& threads);
    private:
        // Total number of threads to use, including the calling thread.
        UINT muiNumberofThreads;
    private:
        // Project macro; presumably declares the overloaded new/delete operators - confirm against its definition.
        DECLARE_HEAP;
};

threadfactory.cpp

#include "ThreadFactory.h"
/**
*   Default constructor. Configures the factory to run everything on the
*   calling thread (thread count of 1).
*/
CThreadFactory::CThreadFactory()
{
    // The quotation marks around the object name must be escaped; an unescaped
    // quote ends the literal early and leaves an unterminated string behind.
    TRACE(LOG_DEBUG, string("Start of initialization of object \"") + GetName() + string("\""));
    muiNumberofThreads = 1;
    TRACE(LOG_DEBUG, string("End of initialization of object \"") + GetName() + string("\""));
}
/**
*   Constructs the factory and configures it through Init().
*   @param max_threads total number of threads to use, including the caller
*/
CThreadFactory::CThreadFactory(BYTE max_threads)
{
    // The quotation marks around the object name must be escaped; an unescaped
    // quote ends the literal early and leaves an unterminated string behind.
    TRACE(LOG_DEBUG, string("Start of initialization of object \"") + GetName() + string("\""));
    Init(max_threads);
    TRACE(LOG_DEBUG, string("End of initialization of object \"") + GetName() + string("\""));
}
/**
*   Destructor. Delegates the cleanup to Clear().
*/
CThreadFactory::~CThreadFactory()
{
    // The quotation marks around the object name must be escaped; an unescaped
    // quote ends the literal early and leaves an unterminated string behind.
    TRACE(LOG_DEBUG, string("Start of releasing of object \"") + GetName() + string("\""));
    Clear();
    TRACE(LOG_DEBUG, string("End of releasing of object \"") + GetName() + string("\""));
}
/**
*   Stores the number of threads the update methods may use.
*   @param max_threads total thread count, including the calling thread
*/
void CThreadFactory::Init(BYTE max_threads)
{
    this->muiNumberofThreads = static_cast<UINT>(max_threads);
}
/**
*   Release hook called from the destructor. Nothing is released here.
*/
void CThreadFactory::Clear(void)
{
    return;
}
/**
*   Blocks until every joinable thread in the container has finished.
*   @param threads worker threads to wait for
*/
void CThreadFactory::Join(vector<std::thread>& threads)
{
    for (vector<std::thread>::iterator worker = threads.begin(); worker != threads.end(); ++worker)
    {
        // Default-constructed or already joined threads are skipped.
        if (!worker->joinable())
            continue;
        worker->join();
    }
}
/**
*   Removes all thread objects from the container.
*   Destroying a std::thread that is still joinable terminates the program,
*   so the threads must have been joined before this is called.
*   @param threads container to empty
*/
void CThreadFactory::ReleaseThreads(vector<std::thread>& threads)
{
    threads.erase(threads.begin(), threads.end());
}
/**
*   Updates the interior points of the wave grid, splitting the interior rows
*   [1, RowCount() - 1) between worker threads and the calling thread.
*
*   Each worker receives RowCount() / muiNumberofThreads consecutive rows. The
*   calling thread processes everything that is left, so rows that do not
*   divide evenly between the threads are still updated.
*
*   @param waves wave object whose solution buffers are updated
*/
void CThreadFactory::UpdateWavesInternalPoints(CWaves& waves)
{
    if (muiNumberofThreads <= 1)
    {
        waves.UpdateWaveInteriorPoints(1, waves.RowCount() - 1);
        return;
    }

    const UINT uiWorkerCount = muiNumberofThreads - 1;
    const UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;

    vector<std::thread> threads;
    threads.reserve(uiWorkerCount);

    // Row ranges are half-open: [dwMinRow, dwMinRow + dwWavePartDifference).
    DWORD dwMinRow = 1;
    for (UINT i = 0; i < uiWorkerCount; ++i)
    {
        const DWORD dwMaxRow = dwMinRow + dwWavePartDifference;
        threads.emplace_back(&CWaves::UpdateWaveInteriorPoints, &waves, dwMinRow, dwMaxRow);
        dwMinRow = dwMaxRow;
    }

    // The calling thread takes the final chunk including the remainder rows;
    // previously the remainder was silently skipped.
    waves.UpdateWaveInteriorPoints(dwMinRow, waves.RowCount() - 1);

    Join(threads);
    ReleaseThreads(threads);
}
/**
*   Recomputes normals and tangents of the wave grid, splitting the interior
*   rows [1, RowCount() - 1) between worker threads and the calling thread.
*
*   Each worker receives RowCount() / muiNumberofThreads consecutive rows. The
*   calling thread processes everything that is left, so rows that do not
*   divide evenly between the threads are still updated.
*
*   @param waves wave object whose vertex normals are updated
*/
void CThreadFactory::UpdateWavesNormals(CWaves& waves)
{
    if (muiNumberofThreads <= 1)
    {
        waves.UpdateWaveNormals(1, waves.RowCount() - 1);
        return;
    }

    const UINT uiWorkerCount = muiNumberofThreads - 1;
    const UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;

    vector<std::thread> threads;
    threads.reserve(uiWorkerCount);

    // Row ranges are half-open: [dwMinRow, dwMinRow + dwWavePartDifference).
    DWORD dwMinRow = 1;
    for (UINT i = 0; i < uiWorkerCount; ++i)
    {
        const DWORD dwMaxRow = dwMinRow + dwWavePartDifference;
        threads.emplace_back(&CWaves::UpdateWaveNormals, &waves, dwMinRow, dwMaxRow);
        dwMinRow = dwMaxRow;
    }

    // The calling thread takes the final chunk including the remainder rows;
    // previously the remainder was silently skipped.
    waves.UpdateWaveNormals(dwMinRow, waves.RowCount() - 1);

    Join(threads);
    ReleaseThreads(threads);
}
void CThreadFactory::TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix)
{
    if (output.size() != input.size())
        output.resize(input.size());
    if ((muiNumberofThreads <= 1) || (input.size() < 1000))
    {
        object.TransformVerticesSet(input.begin(), output.begin(), input.size() - 1, matrix);
    }
    else
    {
        vector<std::thread> threads(muiNumberofThreads - 1);
        UINT uiThreadVertexCount = input.size() / muiNumberofThreads;
        UINT uiStartVertexIndex = 0;
        for (UINT i = 0; i < muiNumberofThreads - 1; i++)
        {
            if (uiStartVertexIndex >= input.size())
                uiStartVertexIndex = input.size() - 1;
            threads[i] = move(std::thread{ &CObject::TransformVerticesSet, &object, input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix });
            uiStartVertexIndex += uiThreadVertexCount;
        }
        object.TransformVerticesSet(input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix);
        Join(threads);
        ReleaseThreads(threads);
    }
}
// Debug-only definition matching the NAME_FUNC() declaration in the class;
// "Threads" is the name passed for CThreadFactory.
#if (defined(DEBUG) | defined(_DEBUG))
NAME_BODY(CThreadFactory, "Threads");
#endif
// Definition matching DECLARE_HEAP in the class, using the name returned by GetHeapName().
DEFINE_HEAP(CThreadFactory, GetHeapName());

1. 波浪更新：

我正在使用名为 Wave 的对象。此对象默认具有大约 40 000 个顶点。我在每一帧都使用这些函数更新它：

/**
*   Advances the wave simulation by one step for the interior points of rows
*   [min_row, max_row). The result is written into the previous-solution
*   buffer; the current buffer (mpObjectMesh->mVertices) is only read.
*
*   @param min_row first row to update; values below 1 are raised to 1
*   @param max_row one past the last row to update; clamped to RowCount() - 1
*/
void CWaves::UpdateWaveInteriorPoints(DWORD min_row, DWORD max_row)
{
    if (min_row < 1)
        min_row = 1;
    if (max_row > (RowCount() - 1))
        max_row = (RowCount() - 1);

    // Loop-invariant values are fetched once instead of once per grid point.
    const auto columnCount = ColumnCount();
    const auto k1 = GetK1();
    const auto k2 = GetK2();
    const auto k3 = GetK3();
    auto& previous = GetPrevSolutionVertices();
    const auto& current = mpObjectMesh->mVertices;

    for (DWORD i = min_row; i < max_row; ++i)
    {
        for (DWORD j = 1; j < columnCount - 1; ++j)
        {
            // After this update we will be discarding the old previous
            // buffer, so overwrite that buffer with the new update.
            // This can be done in place (read/write to same element)
            // because prev_ij is not needed again and the assignment happens last.
            // Note j indexes x and i indexes z: h(x_j, z_i, t_k)
            // Moreover, our +z axis goes "down"; this is just to
            // keep consistent with our row indices going down.
            const auto center = i*columnCount + j;
            previous[center].Position.y =
                k1*previous[center].Position.y +
                k2*current[center].Position.y +
                k3*(current[center + columnCount].Position.y +
                current[center - columnCount].Position.y +
                current[center + 1].Position.y +
                current[center - 1].Position.y);
        }
    }
}
/**
*   Recomputes the normal and tangent of every interior vertex in rows
*   [min_row, max_row) from the heights of its four direct neighbours
*   (finite differences), then normalizes both vectors.
*
*   @param min_row first row to update; values below 1 are raised to 1
*   @param max_row one past the last row to update; clamped to RowCount() - 1
*/
void CWaves::UpdateWaveNormals(DWORD min_row, DWORD max_row)
{
    if (min_row < 1)
        min_row = 1;
    if (max_row >(RowCount() - 1))
        max_row = (RowCount() - 1);

    // Loop-invariant values are fetched once instead of once per grid point.
    const auto columnCount = ColumnCount();
    const auto doubleStep = 2.0f*GetSpatialStep();
    auto& vertices = mpObjectMesh->mVertices;

    for (UINT i = min_row; i < max_row; ++i)
    {
        for (UINT j = 1; j < columnCount - 1; ++j)
        {
            const auto center = i*columnCount + j;

            // Heights of the left, right, top and bottom neighbours.
            const float l = vertices[center - 1].Position.y;
            const float r = vertices[center + 1].Position.y;
            const float t = vertices[center - columnCount].Position.y;
            const float b = vertices[center + columnCount].Position.y;

            auto& vertex = vertices[center];
            vertex.Normal.x = -r + l;
            vertex.Normal.y = doubleStep;
            vertex.Normal.z = b - t;
            XMVECTOR n = XMVector3Normalize(XMLoadFloat3(&vertex.Normal));
            XMStoreFloat3(&vertex.Normal, n);

            vertex.TangentU = XMFLOAT3(doubleStep, r - l, 0.0f);
            XMVECTOR T = XMVector3Normalize(XMLoadFloat3(&vertex.TangentU));
            XMStoreFloat3(&vertex.TangentU, T);
        }
    }
}
void CWaves::UpdateWave(float dt)
{
    static float t_base = 0.0f;
    if ((g_Timer->TotalTime() - t_base) >= 0.25f)
    {
        t_base += 0.25f;
        DWORD i, j;
        do
        {
            i = 5 + rand() % (RowCount() - 5);
            j = 5 + rand() % (ColumnCount() - 5);
        } while (!((i > 1) && (i < (RowCount() - 2)) &&
            (j > 1) && (j < (ColumnCount() - 2))));
        float r = MathHelper::RandF(1.0f, 2.0f);
        Disturb(i, j, r);
    }
    static float t = 0;
    // Accumulate time.
    t += dt;
    // Only update the simulation at the specified time step.
    if (t >= TimeStep())
    {
        // Only update interior points; we use zero boundary conditions.
        if (g_ThreadFactory)
        {
            g_ThreadFactory->UpdateWavesInternalPoints(*this);
        }
        else
        {
            UpdateWaveInteriorPoints(1, RowCount() - 1);
        }
        // We just overwrote the previous buffer with the new data, so
        // this data needs to become the current solution and the old
        // current solution becomes the new previous solution.
        std::swap(GetPrevSolutionVertices(), mpObjectMesh->mVertices);
        t = 0.0f; // reset time
        if (mShapeDescription.mShapeProperties.bLightedObject)
        {
            //
            // Compute normals using finite difference scheme.
            //
            if (g_ThreadFactory)
            {
                g_ThreadFactory->UpdateWavesNormals(*this);
            }
            else
            {
                UpdateWaveNormals(1, RowCount() - 1);
            }
        }
    }
}

在这种情况下，我认为问题出在我给所有线程的 CWaves 对象中，我认为这会导致持续锁定。所以我改变了另一种情况的方法，我试图使用给定的变换矩阵来转换顶点。我使用的是容器迭代器，而不是整个对象。

2. 顶点变换

从上面显示的线程工厂调用的顶点转换方法：

/**
*   Transforms a run of vertex positions by matrix.
*
*   @param input              iterator to the first source vertex
*   @param output             iterator to the first destination position
*   @param number_of_vertices index of the LAST vertex to process (inclusive),
*                             i.e. number_of_vertices + 1 vertices are transformed
*   @param matrix             transformation applied to every position
*/
void CObject::TransformVerticesSet(vector<TVertex>::const_iterator input, vector<XMFLOAT3>::iterator output, UINT number_of_vertices, const CXNAMatrix& matrix) const
{
    UINT index = 0;
    while (index <= number_of_vertices)
    {
        const TVertex& source = *(input + index);
        XMFLOAT3& target = *(output + index);
        CMatrixTransformations::TransformPoint(source.Position, matrix, target);
        ++index;
    }
}

在这种情况下，我尝试使用迭代器而不是给出整个顶点向量，但结果等于以前的解决方案。它比单线程解决方案慢。

编辑<8.12.2014>

在前面的代码中，我使用以下宏：

TRACE - 用于日志记录系统，在释放模式下为空

NAME_FUNC，NAME_BODY - 用于声明和定义返回类名的类方法的宏

DECLARE_HEAP，DEFINE_HEAP - 为重载的新建和删除运算符创建声明和定义

这些都不会影响多线程操作的性能。

这是我关闭应用程序后 VS 2013 的输出（请注意，在这次运行中，前面两种情况都没有使用多线程）：

The thread 0x229c has exited with code 27 (0x1b).
The thread 0x22dc has exited with code 27 (0x1b).
The thread 0x11ac has exited with code 27 (0x1b).
The thread 0x328c has exited with code 27 (0x1b).
The thread 0x205c has exited with code 27 (0x1b).
The thread 0xf4c has exited with code 27 (0x1b).
The thread 0x894 has exited with code 27 (0x1b).
The thread 0x3094 has exited with code 27 (0x1b).
The thread 0x2eb4 has exited with code 27 (0x1b).
The thread 0x2ef8 has exited with code 27 (0x1b).
The thread 0x22f4 has exited with code 27 (0x1b).
The thread 0x2810 has exited with code 27 (0x1b).
The thread 0x29e0 has exited with code 27 (0x1b).
The thread 0x2e54 has exited with code 27 (0x1b).
D3D11 WARNING: Process is terminating. Using simple reporting. Please call ReportLiveObjects() at runtime for standard reporting. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING: Live Producer at 0x012F05A0, Refcount: 8. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING:  Live Object at 0x012F1D38, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING:  Live Object at 0x013BA3F8, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN]
The program '[13272] EngineDX.exe' has exited with code 27 (0x1b).

似乎第三方 API（可能是 DX）正在创建线程，但在进程管理器中，我只看到一个线程在使用。这可能是一个问题……

所以这是我的问题：

我的线程工厂实现是否如此错误，或者更新 40 000 个顶点不必划分为更多线程？
如果我锁定了，我想知道为什么。顶点转换的解决方案是使用迭代器和顶点向量容器被划分，所以我不应该锁定。
我决定在每次函数调用时创建线程，原因只有一个。起初，我把线程向量容器作为线程工厂的类成员。但这会导致调试模式下的内存泄漏（发布模式没有此问题）。仅仅是声明该成员、什么都不做，就会出现泄漏。我一直不知道为什么。要正确释放线程，还需要做什么吗？
现在我的应用程序以代码 27 结束，因为所有线程都返回此错误代码。什么意思？
奇怪的是，当我使用 8 个线程（在 8 线程 CPU 上使用 7 个工作线程 + 主线程）时，在调试模式下，我看到所有 8 个线程都在工作。但是在发布模式下，与仅使用一个线程（主线程）相比没有任何变化。这是错误的行为，还是出于某些原因可以预期？

很抱歉文本很长，但我想更精确以避免误解。感谢您的回答。

编辑17.12.2014：

我重新实现了线程使用的函数（并使其独立于 Wave 类），没有共享对象引用或变量，但它仍然不起作用。我不明白为什么……有趣的是，当我设置要使用的 8 个线程时，在调试可执行文件中，我看到我的 Core i7 以 100% 的速度运行，但在帧速率方面没有任何好处。使用发布可执行文件，我看到只有 4 个线程运行，CPU 为 25%。

新的多线程函数：

/**
*   Free-function variant of the wave step: updates the interior points of
*   rows [min_row, max_row) and writes the result into the previous-solution
*   buffer. The current buffer is only read.
*
*   max_row is NOT clamped here, so the caller must keep it at or below the
*   last interior row + 1.
*
*   @param previous_vertex_field iterator to the start of the previous-solution buffer (written)
*   @param actual_vertex_field   iterator to the start of the current-solution buffer (read)
*   @param min_row               first row to update; values below 1 are raised to 1
*   @param max_row               one past the last row to update
*   @param k1,k2,k3              simulation coefficients
*   @param column_count          number of columns per grid row
*/
void UpdateWaveInteriorPoints(TVertexFieldIterator previous_vertex_field, TVertexFieldIterator actual_vertex_field, DWORD min_row, DWORD max_row, float k1, float k2, float k3, UINT column_count)
{
    DWORD row = (min_row < 1) ? 1 : min_row;
    while (row < max_row)
    {
        DWORD column = 1;
        while (column < column_count - 1)
        {
            // Indices of the point itself and of its four direct neighbours.
            // column indexes x and row indexes z; +z runs "down" to stay
            // consistent with the row indices.
            const DWORD center = row*column_count + column;
            const DWORD below = (row + 1)*column_count + column;
            const DWORD above = (row - 1)*column_count + column;
            const DWORD right = row*column_count + column + 1;
            const DWORD left = row*column_count + column - 1;

            // The old previous value is consumed and overwritten in place;
            // it is not needed again and the assignment happens last.
            previous_vertex_field[center].Position.y =
                k1*previous_vertex_field[center].Position.y +
                k2*actual_vertex_field[center].Position.y +
                k3*(actual_vertex_field[below].Position.y +
                actual_vertex_field[above].Position.y +
                actual_vertex_field[right].Position.y +
                actual_vertex_field[left].Position.y);

            ++column;
        }
        ++row;
    }
}

创建线程的函数：

TVertexFieldIterator tActualVertexIterator = waves.mpObjectMesh->mVertices.begin();
        TVertexFieldIterator tPreviousVertexIterator = waves.GetPrevSolutionVertices().begin();
        std::vector<std::thread> threads;
        //std::vector<std::future<void>> threads;
        UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;
        DWORD dwMinRow = 1, dwMaxRow = dwWavePartDifference;
        DWORD dwVertexCount = dwWavePartDifference*waves.ColumnCount();
        for (UINT i = 0; i < muiNumberofThreads - 1; i++)
        {
            //threads.emplace_back(std::async( std::launch::async, &CWaves::UpdateWaveInteriorPoints, &waves, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount() ));
            threads.emplace_back(std::thread(&UpdateWaveInteriorPoints, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount()));
            tActualVertexIterator += dwVertexCount;
            tPreviousVertexIterator += dwVertexCount;
        }
        tPreviousVertexIterator -= waves.ColumnCount(); //row - 1
        tActualVertexIterator -= waves.ColumnCount(); //row - 1
        waves.UpdateWaveInteriorPoints(tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount());
        for (UINT i = 0; i < muiNumberofThreads -1; i++)
        {
            //threads[i].wait();
            threads[i].join();
        }

马立克

@mareknr 当我提出你的问题时，侧边栏中有 10 个相关的问题和答案，所有这些都与为什么多线程实现比单线程实现慢有关。我认为其中一个或多个将解决您的问题。以下是其中一些的链接：

多线程 GEMM 比单线程 GEMM 慢？

C++ 由于 CPU 类型，提升多线程比单线程慢？

为什么这个 OpenMP 程序比单线程慢？

2 个线程比 1 个线程慢？