通过指针访问变量怎么会比直接访问变量更快?

How can a variable be accessed faster via a pointer than directly?

本文关键字:变量 访问 然后 何通过 指针      更新时间:2023-10-16

我正在使用下面的程序来测试各种简单操作的速度。在这里,我测试的是将变量加载到寄存器中的速度:

#include <iostream>
#include <Windows.h>
// Micro-benchmark: times 100,000,000 inline-asm loads of `val`, first directly
// (mov eax, val) and then through the pointer `ptra` (two movs). Each variant
// is repeated 20 times and the GetTickCount() delta of every run is printed.
// NOTE(review): standard C++ requires `int main()`; `void main()` is non-standard.
void main()
{
DWORD _time;
int val = 1;
int* ptra = &val;
// Variant 1: direct load of the local variable.
for (auto a = 0; a < 20; a++)
{
_time = GetTickCount();
for (auto i = 0; i < 100000000; i++)
{
_asm
{
mov         eax, val
}
}
// Elapsed tick count for this run.
_time = GetTickCount() - _time;
std::cout << _time << std::endl;
};
// NOTE(review): `buf` is not declared anywhere in this snippet, so this line
// cannot compile as shown — confirm what was meant to be printed here.
std::cout << buf << std::endl;
system("pause");
// Variant 2: load the pointer into eax, then load the pointee through eax into ebx.
for (auto a = 0; a < 20; a++)
{
_time = GetTickCount();
for (auto i = 0; i < 100000000; i++)
{
_asm
{
mov     eax, dword ptr[ptra]
mov     ebx, dword ptr[eax]
}
}
// Elapsed tick count for this run.
_time = GetTickCount() - _time;
std::cout << _time << std::endl;
};
// NOTE(review): same undeclared `buf` as above.
std::cout << buf << std::endl;
system("pause");
}

在我的电脑上,两项测试的平均值分别为 234 和 203。

出于某种原因,通过指针访问比直接访问更快。我已经在两台 PC 上对此进行了测试,并得到了相同的结果。起初我认为是某种 CPU 优化在起作用,但这意味着使用指针比直接使用变量本身更高效,这听起来很奇怪。现在我想是我做错了什么,可能是 GetTickCount() 有问题。无论如何,我没有找到任何有助于理解其中原因的资料,有人能解释一下吗?

使用二级指针的结果与直接访问的结果相同;指针链越长,运行速度越慢。

用常量 1 替换 val 会快一些,但仍然比通过指针慢。

空循环需要相同的时间才能完成。任何更改都不会影响行为。

这是我目前使用的代码

#include <iostream>
#include <Windows.h>
int* ptra;
int val;
void main()
{
DWORD _time;
val = 1;
ptra = &val;
HANDLE TH = GetCurrentThread();
HANDLE PH = GetCurrentProcess();
PDWORD_PTR APMask = new ULONG_PTR;
PDWORD_PTR ASMask = new ULONG_PTR;
ULONG_PTR Core = 1;
GetProcessAffinityMask(PH, APMask, ASMask);
while (!(Core && *APMask)) Core = Core << 1;
SetThreadAffinityMask(TH, Core);
SetThreadPriority(TH, THREAD_PRIORITY_TIME_CRITICAL);
SetPriorityClass(PH, REALTIME_PRIORITY_CLASS);
DWORD64 ProcessorTime;
#define order 0
#define loops 10000000
//=========================================
for (auto a = 0; a < 20; a++)
{
//_time = GetTickCount();
ProcessorTime = __rdtsc();
for (auto i = 0; i < loops; i++)
{
_asm
{
#if order == 1
mov     eax, dword ptr[ptra]
mov     ebx, dword ptr[eax]
#else
mov         eax, val
#endif
}
}
//_time = GetTickCount() - _time;
ProcessorTime = __rdtsc() - ProcessorTime;
std::cout << ProcessorTime << std::endl;
};
//system("pause");
//=========================================
std::cout << "=" << std::endl;
//=========================================
for (auto a = 0; a < 20; a++)
{
//_time = GetTickCount();
ProcessorTime = __rdtsc();
for (auto i = 0; i < loops; i++)
{
_asm
{
#if order == 1
mov         eax, val
#else
mov     eax, dword ptr[ptra]
mov     ebx, dword ptr[eax]
#endif
}
}
//_time = GetTickCount() - _time;
ProcessorTime = __rdtsc() - ProcessorTime;
std::cout << ProcessorTime << std::endl;
};
//=========================================
SetPriorityClass(PH, NORMAL_PRIORITY_CLASS);
SetThreadPriority(TH, THREAD_PRIORITY_NORMAL);
SetThreadAffinityMask(TH, *APMask);
system("pause");
}

ASM 代码:

; Compiler listing for main(): prologue, affinity/priority setup, two timed
; loops (direct load of val, then load through ptra), and epilogue.
; The "; NN :" lines are the C++ source lines the instructions belong to.
; In this listing val and ptra are locals addressed off ebp.
; 5    : {
push    ebp
mov ebp, esp
sub esp, 124                ; 0000007cH
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; ebx/esi/edi are saved here and popped again before ret.
push    ebx
push    esi
push    edi
; 6    :    //DWORD _time;
; 7    : 
; 8    :    int* ptra;
; 9    :    int val;
; 10   : 
; 11   :    val = 1;
mov DWORD PTR _val$[ebp], 1
; 12   :    ptra = &val;
lea eax, DWORD PTR _val$[ebp]
mov DWORD PTR _ptra$[ebp], eax
; 13   :    short unsigned a;
; 14   : 
; 15   :    HANDLE TH = GetCurrentThread();
call    DWORD PTR __imp__GetCurrentThread@0
mov DWORD PTR _TH$[ebp], eax
; 16   :    HANDLE PH = GetCurrentProcess();
call    DWORD PTR __imp__GetCurrentProcess@0
mov DWORD PTR _PH$[ebp], eax
; 17   :    PDWORD_PTR APMask = new ULONG_PTR;
push    4
call    ??2@YAPAXI@Z                ; operator new
add esp, 4
mov DWORD PTR $T2[ebp], eax
mov eax, DWORD PTR $T2[ebp]
mov DWORD PTR _APMask$[ebp], eax
; 18   :    PDWORD_PTR ASMask = new ULONG_PTR;
push    4
call    ??2@YAPAXI@Z                ; operator new
add esp, 4
mov DWORD PTR $T1[ebp], eax
mov eax, DWORD PTR $T1[ebp]
mov DWORD PTR _ASMask$[ebp], eax
; 19   :    ULONG_PTR Core = 1;
mov DWORD PTR _Core$[ebp], 1
; 20   :    GetProcessAffinityMask(PH, APMask, ASMask);
mov eax, DWORD PTR _ASMask$[ebp]
push    eax
mov ecx, DWORD PTR _APMask$[ebp]
push    ecx
mov edx, DWORD PTR _PH$[ebp]
push    edx
call    DWORD PTR __imp__GetProcessAffinityMask@12
$LN2@main:
; 21   :    while (!(Core && *APMask)) Core = Core << 1;
; `Core && *APMask` compiles to two separate compare-with-zero tests
; (a logical AND), not a bit test of Core against the mask.
cmp DWORD PTR _Core$[ebp], 0
je  SHORT $LN16@main
mov eax, DWORD PTR _APMask$[ebp]
cmp DWORD PTR [eax], 0
jne SHORT $LN3@main
$LN16@main:
mov eax, DWORD PTR _Core$[ebp]
shl eax, 1
mov DWORD PTR _Core$[ebp], eax
jmp SHORT $LN2@main
$LN3@main:
; 22   :    SetThreadAffinityMask(TH, Core);
mov eax, DWORD PTR _Core$[ebp]
push    eax
mov ecx, DWORD PTR _TH$[ebp]
push    ecx
call    DWORD PTR __imp__SetThreadAffinityMask@8
; 23   :    SetThreadPriority(TH, THREAD_PRIORITY_TIME_CRITICAL);
push    15                  ; 0000000fH
mov eax, DWORD PTR _TH$[ebp]
push    eax
call    DWORD PTR __imp__SetThreadPriority@8
; 24   :    SetPriorityClass(PH, REALTIME_PRIORITY_CLASS);
push    256                 ; 00000100H
mov eax, DWORD PTR _PH$[ebp]
push    eax
call    DWORD PTR __imp__SetPriorityClass@8
; 25   :    DWORD64 ProcessorTime;
; 26   : #define order 0
; 27   : #define loops 10000000
; 28   : #define tests 50
; 29   : 
; 30   :    //=========================================
; 31   :    for (a = 0; a < tests; a++)
xor eax, eax
mov WORD PTR _a$[ebp], ax
jmp SHORT $LN6@main
$LN4@main:
mov ax, WORD PTR _a$[ebp]
add ax, 1
mov WORD PTR _a$[ebp], ax
$LN6@main:
movzx   eax, WORD PTR _a$[ebp]
cmp eax, 50                 ; 00000032H
jge SHORT $LN5@main
; 32   :    {
; 33   :        //_time = GetTickCount();
; 34   :        ProcessorTime = __rdtsc();
; rdtsc returns the counter in edx:eax; both halves are stored to the 64-bit local.
rdtsc
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 35   :        for (auto i = 0; i < loops; i++)
mov DWORD PTR _i$4[ebp], 0
jmp SHORT $LN9@main
; First timed loop. The counter i lives in memory: every iteration does a
; load/add/store of i plus a compare against memory, besides the measured mov.
$LN7@main:
mov eax, DWORD PTR _i$4[ebp]
add eax, 1
mov DWORD PTR _i$4[ebp], eax
$LN9@main:
cmp DWORD PTR _i$4[ebp], 10000000       ; 00989680H
jge SHORT $LN8@main
; 36   :        {
; 37   :            _asm
; 38   :            {
; 39   : #if order == 1
; 40   :                mov     eax, dword ptr[ptra]
; 41   :                mov     ebx, dword ptr[eax]
; 42   : #else
; 43   :                mov         eax, val
; Measured instruction, direct variant: a single load of val.
mov eax, DWORD PTR _val$[ebp]
; 44   : #endif
; 45   :            }
; 46   :        }
jmp SHORT $LN7@main
$LN8@main:
; 47   :        //_time = GetTickCount() - _time;
; 48   :        ProcessorTime = __rdtsc() - ProcessorTime;
; 64-bit subtraction done as sub on the low half, sbb on the high half.
rdtsc
sub eax, DWORD PTR _ProcessorTime$[ebp]
sbb edx, DWORD PTR _ProcessorTime$[ebp+4]
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 49   :        std::cout << ProcessorTime << std::endl;
push    OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
mov eax, DWORD PTR _ProcessorTime$[ebp+4]
push    eax
mov ecx, DWORD PTR _ProcessorTime$[ebp]
push    ecx
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@_K@Z
mov ecx, eax
call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z
; 50   :    };
jmp SHORT $LN4@main
$LN5@main:
; 51   :    //system("pause");
; 52   :    //=========================================
; 53   :    std::cout << "=" << std::endl;
push    OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
push    OFFSET ??_C@_01NEMOKFLO@?$DN?$AA@
mov eax, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
push    eax
call    ??$?6U?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@PBD@Z ; std::operator<<<std::char_traits<char> >
add esp, 8
mov ecx, eax
call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z
; 54   :    //=========================================
; 55   :    for (a = 0; a < tests; a++)
xor eax, eax
mov WORD PTR _a$[ebp], ax
jmp SHORT $LN12@main
$LN10@main:
mov ax, WORD PTR _a$[ebp]
add ax, 1
mov WORD PTR _a$[ebp], ax
$LN12@main:
movzx   eax, WORD PTR _a$[ebp]
cmp eax, 50                 ; 00000032H
jge SHORT $LN11@main
; 56   :    {
; 57   :        //_time = GetTickCount();
; 58   :        ProcessorTime = __rdtsc();
rdtsc
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 59   :        for (auto i = 0; i < loops; i++)
mov DWORD PTR _i$3[ebp], 0
jmp SHORT $LN15@main
; Second timed loop. Same memory-resident counter pattern as the first loop.
$LN13@main:
mov eax, DWORD PTR _i$3[ebp]
add eax, 1
mov DWORD PTR _i$3[ebp], eax
$LN15@main:
cmp DWORD PTR _i$3[ebp], 10000000       ; 00989680H
jge SHORT $LN14@main
; 60   :        {
; 61   :            _asm
; 62   :            {
; 63   : #if order == 1
; 64   :                mov         eax, val
; 65   : #else
; 66   :                mov     eax, dword ptr[ptra]
; Measured instructions, pointer variant: load ptra into eax, then load the
; pointee through eax into ebx (the second load needs the result of the first).
mov eax, DWORD PTR _ptra$[ebp]
; 67   :                mov     ebx, dword ptr[eax]
mov ebx, DWORD PTR [eax]
; 68   : #endif
; 69   :            }
; 70   :        }
jmp SHORT $LN13@main
$LN14@main:
; 71   :        //_time = GetTickCount() - _time;
; 72   :        ProcessorTime = __rdtsc() - ProcessorTime;
rdtsc
sub eax, DWORD PTR _ProcessorTime$[ebp]
sbb edx, DWORD PTR _ProcessorTime$[ebp+4]
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 73   :        std::cout << ProcessorTime << std::endl;
push    OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
mov eax, DWORD PTR _ProcessorTime$[ebp+4]
push    eax
mov ecx, DWORD PTR _ProcessorTime$[ebp]
push    ecx
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@_K@Z
mov ecx, eax
call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z
; 74   :    };
jmp SHORT $LN10@main
$LN11@main:
; 75   :    //=========================================
; 76   :    SetPriorityClass(PH, NORMAL_PRIORITY_CLASS);
push    32                  ; 00000020H
mov eax, DWORD PTR _PH$[ebp]
push    eax
call    DWORD PTR __imp__SetPriorityClass@8
; 77   :    SetThreadPriority(TH, THREAD_PRIORITY_NORMAL);
push    0
mov eax, DWORD PTR _TH$[ebp]
push    eax
call    DWORD PTR __imp__SetThreadPriority@8
; 78   :    SetThreadAffinityMask(TH, *APMask);
mov eax, DWORD PTR _APMask$[ebp]
mov ecx, DWORD PTR [eax]
push    ecx
mov edx, DWORD PTR _TH$[ebp]
push    edx
call    DWORD PTR __imp__SetThreadAffinityMask@8
; 79   :    system("pause");
push    OFFSET ??_C@_05PDJBBECF@pause?$AA@
call    DWORD PTR __imp__system
add esp, 4
; 80   : }
; The second jmp below directly follows an unconditional jmp and is never reached.
jmp SHORT $LN19@main
jmp SHORT $LN18@main
$LN19@main:
xor eax, eax
$LN18@main:
pop edi
pop esi
pop ebx
; Epilogue: re-derive the cookie stored in the prologue and pass it to the check routine.
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call    @__security_check_cookie@4
mov esp, ebp
pop ebp
ret 0

这是我在 MSVC++ 14.0 中关闭所有优化后得到的(开启优化会把所有无用的代码删掉,测试代码也会被一并删掉)。

您没有在相关代码中使用循环对齐,这可能会给结果带来足够大的波动,使其变得毫无意义。$LN13@main: 和 $LN7@main: 必须以相同的方式对齐(具体对齐多少取决于您的平台;实际上,您可能应该在 ProcessorTime = __rdtsc(); 之前对齐整个代码块,并且两种情况下都可以跳转到该地址——在此之前先清除缓存或预热缓存,取决于您要测量的内容)。

话虽如此,在现代 x86 上,您的两段代码在指令层面都远未占满单个核心,因此这类 mov 在实际代码中很可能会与其他指令并行执行。所有的"时间"都消耗在访问栈内存和依赖关系上。例如,在第二种间接访问的情况下,如果您只使用 eax 作为两个 mov 的目的寄存器,也许它会因为冲突更大而变慢。嗯……可能也不会,CPU 在每次新的循环迭代中仍然有很多空闲的寄存器,因此它很可能通过每次重命名 eax 来避免任何虚假的依赖/冲突。

总的来说,测量"脚手架"代码是没有意义的,您应该测量真正在做事的算法,因为您所测量的东西至多只会多花 1 个周期,而这很可能淹没在真实代码瓶颈所占的数十乃至数百个周期之中。