为什么march=native会破坏我的程序?

Why does march=native corrupt my program?

本文关键字:我的 程序 march native 为什么      更新时间:2023-10-16

我正在编译程序:

#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>
// Scoring parameters read by init_dp_matrix when it fills the matrix borders.
struct Model
{
    int open;    // value given to the first border cell
    int extend;  // added once per further step along the border
};
// One matrix entry holding two independent integer values.
struct Cell
{
    int a;
    int b;
};
// Two-dimensional grid of cells stored as a vector of vectors.
using DPMatrix = std::vector<std::vector<Cell>>;
// Writes `matrix` to std::cout, one outer element per output line.
// Each Cell is rendered as "{a b} " (note the trailing space).
void print(const DPMatrix& matrix)
{
for (std::size_t i = 0; i < matrix.size(); ++i) {
for (std::size_t j = 0; j < matrix[i].size(); ++j) {
std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
}
// std::endl ends the line and flushes the stream.
std::cout << std::endl;
}
}
// Builds a matrix of num_cols outer elements, each holding num_rows
// value-initialized cells, then fills two borders from `model`:
//   result[i][0].b = open + (i - 1) * extend   for 1 <= i < num_cols
//   result[0][j].a = open + (j - 1) * extend   for 1 <= j < num_rows
// All other fields keep their initial value. Returns the matrix by value.
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));
// NOTE(review): `inf` is never read in this function. Its initializer mixes int
// with std::size_t, so the product is computed in unsigned arithmetic and then
// narrowed back to int.
const int inf = model.open * std::max(num_cols, num_rows);
// NOTE(review): `i` and `j` are int while the bounds are std::size_t, so both
// loop conditions compare a signed value against an unsigned one.
for (int i = 1; i < num_cols; ++i) {
result[i][0].b = model.open + (i - 1) * model.extend;
}
for (int j = 1; j < num_rows; ++j) {
result[0][j].a = model.open + (j - 1) * model.extend;
}
return result;
}
// Reproduction driver: builds a 10 x 2 matrix with open = -8 and extend = -1,
// then prints it. With these inputs the .b border runs from -8 down to -16 and
// the single .a border cell is -8.
int main()
{
const Model model = {-8, -1};
const DPMatrix matrix = init_dp_matrix(10, 2, model);
print(matrix);
}

在 GCC 9.2.0 中:

$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)

并使用 -march=native 选项:

$ g++-9 -O3 -march=native -o bug bug.cpp

在装有英特尔芯片的 Ubuntu 机器上:

$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 18.04.3 LTS
Release:        18.04
Codename:       bionic
$ grep model /proc/cpuinfo | head -2
model           : 85
model name      : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz

运行程序后,我得到了错误的输出:

$ ./bug 
{0 0} {-8 0} 
{-2048 255} {0 0} 
{-2304 255} {0 0} 
{-2560 255} {0 0} 
{-2816 255} {0 0} 
{-3072 255} {0 0} 
{-3328 255} {0 0} 
{-3584 255} {0 0} 
{-3840 255} {0 0} 
{0 -16} {0 0}

如果我在没有-march=native的情况下编译,我会得到正确的输出:

$ g++-9 -O3 -o bug bug.cpp
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0}

-march=native 版本的汇编代码为:

$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
.file   "bug.cpp"
.text
.section    .text._ZNKSt5ctypeIcE8do_widenEc,"axG",@progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
.align 2
.p2align 4
.weak   _ZNKSt5ctypeIcE8do_widenEc
.type   _ZNKSt5ctypeIcE8do_widenEc, @function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
.cfi_startproc
movl    %esi, %eax
ret
.cfi_endproc
.LFE1303:
.size   _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
.section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "} "
.text
.p2align 4
.globl  _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.type   _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
.cfi_startproc
movq    (%rdi), %rdx
cmpq    %rdx, 8(%rdi)
je  .L23
pushq   %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq   %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq   %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
movabsq $-6148914691236517205, %r13
pushq   %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
xorl    %r12d, %r12d
pushq   %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
movq    %rdi, %rbp
pushq   %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq    $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L4:
leaq    (%r12,%r12,2), %rbx
salq    $3, %rbx
addq    %rbx, %rdx
movq    8(%rdx), %rax
xorl    %r14d, %r14d
cmpq    %rax, (%rdx)
je  .L8
.p2align 4,,10
.p2align 3
.L5:
movl    $1, %edx
leaq    15(%rsp), %rsi
movl    $_ZSt4cout, %edi
movb    $123, 15(%rsp)
call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq    %rax, %rdi
movq    0(%rbp), %rax
leaq    0(,%r14,8), %r15
movq    (%rax,%rbx), %rax
movl    (%rax,%r14,8), %esi
incq    %r14
call    _ZNSolsEi
movq    %rax, %rdi
movl    $1, %edx
leaq    15(%rsp), %rsi
movb    $32, 15(%rsp)
call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq    %rax, %rdi
movq    0(%rbp), %rax
movq    (%rax,%rbx), %rax
movl    4(%rax,%r15), %esi
call    _ZNSolsEi
movq    %rax, %rdi
movl    $2, %edx
movl    $.LC0, %esi
call    _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq    0(%rbp), %rdx
addq    %rbx, %rdx
movq    8(%rdx), %rax
subq    (%rdx), %rax
sarq    $3, %rax
cmpq    %rax, %r14
jb  .L5
.L8:
movq    _ZSt4cout(%rip), %rax
movq    -24(%rax), %rax
movq    _ZSt4cout+240(%rax), %r14
testq   %r14, %r14
je  .L26
cmpb    $0, 56(%r14)
je  .L9
movsbl  67(%r14), %esi
.L10:
movl    $_ZSt4cout, %edi
call    _ZNSo3putEc
movq    %rax, %rdi
call    _ZNSo5flushEv
movq    0(%rbp), %rdx
movq    8(%rbp), %rax
incq    %r12
subq    %rdx, %rax
sarq    $3, %rax
imulq   %r13, %rax
cmpq    %r12, %rax
ja  .L4
addq    $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq    %rbx
.cfi_def_cfa_offset 48
popq    %rbp
.cfi_def_cfa_offset 40
popq    %r12
.cfi_def_cfa_offset 32
popq    %r13
.cfi_def_cfa_offset 24
popq    %r14
.cfi_def_cfa_offset 16
popq    %r15
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L9:
.cfi_restore_state
movq    %r14, %rdi
call    _ZNKSt5ctypeIcE13_M_widen_initEv
movq    (%r14), %rax
movl    $10, %esi
movq    48(%rax), %rax
cmpq    $_ZNKSt5ctypeIcE8do_widenEc, %rax
je  .L10
movq    %r14, %rdi
call    *%rax
movsbl  %al, %esi
jmp .L10
.L23:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
.cfi_restore 13
.cfi_restore 14
.cfi_restore 15
ret
.L26:
.cfi_def_cfa_offset 80
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
.cfi_offset 13, -32
.cfi_offset 14, -24
.cfi_offset 15, -16
call    _ZSt16__throw_bad_castv
.cfi_endproc
.LFE2359:
.size   _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section    .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC2:
.string "cannot create std::vector larger than max_size()"
.section    .text.unlikely,"ax",@progbits
.LCOLDB6:
.text
.LHOTB6:
.p2align 4
.globl  _Z14init_dp_matrixmmRK5Model
.type   _Z14init_dp_matrixmmRK5Model, @function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2360
pushq   %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movabsq $1152921504606846975, %rax
movq    %rsp, %rbp
.cfi_def_cfa_register 6
pushq   %r15
pushq   %r14
pushq   %r13
pushq   %r12
pushq   %rbx
andq    $-32, %rsp
subq    $64, %rsp
.cfi_offset 15, -24
.cfi_offset 14, -32
.cfi_offset 13, -40
.cfi_offset 12, -48
.cfi_offset 3, -56
movq    %rdi, 24(%rsp)
movq    %rsi, 40(%rsp)
movq    %rcx, 16(%rsp)
cmpq    %rax, %rdx
ja  .L103
movq    %rdx, %r15
testq   %rdx, %rdx
je  .L71
leaq    0(,%rdx,8), %rbx
movq    %rbx, %rdi
.LEHB0:
call    _Znwm
.LEHE0:
movq    %rax, %r13
leaq    -1(%r15), %rax
cmpq    $3, %rax
movq    %r15, %rdx
movq    %r13, %rax
jbe .L30
shrq    $2, %rdx
salq    $5, %rdx
addq    %r13, %rdx
vpxor   %xmm0, %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L32:
vmovdqu32   %ymm0, (%rax)
addq    $32, %rax
cmpq    %rdx, %rax
jne .L32
movq    %r15, %rcx
andq    $-4, %rcx
movq    %r15, %rdx
andl    $3, %edx
leaq    0(%r13,%rcx,8), %rax
cmpq    %rcx, %r15
je  .L33
.L30:
movq    $0, (%rax)
cmpq    $1, %rdx
je  .L33
movq    $0, 8(%rax)
cmpq    $2, %rdx
je  .L33
movq    $0, 16(%rax)
cmpq    $3, %rdx
je  .L33
movq    $0, 24(%rax)
.L33:
leaq    0(%r13,%rbx), %rax
movq    %rax, 56(%rsp)
.L29:
movabsq $384307168202282325, %rax
cmpq    %rax, 40(%rsp)
ja  .L104
movq    40(%rsp), %rax
movq    24(%rsp), %r12
leaq    (%rax,%rax,2), %rbx
movq    $0, (%r12)
movq    $0, 8(%r12)
movq    $0, 16(%r12)
salq    $3, %rbx
testq   %rax, %rax
je  .L35
movq    %rbx, %rdi
vzeroupper
.LEHB1:
call    _Znwm
.LEHE1:
addq    %rax, %rbx
movq    %rax, (%r12)
movq    %rax, 8(%r12)
movq    %rbx, 16(%r12)
movq    56(%rsp), %r12
movq    %rax, %r14
subq    %r13, %r12
movq    %r12, %rax
sarq    $3, %rax
je  .L40
movabsq $1152921504606846975, %rdx
cmpq    %rdx, %rax
ja  .L41
movq    40(%rsp), %rax
movq    %r14, %rbx
movq    %rax, 48(%rsp)
.p2align 4,,10
.p2align 3
.L46:
movq    $0, (%rbx)
movq    $0, 8(%rbx)
movq    $0, 16(%rbx)
movq    %r12, %rdi
.LEHB2:
call    _Znwm
.LEHE2:
leaq    (%rax,%r12), %rcx
movq    %rax, (%rbx)
movq    %rcx, 16(%rbx)
movq    %rax, %rdi
cmpq    %r13, 56(%rsp)
je  .L42
movq    %r12, %rdx
movq    %r13, %rsi
movq    %rcx, 32(%rsp)
call    memcpy
movq    32(%rsp), %rcx
addq    $24, %rbx
movq    %rcx, -16(%rbx)
decq    48(%rsp)
jne .L46
movq    24(%rsp), %rax
movq    %rbx, 8(%rax)
.L47:
movq    %r13, %rdi
call    _ZdlPv
.L48:
movq    16(%rsp), %rax
cmpq    $1, 40(%rsp)
movl    (%rax), %edx
jbe .L62
movl    4(%rax), %edi
movq    24(%rsp), %rax
movq    (%rax), %rsi
movq    40(%rsp), %rax
leaq    -2(%rax), %rcx
cmpq    $7, %rcx
jbe .L73
movq    %rcx, %r8
shrq    $3, %r8
leaq    (%r8,%r8,2), %r8
salq    $6, %r8
vmovdqa64   .LC1(%rip), %ymm3
vmovdqa64   .LC3(%rip), %ymm4
vmovdqa64   .LC4(%rip), %ymm6
vmovdqa64   .LC5(%rip), %ymm5
vpbroadcastd    %edi, %ymm10
vpbroadcastd    %edx, %ymm9
leaq    24(%rsi), %rax
leaq    24(%rsi,%r8), %r8
vpcmpeqd    %ymm8, %ymm8, %ymm8
kxnorb  %k1, %k1, %k1
.p2align 4,,10
.p2align 3
.L61:
vmovdqa64   %ymm3, %ymm0
vpaddd  %ymm8, %ymm0, %ymm0
vpmulld %ymm10, %ymm0, %ymm0
vmovdqu64   (%rax), %ymm2
vmovdqu64   96(%rax), %ymm1
vpermt2q    32(%rax), %ymm6, %ymm2
vpermt2q    128(%rax), %ymm6, %ymm1
vpermt2q    64(%rax), %ymm5, %ymm2
vpaddd  %ymm9, %ymm0, %ymm0
vpermt2q    160(%rax), %ymm5, %ymm1
kmovb   %k1, %k2
addq    $192, %rax
vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
vperm2i128  $17, %ymm0, %ymm0, %ymm0
kmovb   %k1, %k3
vpaddd  %ymm4, %ymm3, %ymm3
vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
cmpq    %r8, %rax
jne .L61
andq    $-8, %rcx
leaq    1(%rcx), %r8
leal    1(%rcx), %eax
.L59:
leaq    (%r8,%r8,2), %rcx
movq    (%rsi,%rcx,8), %r8
leal    -1(%rax), %ecx
imull   %edi, %ecx
movq    40(%rsp), %rbx
addl    %edx, %ecx
movl    %ecx, 4(%r8)
leal    1(%rax), %ecx
movslq  %ecx, %r8
cmpq    %r8, %rbx
jbe .L62
leaq    (%r8,%r8,2), %r8
movq    (%rsi,%r8,8), %r9
movl    %edi, %r8d
imull   %eax, %r8d
addl    %edx, %r8d
movl    %r8d, 4(%r9)
leal    2(%rax), %r8d
movslq  %r8d, %r9
cmpq    %r9, %rbx
jbe .L62
imull   %edi, %ecx
leaq    (%r9,%r9,2), %r9
movq    (%rsi,%r9,8), %r9
addl    %edx, %ecx
movl    %ecx, 4(%r9)
leal    3(%rax), %ecx
movslq  %ecx, %r9
cmpq    %r9, %rbx
jbe .L62
imull   %edi, %r8d
leaq    (%r9,%r9,2), %r9
movq    (%rsi,%r9,8), %r9
addl    %edx, %r8d
movl    %r8d, 4(%r9)
leal    4(%rax), %r8d
movslq  %r8d, %r9
cmpq    %r9, %rbx
jbe .L62
imull   %edi, %ecx
leaq    (%r9,%r9,2), %r9
movq    (%rsi,%r9,8), %r9
addl    %edx, %ecx
movl    %ecx, 4(%r9)
leal    5(%rax), %ecx
movslq  %ecx, %r9
cmpq    %r9, %rbx
jbe .L62
imull   %edi, %r8d
leaq    (%r9,%r9,2), %r9
movq    (%rsi,%r9,8), %r9
addl    %edx, %r8d
movl    %r8d, 4(%r9)
leal    6(%rax), %r8d
movslq  %r8d, %r9
cmpq    %r9, %rbx
jbe .L62
imull   %edi, %ecx
leaq    (%r9,%r9,2), %r9
movq    (%rsi,%r9,8), %r9
addl    $7, %eax
addl    %edx, %ecx
cltq
movl    %ecx, 4(%r9)
cmpq    %rax, %rbx
jbe .L62
imull   %r8d, %edi
leaq    (%rax,%rax,2), %rax
movq    (%rsi,%rax,8), %rax
leal    (%rdi,%rdx), %r8d
movl    %r8d, 4(%rax)
.L62:
cmpq    $1, %r15
jbe .L27
movq    16(%rsp), %rax
leaq    -1(%r15), %r8
movl    4(%rax), %edi
movq    24(%rsp), %rax
movq    (%rax), %rax
movq    (%rax), %rsi
leaq    -2(%r15), %rax
cmpq    $6, %rax
jbe .L74
movq    %r8, %rcx
shrq    $3, %rcx
salq    $6, %rcx
vmovdqa64   .LC1(%rip), %ymm2
vmovdqa64   .LC3(%rip), %ymm4
vpbroadcastd    %edi, %ymm6
vpbroadcastd    %edx, %ymm5
movq    %rsi, %rax
addq    %rsi, %rcx
vpcmpeqd    %ymm3, %ymm3, %ymm3
.p2align 4,,10
.p2align 3
.L66:
vmovdqa64   %ymm2, %ymm0
vpaddd  %ymm3, %ymm0, %ymm0
vpmulld %ymm6, %ymm0, %ymm0
addq    $64, %rax
vpaddd  %ymm4, %ymm2, %ymm2
vpaddd  %ymm5, %ymm0, %ymm0
vmovd   %xmm0, -56(%rax)
vpextrd $1, %xmm0, -48(%rax)
vpextrd $2, %xmm0, -40(%rax)
vpextrd $3, %xmm0, -32(%rax)
vextracti128    $0x1, %ymm0, %xmm0
vmovd   %xmm0, -24(%rax)
vpextrd $1, %xmm0, -16(%rax)
vpextrd $2, %xmm0, -8(%rax)
vpextrd $3, %xmm0, (%rax)
cmpq    %rcx, %rax
jne .L66
movq    %r8, %rcx
andq    $-8, %rcx
leaq    1(%rcx), %r9
leal    1(%rcx), %eax
cmpq    %r8, %rcx
je  .L27
.L64:
leal    -1(%rax), %ecx
imull   %edi, %ecx
addl    %edx, %ecx
movl    %ecx, (%rsi,%r9,8)
leal    1(%rax), %ecx
movslq  %ecx, %r9
cmpq    %r15, %r9
jnb .L27
movl    %edi, %r8d
imull   %eax, %r8d
addl    %edx, %r8d
movl    %r8d, (%rsi,%r9,8)
leal    2(%rax), %r8d
movslq  %r8d, %r9
cmpq    %r9, %r15
jbe .L27
imull   %edi, %ecx
addl    %edx, %ecx
movl    %ecx, (%rsi,%r9,8)
leal    3(%rax), %ecx
movslq  %ecx, %r9
cmpq    %r15, %r9
jnb .L27
imull   %edi, %r8d
addl    %edx, %r8d
movl    %r8d, (%rsi,%r9,8)
leal    4(%rax), %r8d
movslq  %r8d, %r9
cmpq    %r9, %r15
jbe .L27
imull   %edi, %ecx
addl    %edx, %ecx
movl    %ecx, (%rsi,%r9,8)
leal    5(%rax), %ecx
movslq  %ecx, %r9
cmpq    %r9, %r15
jbe .L27
imull   %edi, %r8d
addl    $6, %eax
cltq
addl    %edx, %r8d
movl    %r8d, (%rsi,%r9,8)
cmpq    %rax, %r15
jbe .L27
imull   %ecx, %edi
addl    %edi, %edx
movl    %edx, (%rsi,%rax,8)
.L27:
movq    24(%rsp), %rax
vzeroupper
leaq    -40(%rbp), %rsp
popq    %rbx
popq    %r12
popq    %r13
popq    %r14
popq    %r15
popq    %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.p2align 4,,10
.p2align 3
.L37:
.cfi_restore_state
movq    %r12, 8(%r14)
addq    $24, %r14
cmpq    %r14, %rbx
je  .L45
.L40:
movq    $0, (%r14)
movq    %r12, 16(%r14)
cmpq    %r13, 56(%rsp)
je  .L37
movq    %r12, %rdx
movq    %r13, %rsi
xorl    %edi, %edi
call    memcpy
addq    $24, %r14
movq    %r12, -16(%r14)
cmpq    %r14, %rbx
jne .L40
.L45:
movq    24(%rsp), %rax
movq    %rbx, 8(%rax)
testq   %r13, %r13
je  .L48
.L105:
movq    %r13, %rdi
call    _ZdlPv
jmp .L48
.p2align 4,,10
.p2align 3
.L42:
movq    %rcx, 8(%rbx)
addq    $24, %rbx
decq    48(%rsp)
jne .L46
movq    24(%rsp), %rax
movq    %rbx, 8(%rax)
testq   %r13, %r13
je  .L48
jmp .L105
.p2align 4,,10
.p2align 3
.L71:
movq    $0, 56(%rsp)
xorl    %r13d, %r13d
jmp .L29
.p2align 4,,10
.p2align 3
.L35:
testq   %r13, %r13
je  .L106
vzeroupper
jmp .L47
.L73:
movl    $1, %eax
movl    $1, %r8d
jmp .L59
.L74:
movl    $1, %eax
movl    $1, %r9d
jmp .L64
.L106:
movq    16(%rsp), %rax
movl    (%rax), %edx
jmp .L62
.L41:
movq    $0, (%r14)
movq    $0, 8(%r14)
movq    $0, 16(%r14)
.LEHB3:
call    _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
movl    $.LC2, %edi
vzeroupper
.LEHB4:
call    _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
movl    $.LC2, %edi
.LEHB5:
call    _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
movq    %rax, %rdi
jmp .L49
.L77:
movq    %rax, %rdi
jmp .L50
.L75:
movq    %rax, %r12
vzeroupper
jmp .L56
.globl  __gxx_personality_v0
.section    .gcc_except_table,"a",@progbits
.align 4
.LLSDA2360:
.byte   0xff
.byte   0x3
.uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
.byte   0x1
.uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
.uleb128 .LEHB0-.LFB2360
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB2360
.uleb128 .LEHE1-.LEHB1
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB2-.LFB2360
.uleb128 .LEHE2-.LEHB2
.uleb128 .L77-.LFB2360
.uleb128 0x1
.uleb128 .LEHB3-.LFB2360
.uleb128 .LEHE3-.LEHB3
.uleb128 .L78-.LFB2360
.uleb128 0x1
.uleb128 .LEHB4-.LFB2360
.uleb128 .LEHE4-.LEHB4
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB5-.LFB2360
.uleb128 .LEHE5-.LEHB5
.uleb128 0
.uleb128 0
.LLSDACSE2360:
.byte   0x1
.byte   0
.align 4
.long   0
.LLSDATT2360:
.text
.cfi_endproc
.section    .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2360
.type   _Z14init_dp_matrixmmRK5Model.cold, @function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
.cfi_def_cfa 6, 16
.cfi_offset 3, -56
.cfi_offset 6, -16
.cfi_offset 12, -48
.cfi_offset 13, -40
.cfi_offset 14, -32
.cfi_offset 15, -24
movq    %r14, %rbx
.L50:
vzeroupper
call    __cxa_begin_catch
.L53:
cmpq    %rbx, %r14
jne .L107
.LEHB6:
call    __cxa_rethrow
.LEHE6:
.L76:
movq    %rax, %r12
vzeroupper
call    __cxa_end_catch
movq    24(%rsp), %rax
movq    (%rax), %rdi
testq   %rdi, %rdi
je  .L56
call    _ZdlPv
.L56:
testq   %r13, %r13
je  .L69
movq    %r13, %rdi
call    _ZdlPv
.L69:
movq    %r12, %rdi
.LEHB7:
call    _Unwind_Resume
.LEHE7:
.L107:
movq    (%r14), %rdi
testq   %rdi, %rdi
je  .L52
call    _ZdlPv
.L52:
addq    $24, %r14
jmp .L53
.cfi_endproc
.LFE2360:
.section    .gcc_except_table
.align 4
.LLSDAC2360:
.byte   0xff
.byte   0x3
.uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
.byte   0x1
.uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
.uleb128 .LEHB6-.LCOLDB6
.uleb128 .LEHE6-.LEHB6
.uleb128 .L76-.LCOLDB6
.uleb128 0
.uleb128 .LEHB7-.LCOLDB6
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSEC2360:
.byte   0x1
.byte   0
.align 4
.long   0
.LLSDATTC2360:
.section    .text.unlikely
.text
.size   _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
.section    .text.unlikely
.size   _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
.text
.LHOTE6:
.section    .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",@progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
.align 2
.p2align 4
.weak   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.type   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, @function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
.cfi_startproc
pushq   %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
movq    %rdi, %r12
pushq   %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq   %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq    8(%rdi), %rbx
movq    (%rdi), %rbp
cmpq    %rbp, %rbx
je  .L109
.p2align 4,,10
.p2align 3
.L113:
movq    0(%rbp), %rdi
testq   %rdi, %rdi
je  .L110
addq    $24, %rbp
call    _ZdlPv
cmpq    %rbp, %rbx
jne .L113
.L111:
movq    (%r12), %rbp
.L109:
testq   %rbp, %rbp
je  .L115
popq    %rbx
.cfi_remember_state
.cfi_def_cfa_offset 24
movq    %rbp, %rdi
popq    %rbp
.cfi_def_cfa_offset 16
popq    %r12
.cfi_def_cfa_offset 8
jmp _ZdlPv
.p2align 4,,10
.p2align 3
.L110:
.cfi_restore_state
addq    $24, %rbp
cmpq    %rbp, %rbx
jne .L113
jmp .L111
.p2align 4,,10
.p2align 3
.L115:
popq    %rbx
.cfi_def_cfa_offset 24
popq    %rbp
.cfi_def_cfa_offset 16
popq    %r12
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE2637:
.size   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.weak   _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
.set    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.section    .text.unlikely
.LCOLDB7:
.section    .text.startup,"ax",@progbits
.LHOTB7:
.p2align 4
.globl  main
.type   main, @function
main:
.LFB2371:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2371
pushq   %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl    $2, %edx
movl    $10, %esi
subq    $48, %rsp
.cfi_def_cfa_offset 64
leaq    16(%rsp), %rdi
leaq    8(%rsp), %rcx
movq    $-8, 8(%rsp)
.LEHB8:
call    _Z14init_dp_matrixmmRK5Model
.LEHE8:
leaq    16(%rsp), %rdi
.LEHB9:
call    _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
leaq    16(%rsp), %rdi
call    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
addq    $48, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 16
xorl    %eax, %eax
popq    %rbp
.cfi_def_cfa_offset 8
ret
.L119:
.cfi_restore_state
movq    %rax, %rbp
jmp .L118
.section    .gcc_except_table
.LLSDA2371:
.byte   0xff
.byte   0xff
.byte   0x1
.uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
.uleb128 .LEHB8-.LFB2371
.uleb128 .LEHE8-.LEHB8
.uleb128 0
.uleb128 0
.uleb128 .LEHB9-.LFB2371
.uleb128 .LEHE9-.LEHB9
.uleb128 .L119-.LFB2371
.uleb128 0
.LLSDACSE2371:
.section    .text.startup
.cfi_endproc
.section    .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2371
.type   main.cold, @function
main.cold:
.LFSB2371:
.L118:
.cfi_def_cfa_offset 64
.cfi_offset 6, -16
leaq    16(%rsp), %rdi
vzeroupper
call    _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
movq    %rbp, %rdi
.LEHB10:
call    _Unwind_Resume
.LEHE10:
.cfi_endproc
.LFE2371:
.section    .gcc_except_table
.LLSDAC2371:
.byte   0xff
.byte   0xff
.byte   0x1
.uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
.uleb128 .LEHB10-.LCOLDB7
.uleb128 .LEHE10-.LEHB10
.uleb128 0
.uleb128 0
.LLSDACSEC2371:
.section    .text.unlikely
.section    .text.startup
.size   main, .-main
.section    .text.unlikely
.size   main.cold, .-main.cold
.LCOLDE7:
.section    .text.startup
.LHOTE7:
.p2align 4
.type   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, @function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
.cfi_startproc
subq    $8, %rsp
.cfi_def_cfa_offset 16
movl    $_ZStL8__ioinit, %edi
call    _ZNSt8ios_base4InitC1Ev
movl    $__dso_handle, %edx
movl    $_ZStL8__ioinit, %esi
movl    $_ZNSt8ios_base4InitD1Ev, %edi
addq    $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit
.cfi_endproc
.LFE3017:
.size   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section    .init_array,"aw"
.align 8
.quad   _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.local  _ZStL8__ioinit
.comm   _ZStL8__ioinit,1,1
.section    .rodata.cst32,"aM",@progbits,32
.align 32
.LC1:
.long   1
.long   2
.long   3
.long   4
.long   5
.long   6
.long   7
.long   8
.align 32
.LC3:
.long   8
.long   8
.long   8
.long   8
.long   8
.long   8
.long   8
.long   8
.align 32
.LC4:
.quad   0
.quad   3
.quad   6
.quad   0
.align 32
.LC5:
.quad   0
.quad   1
.quad   2
.quad   5
.hidden __dso_handle
.ident  "GCC: (Homebrew GCC 9.2.0) 9.2.0"
.section    .note.GNU-stack,"",@progbits

-march=native 版本的汇编代码也可在 godbolt 上找到。

出了什么问题,这是编译器错误还是我的程序格式不正确?如果是编译器错误,如何缓解此问题?

附加信息

使用 -v 编译

$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0) 
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
/home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP

使用 -O2 或更低的优化级别进行编译会使问题消失:

$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0}

我尝试在另一台装有英特尔芯片的机器上构建:

$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64
$ grep model /proc/cpuinfo | head -2
model       : 85
model name  : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc@9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0) 
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc@9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
$ ./bug 
{0 0} {-8 0} 
{0 -8} {0 0} 
{0 -9} {0 0} 
{0 -10} {0 0} 
{0 -11} {0 0} 
{0 -12} {0 0} 
{0 -13} {0 0} 
{0 -14} {0 0} 
{0 -15} {0 0} 
{0 -16} {0 0} 

正确的输出...

-ftree-loop-vectorize是罪魁祸首

$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0} 
{-2048 255} {0 0} 
{-2304 255} {0 0} 
{-2560 255} {0 0} 
{-2816 255} {0 0} 
{-3072 255} {0 0} 
{-3328 255} {0 0} 
{-3584 255} {0 0} 
{-3840 255} {0 0} 
{0 -16} {0 0}

-O3 启用的其他优化标志均不会导致此行为。

事实证明,这是由 binutils 中的汇编器 gas(GNU assembler)的一个错误导致的。解决办法是将 binutils 升级到 2.32。

相关文章: