
Why are initial auto-vectorization loads from an aligned std::array scalar? (g++/clang++)


I have trouble understanding what prevents compilers from using initial vector loads when reading data from a std::array<uint64_t, ...>.

I know gcc can produce debug information with -fopt-info-vec-*. I can't find anything in the verbose log that would indicate why both compilers make the same suboptimal decision to use initial scalar loads.

On the other hand, I have no idea how to make clang provide details about vectorization issues. -Rpass-analysis=loop-vectorize only reports that the loop in init is not worth interleaving. Of course my intrinsic version proves that the loop can be vectorized, but the required transformations may be too complex to expect from a compiler.
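
For reference, these are the kinds of report invocations I mean (a sketch: the flag spellings are the documented gcc/clang ones, and vectest.cc is the test file below):

    g++ -O3 -march=native -fopt-info-vec-missed vectest.cc -o vectest
    clang++ -O3 -march=native -Rpass=loop-vectorize \
        -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize \
        vectest.cc -o vectest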

I could of course implement the hot paths with intrinsics, but that would require duplicating the same logic for each CPU architecture. I would prefer to write standard C++ code that compilers can vectorize perfectly; the same code could then simply be compiled multiple times with the target_clones attribute, or with macros and the target attribute.
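
For illustration, a minimal sketch of the target_clones approach I mean (the function here is hypothetical, not from my code):

    #include <cstdint>
    #include <cstddef>

    // gcc compiles one definition per listed ISA and dispatches to the
    // best clone at load time, so the scalar source is written only once.
    __attribute__((target_clones("avx2", "default")))
    uint64_t or_reduce(const uint64_t *p, size_t n)
    {
        uint64_t acc = 0;
        for (size_t i = 0; i < n; i++)
            acc |= p[i];
        return acc;
    }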

How can I make the compiler tell me why these loads fail to vectorize?

I suspect gcc may already print that information and I just don't know what to look for.

Why does auto-vectorization fail for the initial load?

    /**
     * This is a test case removing abstraction layers from my actual code. My
     * real code has one extra problem: access to pack loses alignment
     * information. But that wasn't the only issue; compilers still generate
     * suboptimal machine code even with alignment information present. I fail
     * to understand why loads are treated differently from stores to the
     * same address when auto-vectorization is used.
     *
     * I tested gcc 6.2 and clang 3.9
     * g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
     * clang++ -O3 -g -march=native vectest.cc -o vectest
     */

    #include <array>
    #include <cstdint>
    alignas(32) std::array<uint64_t, 52> pack;
    alignas(32) uint64_t board[4];
    __attribute__((noinline))
    static void init(uint64_t initial)
    {
        /* Clang seems to prefer a large constant table and an unrolled copy,
         * which should perform worse outside a micro-benchmark. L1 misses
         * and memory bandwidth are a bigger bottleneck than ALU instruction
         * execution. But of course this code won't be compiled into the hot
         * path, so I don't care how it is compiled as long as it works
         * correctly.
         *
         * But the most interesting detail from clang is that the vectorized
         * stores are generated correctly, like:
    4005db:       vpsllvq %ymm2,%ymm1,%ymm2
    4005e0:       vmovdqa %ymm2,0x200a78(%rip)        # 601060 <pack>
    4005e8:       vpaddq 0x390(%rip),%ymm0,%ymm2        # 400980 <_IO_stdin_used+0x60>
    4005f0:       vpsllvq %ymm2,%ymm1,%ymm2
    4005f5:       vmovdqa %ymm2,0x200a83(%rip)        # 601080 <pack+0x20>
    4005fd:       vpaddq 0x39b(%rip),%ymm0,%ymm2        # 4009a0 <_IO_stdin_used+0x80>
         *
         * gcc prefers a scalar loop.
         */
        for (unsigned i = 0; i < pack.size(); i++) {
            pack[i] = 1UL << (i + initial);
        }
    }
    #include "immintrin.h"
    __attribute__((noinline))
    static void expected_init(uint64_t initial)
    {
        /** Just an intrinsic implementation of init that would IMO be the
         * ideal optimization.
         */
    #if __AVX2__
        unsigned i;
        union {
            uint64_t *mem;
            __m256i *avx;
        } conv;
        conv.mem = &pack[0];
        __m256i t = _mm256_set_epi64x(
                1UL << 3,
                1UL << 2,
                1UL << 1,
                1UL << 0
                );
        /* initial is just an extra random number to prevent constant array
         * initialization
         */
        t = _mm256_slli_epi64(t, initial);
        for(i = 0; i < pack.size()/4; i++) {
            _mm256_store_si256(&conv.avx[i], t);
            t = _mm256_slli_epi64(t, 4);
        }
    #endif
    }
    __attribute__((noinline))
    static void iter_or()
    {
        /** initial load (clang):
    4006f0:       vmovaps 0x200988(%rip),%xmm0        # 601080 <pack+0x20>
    4006f8:       vorps  0x200960(%rip),%xmm0,%xmm0        # 601060 <pack>
    400700:       vmovaps 0x200988(%rip),%xmm1        # 601090 <pack+0x30>
    400708:       vorps  0x200960(%rip),%xmm1,%xmm1        # 601070 <pack+0x10>
    400710:       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
        * expected:
    400810:       vmovaps 0x200868(%rip),%ymm0        # 601080 <pack+0x20>
    400818:       vorps  0x200840(%rip),%ymm0,%ymm0        # 601060 <pack>
    400820:       vorps  0x200878(%rip),%ymm0,%ymm0        # 6010a0 <pack+0x40>
        */
        auto iter = pack.begin();
        uint64_t n(*iter++),
             e(*iter++),
             s(*iter++),
             w(*iter++);
        for (;iter != pack.end();) {
            n |= *iter++;
            e |= *iter++;
            s |= *iter++;
            w |= *iter++;
        }
        /** The store is correctly vectorized to a single instruction */
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }
    __attribute__((noinline))
    static void index_or()
    {
        /** Clang compiles this to the same code as the iterator variant. gcc
         * goes completely insane. I don't even want to guess what all the
         * permutation stuff is trying to achieve.
         */
        unsigned i;
        uint64_t n(pack[0]),
             e(pack[1]),
             s(pack[2]),
             w(pack[3]);
        for (i = 4 ; i < pack.size(); i+=4) {
            n |= pack[i+0];
            e |= pack[i+1];
            s |= pack[i+2];
            w |= pack[i+3];
        }
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }
    #include "immintrin.h"
    __attribute__((noinline))
    static void expected_result()
    {
        /** An intrinsics implementation of what I would expect
         * auto-vectorization to transform my C++ code into. I simply can't
         * understand why both compilers fail to achieve the result I expect.
         */
    #if __AVX2__
        union {
            uint64_t *mem;
            __m256i *avx;
        } conv;
        conv.mem = &pack[0];
        unsigned i;
        __m256i res = _mm256_load_si256(&conv.avx[0]);
        for (i = 1; i < pack.size()/4; i++) {
            __m256i temp = _mm256_load_si256(&conv.avx[i]);
            res = _mm256_or_si256(res, temp);
        }
        conv.mem = board;
        _mm256_store_si256(conv.avx, res);
    #endif
    }
    int main(int c, char **v)
    {
        (void)v;
        expected_init(c - 1);
        init(c - 1);
        iter_or();
        index_or();
        expected_result();
    }

It seems that both gcc and clang fail to vectorize the initial load from outside the loop. If you change the code to first zero the temporary variables and then OR starting from the first element, both compilers do a better job. Clang generates good unrolled vector code (only the single ymm register is a bottleneck, with every instruction depending on the previous one). GCC generates worse code, with an extra initial vpxor and a pretty bad loop doing one vpor per iteration.
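
A minimal sketch of that zeroed-accumulator variant (my naming; it reuses the pack and board globals from the test case above):

    __attribute__((noinline))
    static void zero_or()
    {
        /* Zero the accumulators and OR from the first element onward,
         * so no separate initial load is needed before the loop. */
        uint64_t n(0), e(0), s(0), w(0);
        for (unsigned i = 0; i < pack.size(); i += 4) {
            n |= pack[i+0];
            e |= pack[i+1];
            s |= pack[i+2];
            w |= pack[i+3];
        }
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }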

I also tested some alternative implementations; the best in micro-benchmarks was clang's unrolled code improved by alternating registers.

/* only reduce (calling this function from a for loop):
 * ST 7.3 cycles (ST=single thread)
 * SMT 15.3 cycles (SMT=simultaneous multi threading aka hyper threading)
 * shuffle+reduce (calling Fisher-Yates shuffle and then this function):
 * ST 222 cycles
 * SMT 383 cycles
 */
    "vmovaps 0x00(%0), %%ymm0\n"
    "vmovaps 0x20(%0), %%ymm1\n"
    "vpor 0x40(%0), %%ymm0, %%ymm0\n"
    "vpor 0x60(%0), %%ymm1, %%ymm1\n"
    "vpor 0x80(%0), %%ymm0, %%ymm0\n"
    "vpor 0xA0(%0), %%ymm1, %%ymm1\n"
    "vpor 0xC0(%0), %%ymm0, %%ymm0\n"
    "vpor 0xE0(%0), %%ymm1, %%ymm1\n"
    "vpor 0x100(%0), %%ymm0, %%ymm0\n"
    "vpor 0x120(%0), %%ymm1, %%ymm1\n"
    "vpor 0x140(%0), %%ymm0, %%ymm0\n"
    "vpor 0x160(%0), %%ymm1, %%ymm1\n"
    "vpor 0x180(%0), %%ymm0, %%ymm0\n"
    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%1)\n"

Clang's unrolled loop has timings like:
/* only reduce:
 * ST 9.8 cycles
 * SMT 21.8 cycles
 * shuffle+reduce:
 * ST 223 cycles
 * SMT 385 cycles
 */

But the numbers showing SMT cutting the performance of the reduce code looked suspicious. I decided to try writing a loop better than gcc's, which was still clearly slower than the unrolled code. Then I decided to break the instruction dependencies by using two registers and unrolling the loop once. This resulted in slightly faster shuffle+reduce code than full unrolling.

size_t end = pack.size() - 3*4;
asm (
/* The best SMT option outside micro optimization.
 * This allows executing two vpor instructions at the same time and
 * reduces the loop count to half with a single unroll.
 *
 * only reduce:
 * ST 13.0 cycles
 * SMT 20.0 cycles
 * shuffle+reduce:
 * ST 221 cycles
 * SMT 380 cycles
 */
    "vmovaps 0x180(%[pack]), %%ymm0\n"
    "vmovaps 0x160(%[pack]), %%ymm1\n"
    "vpor 0x00(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "1:\n"
    "vpor -0x20(%[pack],%[cnt],8), %%ymm1, %%ymm1\n"
    "vpor -0x40(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "sub $8, %[cnt]\n"
    "jne 1b\n"
    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%[out])\n"
    : [cnt]"+r"(end)
    : [pack]"r"(begin), [out]"r"(hands_));

The differences are small when the code runs after a Fisher-Yates shuffle. Even the gcc version, which takes nearly double the time in the reduce-only benchmark (16.4/38.8), runs the shuffle+reduce test at close to the same speed (228/387).