首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >为什么从对齐的std::array进行的初始加载在自动矢量化后仍是标量的?(g++/clang++)

为什么从对齐的std::array进行的初始加载在自动矢量化后仍是标量的?(g++/clang++)
EN

Stack Overflow用户
提问于 2016-12-24 06:39:31
回答 1查看 176关注 0票数 2

在从std::array读取数据时,我很难理解是什么阻止编译器使用初始向量加载。

我知道gcc可以使用-fopt-info-vec-*生成调试信息。但我在冗长的日志中找不到任何内容能说明,为什么两个编译器都做出了同样的次优决策:对初始加载使用标量指令。

另一方面,我不知道如何让clang提供有关矢量化问题的详细信息。-Rpass-analysis=loop-vectorize只报告init中的循环不值得交错(interleave)。当然,我的intrinsics版本证明了该循环是可以矢量化的,但所需的变换可能过于复杂,不能指望编译器自动完成。

当然,我可以用intrinsics实现热路径,但这需要针对每种cpu架构重复相同的逻辑。我更希望编写标准的c++代码,让编译器能近乎完美地将其矢量化。这样,借助target_clones属性,或者宏加target属性,就能很容易地用不同的标志多次编译同一份代码。

如何使编译器知道为什么加载无法矢量化?

我怀疑gcc可能已经把这些信息打印出来了,我只是不知道我在找什么。

为什么自动矢量化在初始加载时失败?

代码语言:javascript
复制
    /**
     * This is a test case removing abstraction layers from my actual code. My
     * real code includes one extra problem that access to pack loses alignment
     * information wasn't only issue. Compilers still generate
     * suboptimal machine code with alignment information present. I fail to
     * understand why loads are treated differently compared to stores to
     * same address when auto-vectorization is used.
     *
     * I tested gcc 6.2 and clang 3.9
     * g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
     * clang++ -O3 -g -march=native vectest.cc -o vectest
     */


    #include <array>
    #include <cstdint>

    /* Shared test data: 52 64-bit words. The 32-byte alignment matches the
     * aligned 256-bit loads and stores (_mm256_load_si256 /
     * _mm256_store_si256) used by the intrinsic variants in this file.
     */
    alignas(32) std::array<uint64_t, 52> pack;
    /* Receives the four OR-reduced words written by iter_or(), index_or()
     * and expected_result(); aligned so it can be stored as one 256-bit
     * vector.
     */
    alignas(32) uint64_t board[4];

    __attribute__((noinline))
    static void init(uint64_t initial)
    {
        /* Fills pack so that element i holds a single set bit at position
         * (i + initial).
         *
         * The constant is spelled uint64_t{1} instead of 1UL: unsigned long
         * is only 32 bits wide on LLP64 targets, where shifting 1UL by 32 or
         * more is undefined. The caller must keep
         * (pack.size() - 1 + initial) below 64, because shifting a 64-bit
         * value by 64 or more is undefined behaviour as well.
         *
         * Clang seems to prefer a large constant table and an unrolled copy,
         * which should perform worse outside a micro benchmark. L1 misses
         * and memory bandwidth are bigger bottlenecks than ALU instruction
         * execution. But of course this code won't be compiled to a hot path,
         * so I don't care how it is compiled as long as it works correctly.
         *
         * But the most interesting detail from clang is that vectorized
         * stores are generated correctly, like:
    4005db:       vpsllvq %ymm2,%ymm1,%ymm2
    4005e0:       vmovdqa %ymm2,0x200a78(%rip)        # 601060 <pack>
    4005e8:       vpaddq 0x390(%rip),%ymm0,%ymm2        # 400980 <_IO_stdin_used+0x60>
    4005f0:       vpsllvq %ymm2,%ymm1,%ymm2
    4005f5:       vmovdqa %ymm2,0x200a83(%rip)        # 601080 <pack+0x20>
    4005fd:       vpaddq 0x39b(%rip),%ymm0,%ymm2        # 4009a0 <_IO_stdin_used+0x80>
         *
         * gcc prefers a scalar loop.
         */

        for (unsigned i = 0; i < pack.size(); i++) {
            pack[i] = uint64_t{1} << (i + initial);
        }
    }

    #include "immintrin.h"
    __attribute__((noinline))
    static void expected_init(uint64_t initial)
    {
        /** Intrinsic implementation of init(): the code the author considers
         * the ideal result of auto-vectorizing that loop. Writes pack four
         * elements at a time with aligned 256-bit stores. Compiles to an
         * empty function when AVX2 is not enabled.
         */
    #if __AVX2__
        /* View pack as 256-bit vectors. A reinterpret_cast states the pointer
         * conversion directly; the previous union read an inactive member,
         * which is undefined behaviour in standard C++.
         */
        __m256i *const dst = reinterpret_cast<__m256i *>(pack.data());

        /* Lanes 0..3 start as bits 0..3 (set_epi64x lists the highest lane
         * first).
         */
        __m256i t = _mm256_set_epi64x(
                1LL << 3,
                1LL << 2,
                1LL << 1,
                1LL << 0
                );
        /* initial is just an extra runtime number to prevent constant array
         * initialization. The intrinsic takes an int count, so the narrowing
         * is made explicit.
         */
        t = _mm256_slli_epi64(t, static_cast<int>(initial));
        for (unsigned i = 0; i < pack.size() / 4; i++) {
            _mm256_store_si256(&dst[i], t);
            /* Advance every lane by four bit positions for the next group. */
            t = _mm256_slli_epi64(t, 4);
        }
    #endif
    }

    __attribute__((noinline))
    static void iter_or()
    {
        /** Iterator variant of the reduction: ORs pack together in four
         * interleaved lanes (elements 0,4,8,... into n, 1,5,9,... into e,
         * and so on) and writes the four results to board.
         *
         * The first four elements are read before the loop; this is the
         * "initial load" that is not vectorized to a single 256-bit load.
         *
         * initial load (clang):
    4006f0:       vmovaps 0x200988(%rip),%xmm0        # 601080 <pack+0x20>
    4006f8:       vorps  0x200960(%rip),%xmm0,%xmm0        # 601060 <pack>
    400700:       vmovaps 0x200988(%rip),%xmm1        # 601090 <pack+0x30>
    400708:       vorps  0x200960(%rip),%xmm1,%xmm1        # 601070 <pack+0x10>
    400710:       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
        * expected:
    400810:       vmovaps 0x200868(%rip),%ymm0        # 601080 <pack+0x20>
    400818:       vorps  0x200840(%rip),%ymm0,%ymm0        # 601060 <pack>
    400820:       vorps  0x200878(%rip),%ymm0,%ymm0        # 6010a0 <pack+0x40>
        */

        auto iter = pack.begin();
        uint64_t n(*iter++),
             e(*iter++),
             s(*iter++),
             w(*iter++);
        /* Consumes four elements per pass; the end test relies on
         * pack.size() being a multiple of four.
         */
        for (;iter != pack.end();) {
            n |= *iter++;
            e |= *iter++;
            s |= *iter++;
            w |= *iter++;
        }
        /** Store is correctly vectorized to single instruction */
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    __attribute__((noinline))
    static void index_or()
    {
        /** Index-based variant of iter_or(): the same four-lane OR reduction
         * of pack into board, written with operator[] instead of iterators.
         *
         * Clang compiles this to the same code as the iterator variant. gcc
         * goes completely insane. I don't even want to try to guess what all
         * the permutation stuff is trying to achieve.
         */
        unsigned i;
        uint64_t n(pack[0]),
             e(pack[1]),
             s(pack[2]),
             w(pack[3]);
        /* Starts at 4 because elements 0..3 seeded the accumulators above. */
        for (i = 4 ; i < pack.size(); i+=4) {
            n |= pack[i+0];
            e |= pack[i+1];
            s |= pack[i+2];
            w |= pack[i+3];
        }
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    #include "immintrin.h"

    __attribute__((noinline))
    static void expected_result()
    {
        /** Intrinsics implementation of the OR reduction: what the author
         * expects auto-vectorization to turn iter_or() / index_or() into.
         * I simply can't understand why both compilers fail to achieve the
         * results I expect. Compiles to an empty function when AVX2 is not
         * enabled.
         */
    #if __AVX2__
        /* View pack as 256-bit vectors. A reinterpret_cast replaces the
         * previous union, whose inactive-member read is undefined behaviour
         * in standard C++.
         */
        const __m256i *const src =
            reinterpret_cast<const __m256i *>(pack.data());

        /* Seed the accumulator with the first vector, then OR in the rest. */
        __m256i res = _mm256_load_si256(&src[0]);
        for (unsigned i = 1; i < pack.size() / 4; i++) {
            res = _mm256_or_si256(res, _mm256_load_si256(&src[i]));
        }

        /* board is 32-byte aligned, so one aligned store writes all four
         * results.
         */
        _mm256_store_si256(reinterpret_cast<__m256i *>(board), res);
    #endif
    }

    int main(int c, char **v)
    {
        /* Only the argument count is used. It supplies a run-time value for
         * the shift so the table cannot be folded into constants.
         */
        static_cast<void>(v);
        const uint64_t initial = static_cast<uint64_t>(c - 1);

        /* Fill pack with the intrinsic version, then with the plain loop. */
        expected_init(initial);
        init(initial);

        /* Run every reduction variant; each one writes its result to board. */
        iter_or();
        index_or();
        expected_result();
        return 0;
    }
EN

回答 1

Stack Overflow用户

发布于 2016-12-28 14:18:41

gcc和clang似乎都无法将循环之外的初始加载矢量化。如果先把临时变量清零,然后从第一个元素开始做按位或(OR),两个编译器的表现都会更好。Clang会生成不错的展开(unrolled)向量代码(瓶颈只在于只用了一个ymm寄存器,每条指令都依赖于前一条指令)。GCC生成的代码稍差一些:多了一条初始的vpxor,并且循环相当糟糕,每次迭代只执行一条vpor。

我还测试了几种替代实现,其中微基准测试成绩最好的是:在clang的展开代码基础上改用交替寄存器的版本。

代码语言:javascript
复制
/* only reduce (calling this function from a for loop):
 * ST 7.3 cycles (ST=single thread)
 * SMT 15.3 cycles (SMT=simultaneous multi threading aka hyper threading)
 * shuffle+reduce (calling Fisher-Yates shuffle and then this function):
 * ST 222 cycles
 * SMT 383 cycles 
 */
    "vmovaps 0x00(%0), %%ymm0\n"
    "vmovaps 0x20(%0), %%ymm1\n"
    "vpor 0x40(%0), %%ymm0, %%ymm0\n"
    "vpor 0x60(%0), %%ymm1, %%ymm1\n"
    "vpor 0x80(%0), %%ymm0, %%ymm0\n"
    "vpor 0xA0(%0), %%ymm1, %%ymm1\n"
    "vpor 0xC0(%0), %%ymm0, %%ymm0\n"
    "vpor 0xE0(%0), %%ymm1, %%ymm1\n"
    "vpor 0x100(%0), %%ymm0, %%ymm0\n"
    "vpor 0x120(%0), %%ymm1, %%ymm1\n"
    "vpor 0x140(%0), %%ymm0, %%ymm0\n"
    "vpor 0x160(%0), %%ymm1, %%ymm1\n"
    "vpor 0x180(%0), %%ymm0, %%ymm0\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%1)\n"

Clang展开循环的时间安排如下

代码语言:javascript
复制
/* only reduce:
 * ST 9.8 cycles
 * SMT 21.8 cycles
 * shuffle+reduce:
 * ST 223 cycles
 * SMT 385 cycles
 */

但是,SMT反而降低展开代码性能的那些数字看起来很可疑。我决定尝试写出一个更好的GCC式循环,但它仍然明显慢于完全展开的版本。随后我决定使用两个寄存器并将循环展开一次,以打破指令间的依赖关系。这样得到的shuffle+reduce代码比完全展开的版本略快一些。

代码语言:javascript
复制
/* Loop counter for the asm below: the element index of the highest vector
 * handled through the indexed addressing mode. The top two vectors (byte
 * offsets 0x160 and 0x180) are loaded with fixed offsets, which hard-codes
 * a 52-element pack.
 * NOTE(review): assumes begin points at the first element of pack and
 * hands_ at a 32-byte aligned 4-word output - confirm against the caller.
 */
size_t end = pack.size() - 3*4;
/* volatile: the useful effect is the store through [out], not the [cnt]
 * output, so the statement must not be discarded when end is unused later.
 */
asm volatile (
/* The best SMT option outside micro optimization.
 * This allows executing two vpor instructions at the same time and
 * halves the loop count with a single unroll.
 *
 * only reduce:
 * ST 13.0 cycles
 * SMT 20.0 cycles
 * shuffle+reduce:
 * ST 221 cycles
 * SMT 380 cycles
 */
    "vmovaps 0x180(%[pack]), %%ymm0\n"
    "vmovaps 0x160(%[pack]), %%ymm1\n"
    "vpor 0x00(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "1:\n"
    "vpor -0x20(%[pack],%[cnt],8), %%ymm1, %%ymm1\n"
    "vpor -0x40(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "sub $8, %[cnt]\n"
    "jne 1b\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%[out])\n"
    : [cnt]"+r"(end)
    : [pack]"r"(begin), [out]"r"(hands_)
    /* The asm overwrites ymm0/ymm1 and the flags (sub), and it reads and
     * writes memory through pointers passed as plain register operands, so
     * the compiler must be told about all of it.
     */
    : "ymm0", "ymm1", "cc", "memory");

但是,当代码在Fisher-Yates洗牌之后运行时,差异小得令人惊讶。即使是在仅归约(reduce)基准测试中明显落后的gcc版本(16.4/38.8),在shuffle+reduce测试中的速度也几乎相同(228/387)。

票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/41310990

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档