[avx] wrong optimization of intrinsics with -ffast-math

Tim Blechmann <tim@xxxxxxxxxx> · Mon, 10 Oct 2011 12:18:32 +0200

i've experiencing a problem with gcc-4.6.1 using avx intrinsics to write a sign 
function. when compiling the following code with -ffast-math, some wrong code is 
generated.

the code is the following:
essentially it seems that the sign_mask bitmask is initialized as 0 instead of 
0x80000000 and therefore one `and' and one `or' instruction are optimized away.

template <typename VecType>
inline VecType vec_sign(VecType const & arg)
{
    typedef VecType vec;
    const vec zero       = vec::gen_zero();
    const vec one        = vec::gen_one();
    const vec sign_mask  = vec::gen_sign_mask(); // wrongly initialized

    const vec nonzero    = mask_neq(arg, zero);
    const vec sign       = arg & sign_mask;

    const vec abs_ret    = nonzero & one;
    const vec ret        = sign | abs_ret;
    return ret;
}

&, | and mask_neq are implemented with _mm256_and_ps, _mm256_or_ps and 
_mm256_cmp_ps. sign_mask is generated via:

template <>
struct vec<float>
{
    [...]

    __m256 data_;

    static inline __m256 gen_sign_mask(void)
    {
        return set_bitmask(0x80000000);
    }
    static inline __m256 set_bitmask(unsigned int mask)
    {
        union {
            unsigned int i;
            float f;
        } u;
        u.i = mask;
        return _mm256_set1_ps(u.f);
    }
    [...]
};

the generated code looks is essentially the following:

00000000000056f0 <sign_nova(UnaryOpUGen*, int)>:
    56f0:       55                      push   %rbp
    56f1:       c1 ee 04                shr    $0x4,%esi
    56f4:       c5 f8 57 c0             vxorps %xmm0,%xmm0,%xmm0
    56f8:       48 89 e5                mov    %rsp,%rbp
    56fb:       48 83 e4 e0             and    $0xffffffffffffffe0,%rsp
    56ff:       83 ee 01                sub    $0x1,%esi
    5702:       48 83 c4 10             add    $0x10,%rsp
    5706:       48 c1 e6 06             shl    $0x6,%rsi
    570a:       48 8b 47 50             mov    0x50(%rdi),%rax
    570e:       c5 fc 28 0d ca 88 00    vmovaps 0x88ca(%rip),%ymm1        # dfe0 
<_fini+0x498>
    5715:       00 
    5716:       48 8b 08                mov    (%rax),%rcx
    5719:       48 8b 47 48             mov    0x48(%rdi),%rax
    571d:       48 8d 51 40             lea    0x40(%rcx),%rdx
    5721:       48 8b 00                mov    (%rax),%rax
    5724:       48 01 d6                add    %rdx,%rsi
    5727:       eb 0b                   jmp    5734 <sign_nova(UnaryOpUGen*, 
int)+0x44>
    5729:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)
    5730:       48 83 c2 40             add    $0x40,%rdx
    5734:       c5 fc 28 10             vmovaps (%rax),%ymm2
    5738:       c5 ec c2 d0 04          vcmpneqps %ymm0,%ymm2,%ymm2
    573d:       c5 ec 54 d1             vandps %ymm1,%ymm2,%ymm2
    5741:       c5 fc 29 11             vmovaps %ymm2,(%rcx)
    5745:       c5 fc 28 50 20          vmovaps 0x20(%rax),%ymm2
    574a:       48 83 c0 40             add    $0x40,%rax
    574e:       48 39 f2                cmp    %rsi,%rdx
    5751:       c5 ec c2 d0 04          vcmpneqps %ymm0,%ymm2,%ymm2
    5756:       c5 ec 54 d1             vandps %ymm1,%ymm2,%ymm2
    575a:       c5 fc 29 51 20          vmovaps %ymm2,0x20(%rcx)
    575f:       48 89 d1                mov    %rdx,%rcx
    5762:       75 cc                   jne    5730 <sign_nova(UnaryOpUGen*, 
int)+0x40>
    5764:       c9                      leaveq 
    5765:       c5 f8 77                vzeroupper 
    5768:       c3                      retq   
    5769:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)

without -ffast-math the code is correct:
0000000000001350 <sign_nova(UnaryOpUGen*, int)>:
    1350:       55                      push   %rbp
    1351:       c1 ee 04                shr    $0x4,%esi
    1354:       c5 f0 57 c9             vxorps %xmm1,%xmm1,%xmm1
    1358:       48 89 e5                mov    %rsp,%rbp
    135b:       48 83 e4 e0             and    $0xffffffffffffffe0,%rsp
    135f:       83 ee 01                sub    $0x1,%esi
    1362:       48 83 c4 10             add    $0x10,%rsp
    1366:       48 c1 e6 06             shl    $0x6,%rsi
    136a:       48 8b 47 50             mov    0x50(%rdi),%rax
    136e:       c5 fc 28 1d 00 00 00    vmovaps 0x0(%rip),%ymm3        # 1376 
<sign_nova(UnaryOpUGen*, int)+0x26>
    1375:       00 
    1376:       c5 fc 28 15 00 00 00    vmovaps 0x0(%rip),%ymm2        # 137e 
<sign_nova(UnaryOpUGen*, int)+0x2e>
    137d:       00 
    137e:       48 8b 08                mov    (%rax),%rcx
    1381:       48 8b 47 48             mov    0x48(%rdi),%rax
    1385:       48 8d 51 40             lea    0x40(%rcx),%rdx
    1389:       48 8b 00                mov    (%rax),%rax
    138c:       48 01 d6                add    %rdx,%rsi
    138f:       eb 0b                   jmp    139c <sign_nova(UnaryOpUGen*, 
int)+0x4c>
    1391:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)
    1398:       48 83 c2 40             add    $0x40,%rdx
    139c:       c5 fc 28 00             vmovaps (%rax),%ymm0
    13a0:       c5 fc c2 e1 04          vcmpneqps %ymm1,%ymm0,%ymm4
    13a5:       c5 fc 54 c3             vandps %ymm3,%ymm0,%ymm0
    13a9:       c5 dc 54 e2             vandps %ymm2,%ymm4,%ymm4
    13ad:       c5 fc 56 c4             vorps  %ymm4,%ymm0,%ymm0
    13b1:       c5 fc 29 01             vmovaps %ymm0,(%rcx)
    13b5:       c5 fc 28 40 20          vmovaps 0x20(%rax),%ymm0
    13ba:       48 83 c0 40             add    $0x40,%rax
    13be:       48 39 f2                cmp    %rsi,%rdx
    13c1:       c5 fc c2 e1 04          vcmpneqps %ymm1,%ymm0,%ymm4
    13c6:       c5 fc 54 c3             vandps %ymm3,%ymm0,%ymm0
    13ca:       c5 dc 54 e2             vandps %ymm2,%ymm4,%ymm4
    13ce:       c5 fc 56 c4             vorps  %ymm4,%ymm0,%ymm0
    13d2:       c5 fc 29 41 20          vmovaps %ymm0,0x20(%rcx)
    13d7:       48 89 d1                mov    %rdx,%rcx
    13da:       75 bc                   jne    1398 <sign_nova(UnaryOpUGen*, 
int)+0x48>
    13dc:       c9                      leaveq 
    13dd:       c5 f8 77                vzeroupper 
    13e0:       c3                      retq   
    13e1:       66 66 66 66 66 66 2e    data32 data32 data32 data32 data32 nopw 
%cs:0x0(%rax,%rax,1)
    13e8:       0f 1f 84 00 00 00 00 
    13ef:       00 

is this a bug in the compiler or have i merely been lucky that my sse-specific 
code was never hit by this optimization?

thanks, tim