I'm experiencing a problem with gcc-4.6.1 using AVX intrinsics to write a sign function. When compiling the following code with -ffast-math, wrong code is generated: essentially, the sign_mask bitmask seems to be initialized as 0 instead of 0x80000000, and therefore one `and' and one `or' instruction are optimized away. The code is the following:

template <typename VecType>
inline VecType vec_sign(VecType const & arg)
{
    typedef VecType vec;
    const vec zero      = vec::gen_zero();
    const vec one       = vec::gen_one();
    const vec sign_mask = vec::gen_sign_mask(); // wrongly initialized

    const vec nonzero = mask_neq(arg, zero); // all-ones lanes where arg != 0
    const vec sign    = arg & sign_mask;     // isolate the sign bit of arg
    const vec abs_ret = nonzero & one;       // 1.0f where arg != 0, else 0.0f
    const vec ret     = sign | abs_ret;      // reattach the sign
    return ret;
}

&, | and mask_neq are implemented with _mm256_and_ps, _mm256_or_ps and _mm256_cmp_ps. sign_mask is generated via:

template <>
struct vec<float>
{
    [...]
    __m256 data_;

    static inline __m256 gen_sign_mask(void)
    {
        return set_bitmask(0x80000000);
    }

    static inline __m256 set_bitmask(unsigned int mask)
    {
        // type-pun the integer bit pattern to a float, then broadcast it
        union {
            unsigned int i;
            float f;
        } u;
        u.i = mask;
        return _mm256_set1_ps(u.f);
    }
    [...]
};
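For what it's worth, 0x80000000 is exactly the bit pattern of -0.0f, so the union effectively hands the compiler the floating-point constant -0.0f. Since -ffast-math implies -fno-signed-zeros, I suspect that constant is being folded to +0.0f, i.e. an all-zero mask, which would explain why the `and' with sign_mask and the final `or' disappear. A sketch of a workaround I'm considering, which keeps the mask generation in the integer domain so that no floating-point constant is ever materialized (untested; I haven't verified that it sidesteps the folding in 4.6.1):

#include <immintrin.h>

// build an all-lanes bitmask without going through a float constant:
// _mm256_set1_epi32 broadcasts the integer value, and
// _mm256_castsi256_ps only reinterprets the bits (it emits no code)
static inline __m256 set_bitmask(unsigned int mask)
{
    return _mm256_castsi256_ps(_mm256_set1_epi32((int)mask));
}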
With -ffast-math, the generated code is essentially the following:

00000000000056f0 <sign_nova(UnaryOpUGen*, int)>:
    56f0:  55                    push   %rbp
    56f1:  c1 ee 04              shr    $0x4,%esi
    56f4:  c5 f8 57 c0           vxorps %xmm0,%xmm0,%xmm0
    56f8:  48 89 e5              mov    %rsp,%rbp
    56fb:  48 83 e4 e0           and    $0xffffffffffffffe0,%rsp
    56ff:  83 ee 01              sub    $0x1,%esi
    5702:  48 83 c4 10           add    $0x10,%rsp
    5706:  48 c1 e6 06           shl    $0x6,%rsi
    570a:  48 8b 47 50           mov    0x50(%rdi),%rax
    570e:  c5 fc 28 0d ca 88 00  vmovaps 0x88ca(%rip),%ymm1    # dfe0 <_fini+0x498>
    5715:  00
    5716:  48 8b 08              mov    (%rax),%rcx
    5719:  48 8b 47 48           mov    0x48(%rdi),%rax
    571d:  48 8d 51 40           lea    0x40(%rcx),%rdx
    5721:  48 8b 00              mov    (%rax),%rax
    5724:  48 01 d6              add    %rdx,%rsi
    5727:  eb 0b                 jmp    5734 <sign_nova(UnaryOpUGen*, int)+0x44>
    5729:  0f 1f 80 00 00 00 00  nopl   0x0(%rax)
    5730:  48 83 c2 40           add    $0x40,%rdx
    5734:  c5 fc 28 10           vmovaps (%rax),%ymm2
    5738:  c5 ec c2 d0 04        vcmpneqps %ymm0,%ymm2,%ymm2
    573d:  c5 ec 54 d1           vandps %ymm1,%ymm2,%ymm2
    5741:  c5 fc 29 11           vmovaps %ymm2,(%rcx)
    5745:  c5 fc 28 50 20        vmovaps 0x20(%rax),%ymm2
    574a:  48 83 c0 40           add    $0x40,%rax
    574e:  48 39 f2              cmp    %rsi,%rdx
    5751:  c5 ec c2 d0 04        vcmpneqps %ymm0,%ymm2,%ymm2
    5756:  c5 ec 54 d1           vandps %ymm1,%ymm2,%ymm2
    575a:  c5 fc 29 51 20        vmovaps %ymm2,0x20(%rcx)
    575f:  48 89 d1              mov    %rdx,%rcx
    5762:  75 cc                 jne    5730 <sign_nova(UnaryOpUGen*, int)+0x40>
    5764:  c9                    leaveq
    5765:  c5 f8 77              vzeroupper
    5768:  c3                    retq
    5769:  0f 1f 80 00 00 00 00  nopl   0x0(%rax)

Without -ffast-math the code is correct:

0000000000001350 <sign_nova(UnaryOpUGen*, int)>:
    1350:  55                    push   %rbp
    1351:  c1 ee 04              shr    $0x4,%esi
    1354:  c5 f0 57 c9           vxorps %xmm1,%xmm1,%xmm1
    1358:  48 89 e5              mov    %rsp,%rbp
    135b:  48 83 e4 e0           and    $0xffffffffffffffe0,%rsp
    135f:  83 ee 01              sub    $0x1,%esi
    1362:  48 83 c4 10           add    $0x10,%rsp
    1366:  48 c1 e6 06           shl    $0x6,%rsi
    136a:  48 8b 47 50           mov    0x50(%rdi),%rax
    136e:  c5 fc 28 1d 00 00 00  vmovaps 0x0(%rip),%ymm3    # 1376 <sign_nova(UnaryOpUGen*, int)+0x26>
    1375:  00
    1376:  c5 fc 28 15 00 00 00  vmovaps 0x0(%rip),%ymm2    # 137e <sign_nova(UnaryOpUGen*, int)+0x2e>
    137d:  00
    137e:  48 8b 08              mov    (%rax),%rcx
    1381:  48 8b 47 48           mov    0x48(%rdi),%rax
    1385:  48 8d 51 40           lea    0x40(%rcx),%rdx
    1389:  48 8b 00              mov    (%rax),%rax
    138c:  48 01 d6              add    %rdx,%rsi
    138f:  eb 0b                 jmp    139c <sign_nova(UnaryOpUGen*, int)+0x4c>
    1391:  0f 1f 80 00 00 00 00  nopl   0x0(%rax)
    1398:  48 83 c2 40           add    $0x40,%rdx
    139c:  c5 fc 28 00           vmovaps (%rax),%ymm0
    13a0:  c5 fc c2 e1 04        vcmpneqps %ymm1,%ymm0,%ymm4
    13a5:  c5 fc 54 c3           vandps %ymm3,%ymm0,%ymm0
    13a9:  c5 dc 54 e2           vandps %ymm2,%ymm4,%ymm4
    13ad:  c5 fc 56 c4           vorps  %ymm4,%ymm0,%ymm0
    13b1:  c5 fc 29 01           vmovaps %ymm0,(%rcx)
    13b5:  c5 fc 28 40 20        vmovaps 0x20(%rax),%ymm0
    13ba:  48 83 c0 40           add    $0x40,%rax
    13be:  48 39 f2              cmp    %rsi,%rdx
    13c1:  c5 fc c2 e1 04        vcmpneqps %ymm1,%ymm0,%ymm4
    13c6:  c5 fc 54 c3           vandps %ymm3,%ymm0,%ymm0
    13ca:  c5 dc 54 e2           vandps %ymm2,%ymm4,%ymm4
    13ce:  c5 fc 56 c4           vorps  %ymm4,%ymm0,%ymm0
    13d2:  c5 fc 29 41 20        vmovaps %ymm0,0x20(%rcx)
    13d7:  48 89 d1              mov    %rdx,%rcx
    13da:  75 bc                 jne    1398 <sign_nova(UnaryOpUGen*, int)+0x48>
    13dc:  c9                    leaveq
    13dd:  c5 f8 77              vzeroupper
    13e0:  c3                    retq
    13e1:  66 66 66 66 66 66 2e  data32 data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
    13e8:  0f 1f 84 00 00 00 00
    13ef:  00

Is this a bug in the compiler, or have I merely been lucky that my SSE-specific code was never hit by this optimization?

Thanks,
tim
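P.S.: in case it helps with reproducing, here is a minimal standalone test along the lines of the code above. It is hypothetical in the sense that I haven't verified this reduced form triggers the same folding; compile with -O2 -mavx -ffast-math:

#include <immintrin.h>
#include <cstdio>
#include <cstring>

int main(void)
{
    // same type-punning as set_bitmask: 0x80000000 is the bit pattern of -0.0f
    union { unsigned int i; float f; } u;
    u.i = 0x80000000u;

    __m256 mask = _mm256_set1_ps(u.f);

    float out[8] __attribute__((aligned(32)));
    _mm256_store_ps(out, mask);

    unsigned int bits;
    std::memcpy(&bits, &out[0], sizeof bits);
    std::printf("%08x\n", bits); // expected: 80000000; 00000000 if the constant was folded
    return 0;
}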