Here is a further reduced test case, together with the generated assembler output. I'm really at my wits' end here ... should I file this as a "missed-optimization" PR? Cheers, Martin On 3/22/21 3:34 PM, Martin Reinecke wrote: > Hi, > > the attached test case is the (slightly simplified) hot loop from a > library for spherical harmonic transforms. > This code uses explicit vectorization, and I try to use simple wrapper > classes around the primitive vector types (like __m256d) to simplify > operations like initialization with a scalar etc. > > However it seems that using the wrapper type inside the critical loop > causes g++ to produce sub-optimal code. This can be seen by running > > g++ -mfma -O3 -std=c++17 -ffast-math -S testcase.cc > > and inspecting the generated assembler code (I'm using gcc 10.2.1). > The version where I use the wrapper type even in the hot loop (i.e. > "foo<Tvsimple, 2>") has a few unnecessary "vmovapd" instructions before > the end of the loop body, which are missing in the version where I cast > to __m256d before doing the heavy computation (i.e. "foo<__m256d,2>"). > > My suspicion is that the "Tvsimple" type is somehow not completely POD > and that this prohibits g++ from optimizing more aggressively. On the > other hand, clang++ produces identical code for both versions, which is > comparable in speed with the faster version generated by g++. > > Is g++ missing an opportunity to optimize here? If so, is there a way to > alter the "Tvsimple" class so that it doesn't stop g++ from optimizing? > > Thanks, > Martin >
#include <immintrin.h> // simple OO wrapper around __m256d struct Tvsimple { __m256d v; Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;} Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;} Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; } Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; } Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;} }; template<typename vtype> struct s0data_s { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; }; template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d, const double * __restrict__ coef, const double * __restrict__ alm, size_t l, size_t il, size_t lmax) { // critical loop while (l<=lmax) { d.p1r += d.lam2*alm[2*l]; d.p1i += d.lam2*alm[2*l+1]; d.p2r += d.lam2*alm[2*l+2]; d.p2i += d.lam2*alm[2*l+3]; auto tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1; d.lam1 = d.lam2; d.lam2 = tmp; ++il; l+=2; } } // this version has dead stores at the end of the loop template void foo<>(s0data_s<Tvsimple> & __restrict__ d, const double * __restrict__ coef, const double * __restrict__ alm, size_t l, size_t il, size_t lmax); // this version moves the stores after the end of the loop template void foo<>(s0data_s<__m256d> & __restrict__ d, const double * __restrict__ coef, const double * __restrict__ alm, size_t l, size_t il, size_t lmax);
.file	"testcase.cc"
# GCC 10.2.1 output for testcase.cc, compiled per the mail with
#   g++ -mfma -O3 -std=c++17 -ffast-math -S testcase.cc
# Member offsets from s0data_s (10 x 32-byte fields):
#   96(%rdi)=lam1  128(%rdi)=lam2  160(%rdi)=csq
#   192..288(%rdi)=p1r,p1i,p2r,p2i
	.text
	.section	.text._Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm,"axG",@progbits,_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm,comdat
	.p2align 4
	.weak	_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm
	.type	_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm, @function
# foo<Tvsimple> -- the wrapper version; the accumulators are written back to
# memory inside the loop body (the "unnecessary vmovapd" the mail describes).
_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm:
.LFB5360:
	.cfi_startproc
	cmpq	%r9, %rcx
	ja	.L5			# skip loop entirely when l > lmax
	movq	%rcx, %rax
	salq	$4, %r8
	vmovapd	160(%rdi), %ymm7	# csq (loop-invariant, hoisted)
	vmovapd	288(%rdi), %ymm5	# p2i
	salq	$4, %rax
	vmovapd	256(%rdi), %ymm4	# p2r
	vmovapd	224(%rdi), %ymm3	# p1i
	addq	%r8, %rsi
	vmovapd	192(%rdi), %ymm2	# p1r
	vmovapd	128(%rdi), %ymm0	# lam2
	addq	%rax, %rdx
	.p2align 4,,10
	.p2align 3
.L4:
	vbroadcastsd	(%rdx), %ymm1
	addq	$2, %rcx
	addq	$32, %rdx
	addq	$16, %rsi
	vbroadcastsd	-8(%rsi), %ymm6
	vfmadd231pd	%ymm0, %ymm1, %ymm2
	vbroadcastsd	-24(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm3
	vbroadcastsd	-16(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm4
	vbroadcastsd	-8(%rdx), %ymm1
	vmovapd	%ymm2, 192(%rdi)	# store p1r every iteration (dead until loop exit)
	vfmadd231pd	%ymm0, %ymm1, %ymm5
	vbroadcastsd	-16(%rsi), %ymm1
	vmovapd	%ymm3, 224(%rdi)	# store p1i every iteration
	vfmadd132pd	%ymm7, %ymm6, %ymm1
	vmovapd	128(%rdi), %ymm6	# reload lam2 from memory ...
	vfmadd213pd	96(%rdi), %ymm1, %ymm0
	vmovapd	%ymm4, 256(%rdi)	# store p2r every iteration
	vmovapd	%ymm6, 96(%rdi)		# ... and store it to lam1: a memory
					# round-trip where version 2 uses a reg move
	vmovapd	%ymm5, 288(%rdi)	# store p2i every iteration
	vmovapd	%ymm0, 128(%rdi)	# store new lam2 every iteration
	cmpq	%rcx, %r9
	jnb	.L4
	vzeroupper
.L5:
	ret
	.cfi_endproc
.LFE5360:
	.size	_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm, .-_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm
	.section	.text._Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm,"axG",@progbits,_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm,comdat
	.p2align 4
	.weak	_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm
	.type	_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm, @function
# foo<__m256d> -- the raw-builtin version; all state lives in registers for
# the whole loop and the stores are sunk after the loop exit (after .L10).
_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm:
.LFB5361:
	.cfi_startproc
	cmpq	%r9, %rcx
	ja	.L11
	movq	%rcx, %rax
	salq	$4, %r8
	vmovapd	192(%rdi), %ymm5	# p1r
	vmovapd	128(%rdi), %ymm0	# lam2
	salq	$4, %rax
	vmovapd	224(%rdi), %ymm4	# p1i
	vmovapd	256(%rdi), %ymm3	# p2r
	addq	%r8, %rsi
	vmovapd	288(%rdi), %ymm2	# p2i
	vmovapd	160(%rdi), %ymm8	# csq
	addq	%rax, %rdx
	vmovapd	96(%rdi), %ymm6		# lam1
	jmp	.L9
	.p2align 4,,10
	.p2align 3
.L10:
	vmovapd	%ymm1, %ymm0		# lam2 <- new value: register move only,
					# no memory traffic (contrast version 1)
.L9:
	vbroadcastsd	(%rdx), %ymm1
	addq	$2, %rcx
	addq	$32, %rdx
	addq	$16, %rsi
	vbroadcastsd	-8(%rsi), %ymm7
	vfmadd231pd	%ymm0, %ymm1, %ymm5
	vbroadcastsd	-24(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm4
	vbroadcastsd	-16(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm3
	vbroadcastsd	-8(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm2
	vbroadcastsd	-16(%rsi), %ymm1
	vfmadd132pd	%ymm8, %ymm7, %ymm1
	vfmadd132pd	%ymm0, %ymm6, %ymm1
	vmovapd	%ymm0, %ymm6		# lam1 <- lam2, also register-to-register
	cmpq	%rcx, %r9
	jnb	.L10
	vmovapd	%ymm5, 192(%rdi)	# all stores sunk here, executed once
	vmovapd	%ymm1, 128(%rdi)
	vmovapd	%ymm4, 224(%rdi)
	vmovapd	%ymm3, 256(%rdi)
	vmovapd	%ymm2, 288(%rdi)
	vmovapd	%ymm0, 96(%rdi)
	vzeroupper
.L11:
	ret
	.cfi_endproc
.LFE5361:
	.size	_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm, .-_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm
	.ident	"GCC: (Debian 10.2.1-6) 10.2.1 20210110"
	.section	.note.GNU-stack,"",@progbits