On Tue, 23 Mar 2021, Martin Reinecke wrote:

> Here is a further reduced test case, together with the generated
> assembler output.
>
> I'm really at my wits' end here ... should I file this as a
> "missed-optimization" PR?

Yes.

Note that __m256d is declared with the 'may_alias' attribute, so in
general it should weaken optimizations by disabling type-based alias
analysis, but in this case replacing __m256d with __v4df does not help.

The following version should be easier to optimize, but still is not
handled well (at -O2 -mfma). I'd suggest to present it in the PR in
addition to your reduced version.

(to be clear, I'm not in any way suggesting that the below version is
"better C++ code" or anything, it just leads to a simpler GIMPLE IR)

Alexander

#include <immintrin.h>

struct Tvsimple
  {
  __v4df v;
  Tvsimple() {}
  Tvsimple(double val) { v = _mm256_set1_pd(val); }
  };

Tvsimple operator*(Tvsimple v1, Tvsimple v2)
  { Tvsimple res; res.v = v1.v*v2.v; return res; }

Tvsimple operator+(Tvsimple v1, Tvsimple v2)
  { Tvsimple res; res.v = v1.v+v2.v; return res; }

template<typename vtype> struct s0data_s
  { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };

template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
  const double * __restrict__ coef, const double * __restrict__ alm,
  size_t l, size_t il, size_t lmax)
  {
  // critical loop
  while (l<=lmax)
    {
    d.p1r = d.p1r+d.lam2*alm[2*l];
    d.p1i = d.p1i+d.lam2*alm[2*l+1];
    d.p2r = d.p2r+d.lam2*alm[2*l+2];
    d.p2i = d.p2i+d.lam2*alm[2*l+3];
    auto tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
    d.lam1 = d.lam2;
    d.lam2 = tmp;
    ++il;
    l+=2;
    }
  }

// this version has dead stores at the end of the loop
template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
  const double * __restrict__ coef, const double * __restrict__ alm,
  size_t l, size_t il, size_t lmax);

// this version moves the stores after the end of the loop
template void foo<>(s0data_s<__v4df> & __restrict__ d,
  const double * __restrict__ coef, const double * __restrict__ alm,
  size_t l, size_t il, size_t lmax);