Re: g++: Suboptimal code generation for simple wrapper class around vector data type

Alexander Monakov via Gcc-help <gcc-help@xxxxxxxxxxx> · Tue, 23 Mar 2021 16:42:57 +0300 (MSK)

On Tue, 23 Mar 2021, Martin Reinecke wrote:

> Here is a further reduced test case, together with the generated
> assembler output.
> 
> I'm really at my wits' end here ... should I file this as a
> "missed-optimization" PR?

Yes. Note that __m256d is declared with the 'may_alias' attribute, so in general
it should weaken optimizations by disabling type-based alias analysis,
but in this case replacing __m256d with __v4df does not help.

The following version should be easier to optimize, but still is not handled
well (at -O2 -mfma). I'd suggest to present it in the PR in addition to your
reduced version.

(to be clear, I'm not in any way suggesting that the below version is a
"better C++ code" or anything, it just leads to a simpler GIMPLE IR)

Alexander

#include <immintrin.h>

// Minimal wrapper around a single __v4df vector member. Kept deliberately
// simple because it is part of a compiler test case; do not restyle.
struct Tvsimple
  {
  // The wrapped vector value.
  __v4df v;
  // Default constructor with an empty body: v is left uninitialized.
  Tvsimple() {}
  // Sets every element of v to val via _mm256_set1_pd. Not explicit, so a
  // plain double converts implicitly to Tvsimple; the mixed
  // Tvsimple-with-double expressions in foo rely on this.
  Tvsimple(double val) { v = _mm256_set1_pd(val); }
  };
// Multiplies the wrapped vectors of v1 and v2 with the built-in vector `*`
// and returns the result in a new Tvsimple. Both operands are taken by value.
Tvsimple operator*(Tvsimple v1, Tvsimple v2)
{
  // res is default-constructed (v uninitialized) and then assigned.
  Tvsimple res; res.v = v1.v*v2.v; return res;
}
// Adds the wrapped vectors of v1 and v2 with the built-in vector `+`
// and returns the result in a new Tvsimple. Both operands are taken by value.
Tvsimple operator+(Tvsimple v1, Tvsimple v2)
{
  // res is default-constructed (v uninitialized) and then assigned.
  Tvsimple res; res.v = v1.v+v2.v; return res;
}

// Plain aggregate of ten values of type vtype. Of these, foo reads csq,
// reads and writes lam1 and lam2, and accumulates into p1r, p1i, p2r and p2i;
// sth, corfac and scale are not touched by foo.
template<typename vtype> struct s0data_s
  { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };

// Runs the accumulation/recurrence loop on d, updating it in place.
//
// Parameters:
//   d     - state; p1r, p1i, p2r, p2i, lam1 and lam2 are modified, csq is read.
//   coef  - read at indices 2*il and 2*il+1 in each iteration.
//   alm   - read at indices 2*l .. 2*l+3 in each iteration.
//   l     - starting loop index; advanced by 2 per iteration.
//   il    - starting index into coef pairs; advanced by 1 per iteration.
//   lmax  - the loop runs while l <= lmax (no iterations if l > lmax on entry).
//
// All three pointer/reference parameters are declared __restrict__, i.e. the
// caller promises they do not alias each other.
template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
  const double * __restrict__ coef, const double * __restrict__ alm,
  size_t l, size_t il, size_t lmax)
  {
// critical loop
  while (l<=lmax)
    {
    // Accumulate lam2 scaled by four consecutive alm entries.
    d.p1r = d.p1r+d.lam2*alm[2*l];
    d.p1i = d.p1i+d.lam2*alm[2*l+1];
    d.p2r = d.p2r+d.lam2*alm[2*l+2];
    d.p2i = d.p2i+d.lam2*alm[2*l+3];
    // Next recurrence value, computed from the current lam2 and lam1.
    auto tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
    // Shift the recurrence: old lam2 becomes lam1, tmp becomes lam2.
    d.lam1 = d.lam2;
    d.lam2 = tmp;
    ++il; l+=2;
    }
  }

// Explicit instantiation of foo for the Tvsimple wrapper struct.
// Reported observation about the code generated for this instantiation:
// this version has dead stores at the end of the loop
template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
  const double * __restrict__ coef, const double * __restrict__ alm,
  size_t l, size_t il, size_t lmax);

// Explicit instantiation of foo for the raw __v4df vector type (no wrapper).
// Reported observation about the code generated for this instantiation:
// this version moves the stores after the end of the loop
template void foo<>(s0data_s<__v4df> & __restrict__ d,
  const double * __restrict__ coef, const double * __restrict__ alm,
  size_t l, size_t il, size_t lmax);