g++: Suboptimal code generation for simple wrapper class around vector data type

Martin Reinecke <martin@xxxxxxxxxxxxxxxxxxx> · Mon, 22 Mar 2021 15:34:22 +0100

Hi,

the attached test case is the (slightly simplified) hot loop from a
library for spherical harmonic transforms.
This code uses explicit vectorization, and I try to use simple wrapper
classes around the primitive vector types (like __m256d) to simplify
operations like initialization with a scalar etc.

However it seems that using the wrapper type inside the critical loop
causes g++ to produce sub-optimal code. This can be seen by running

g++ -mfma -O3 -std=c++17 -ffast-math -S testcase.cc

and inspecting the generated assembler code (I'm using gcc 10.2.1).
The version where I use the wrapper type even in the hot loop (i.e.
"foo<Tvsimple, 2>") has a few unnecessary "vmovapd" instructions before
the end of the loop body, which are missing in the version where I cast
to __m256d before doing the heavy computation (i.e. "foo<__m256d,2>").

My suspicion is that the "Tvsimple" type is somehow not completely POD
and that this prohibits g++ from optimizing more aggressively. On the
other hand, clang++ produces identical code for both versions, which is
comparable in speed with the faster version generated by g++.

Is g++ missing an opportunity to optimize here? If so, is there a way to
alter the "Tvsimple" class so that it doesn't stop g++ from optimizing?

Thanks,
  Martin
#include <complex>
#include <vector>
#include <array>
#include <immintrin.h>

using namespace std;

struct dbl2 { double a, b; };

// simple OO wrapper around __m256d
class Tvsimple
  {
  private:
    __m256d v;
  public:
    Tvsimple()=default;
    Tvsimple(const double &val) :v(_mm256_set1_pd(val)) {}
    Tvsimple(const __m256d &val) :v(val) {}
    operator __m256d() const {return v;}
    Tvsimple &operator*=(double val) {v*=val; return *this;}
    Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
    Tvsimple operator*(double val) const {return v*_mm256_set1_pd(val);}
    Tvsimple operator*(Tvsimple val) const {return v*val.v;}
    Tvsimple operator+(Tvsimple val) const {return v+val.v;}
    Tvsimple operator+(double val) const {return v+_mm256_set1_pd(val);}
  };

constexpr size_t VLEN=4;

constexpr size_t nv0 = 128/VLEN;

typedef std::array<Tvsimple,nv0> Tbv0;

struct s0data_v
  { Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };

template<typename T_inner, size_t nv2> void foo(s0data_v & d,
  const vector<dbl2> &coef, const complex<double> * alm,
  size_t l, size_t il, size_t lmax, size_t start)
  {
  T_inner p1r[nv2], p1i[nv2], p2r[nv2], p2i[nv2], lam1[nv2], lam2[nv2], csq[nv2];
  for (size_t i=0; i<nv2; ++i)
    {
    p1r[i] = d.p1r[i+start];
    p2r[i] = d.p2r[i+start];
    p1i[i] = d.p1i[i+start];
    p2i[i] = d.p2i[i+start];
    lam1[i] = d.lam1[i+start];
    lam2[i] = d.lam2[i+start];
    csq[i] = d.csq[i+start];
    }
// critical loop
  while (l<=lmax)
    {
    for (size_t i=0; i<nv2; ++i)
      p1r[i] += lam2[i]*alm[l  ].real();
    for (size_t i=0; i<nv2; ++i)
      p1i[i] += lam2[i]*alm[l  ].imag();
    for (size_t i=0; i<nv2; ++i)
      p2r[i] += lam2[i]*alm[l+1].real();
    for (size_t i=0; i<nv2; ++i)
      p2i[i] += lam2[i]*alm[l+1].imag();
    for (size_t i=0; i<nv2; ++i)
      {
      T_inner tmp = lam2[i]*(csq[i]*coef[il].a + coef[il].b) + lam1[i];
      lam1[i] = lam2[i];
      lam2[i] = tmp;
      }
    ++il; l+=2;
    }
  for (size_t i=0; i<nv2; ++i)
    {
    d.p1r[i+start] = p1r[i];
    d.p2r[i+start] = p2r[i];
    d.p1i[i+start] = p1i[i];
    d.p2i[i+start] = p2i[i];
    }
  }

template void foo<Tvsimple, 2>(s0data_v & __restrict__ d,
  const vector<dbl2> &coef, const complex<double> * alm,
  size_t l, size_t il, size_t lmax, size_t start);
template void foo<__m256d,2>(s0data_v & __restrict__ d,
  const vector<dbl2> &coef, const complex<double> * alm,
  size_t l, size_t il, size_t lmax, size_t start);