Hi, the attached test case is the (slightly simplified) hot loop from a library for spherical harmonic transforms. This code uses explicit vectorization, and I try to use simple wrapper classes around the primitive vector types (like __m256d) to simplify operations like initialization with a scalar etc. However it seems that using the wrapper type inside the critical loop causes g++ to produce sub-optimal code. This can be seen by running g++ -mfma -O3 -std=c++17 -ffast-math -S testcase.cc and inspecting the generated assembler code (I'm using gcc 10.2.1). The version where I use the wrapper type even in the hot loop (i.e. "foo<Tvsimple, 2>") has a few unnecessary "vmovapd" instructions before the end of the loop body, which are missing in the version where I cast to __m256d before doing the heavy computation (i.e. "foo<__m256d,2>"). My suspicion is that the "Tvsimple" type is somehow not completely POD and that this prohibits g++ from optimizing more aggressively. On the other hand, clang++ produces identical code for both versions, which is comparable in speed with the faster version generated by g++. Is g++ missing an opportunity to optimize here? If so, is there a way to alter the "Tvsimple" class so that it doesn't stop g++ from optimizing? Thanks, Martin
#include <complex> #include <vector> #include <array> #include <immintrin.h> using namespace std; struct dbl2 { double a, b; }; // simple OO wrapper around __m256d class Tvsimple { private: __m256d v; public: Tvsimple()=default; Tvsimple(const double &val) :v(_mm256_set1_pd(val)) {} Tvsimple(const __m256d &val) :v(val) {} operator __m256d() const {return v;} Tvsimple &operator*=(double val) {v*=val; return *this;} Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;} Tvsimple operator*(double val) const {return v*_mm256_set1_pd(val);} Tvsimple operator*(Tvsimple val) const {return v*val.v;} Tvsimple operator+(Tvsimple val) const {return v+val.v;} Tvsimple operator+(double val) const {return v+_mm256_set1_pd(val);} }; constexpr size_t VLEN=4; constexpr size_t nv0 = 128/VLEN; typedef std::array<Tvsimple,nv0> Tbv0; struct s0data_v { Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; }; template<typename T_inner, size_t nv2> void foo(s0data_v & d, const vector<dbl2> &coef, const complex<double> * alm, size_t l, size_t il, size_t lmax, size_t start) { T_inner p1r[nv2], p1i[nv2], p2r[nv2], p2i[nv2], lam1[nv2], lam2[nv2], csq[nv2]; for (size_t i=0; i<nv2; ++i) { p1r[i] = d.p1r[i+start]; p2r[i] = d.p2r[i+start]; p1i[i] = d.p1i[i+start]; p2i[i] = d.p2i[i+start]; lam1[i] = d.lam1[i+start]; lam2[i] = d.lam2[i+start]; csq[i] = d.csq[i+start]; } // critical loop while (l<=lmax) { for (size_t i=0; i<nv2; ++i) p1r[i] += lam2[i]*alm[l ].real(); for (size_t i=0; i<nv2; ++i) p1i[i] += lam2[i]*alm[l ].imag(); for (size_t i=0; i<nv2; ++i) p2r[i] += lam2[i]*alm[l+1].real(); for (size_t i=0; i<nv2; ++i) p2i[i] += lam2[i]*alm[l+1].imag(); for (size_t i=0; i<nv2; ++i) { T_inner tmp = lam2[i]*(csq[i]*coef[il].a + coef[il].b) + lam1[i]; lam1[i] = lam2[i]; lam2[i] = tmp; } ++il; l+=2; } for (size_t i=0; i<nv2; ++i) { d.p1r[i+start] = p1r[i]; d.p2r[i+start] = p2r[i]; d.p1i[i+start] = p1i[i]; d.p2i[i+start] = p2i[i]; } } template void foo<Tvsimple, 2>(s0data_v & __restrict__ d, const vector<dbl2> &coef, const complex<double> * alm, size_t l, size_t il, size_t lmax, size_t start); template void foo<__m256d,2>(s0data_v & __restrict__ d, const vector<dbl2> &coef, const complex<double> * alm, size_t l, size_t il, size_t lmax, size_t start);