Here is a further reduced test case, together with the generated assembler output. I'm really at my wits' end here ... should I file this as a "missed-optimization" PR? Cheers, Martin On 3/22/21 3:34 PM, Martin Reinecke wrote: > Hi, > > the attached test case is the (slightly simplified) hot loop from a > library for spherical harmonic transforms. > This code uses explicit vectorization, and I try to use simple wrapper > classes around the primitive vector types (like __m256d) to simplify > operations like initialization with a scalar etc. > > However it seems that using the wrapper type inside the critical loop > causes g++ to produce sub-optimal code. This can be seen by running > > g++ -mfma -O3 -std=c++17 -ffast-math -S testcase.cc > > and inspecting the generated assembler code (I'm using gcc 10.2.1). > The version where I use the wrapper type even in the hot loop (i.e. > "foo<Tvsimple, 2>") has a few unnecessary "vmovapd" instructions before > the end of the loop body, which are missing in the version where I cast > to __m256d before doing the heavy computation (i.e. "foo<__m256d,2>"). > > My suspicion is that the "Tvsimple" type is somehow not completely POD > and that this prohibits g++ from optimizing more aggressively. On the > other hand, clang++ produces identical code for both versions, which is > comparable in speed with the faster version generated by g++. > > Is g++ missing an opportunity to optimize here? If so, is there a way to > alter the "Tvsimple" class so that it doesn't stop g++ from optimizing? > > Thanks, > Martin >
#include <immintrin.h> // simple OO wrapper around __m256d struct Tvsimple { __m256d v; Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;} Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;} Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; } Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; } Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;} }; template<typename vtype> struct s0data_s { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; }; template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d, const double * __restrict__ coef, const double * __restrict__ alm, size_t l, size_t il, size_t lmax) { // critical loop while (l<=lmax) { d.p1r += d.lam2*alm[2*l]; d.p1i += d.lam2*alm[2*l+1]; d.p2r += d.lam2*alm[2*l+2]; d.p2i += d.lam2*alm[2*l+3]; auto tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1; d.lam1 = d.lam2; d.lam2 = tmp; ++il; l+=2; } } // this version has dead stores at the end of the loop template void foo<>(s0data_s<Tvsimple> & __restrict__ d, const double * __restrict__ coef, const double * __restrict__ alm, size_t l, size_t il, size_t lmax); // this version moves the stores after the end of the loop template void foo<>(s0data_s<__m256d> & __restrict__ d, const double * __restrict__ coef, const double * __restrict__ alm, size_t l, size_t il, size_t lmax);
.file	"testcase.cc"
# GCC 10.2.1 output for testcase.cc, compiled per the mail with
#   g++ -mfma -O3 -std=c++17 -ffast-math -S testcase.cc
# Member offsets from s0data_s (10 x 32-byte fields):
#   96(%rdi)=lam1  128(%rdi)=lam2  160(%rdi)=csq
#   192..288(%rdi)=p1r,p1i,p2r,p2i
	.text
	.section	.text._Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm,"axG",@progbits,_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm,comdat
	.p2align 4
	.weak	_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm
	.type	_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm, @function
# foo<Tvsimple> -- the wrapper version; the accumulators are written back to
# memory inside the loop body (the "unnecessary vmovapd" the mail describes).
_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm:
.LFB5360:
	.cfi_startproc
	cmpq	%r9, %rcx
	ja	.L5			# skip loop entirely when l > lmax
	movq	%rcx, %rax
	salq	$4, %r8
	vmovapd	160(%rdi), %ymm7	# csq (loop-invariant, hoisted)
	vmovapd	288(%rdi), %ymm5	# p2i
	salq	$4, %rax
	vmovapd	256(%rdi), %ymm4	# p2r
	vmovapd	224(%rdi), %ymm3	# p1i
	addq	%r8, %rsi
	vmovapd	192(%rdi), %ymm2	# p1r
	vmovapd	128(%rdi), %ymm0	# lam2
	addq	%rax, %rdx
	.p2align 4,,10
	.p2align 3
.L4:
	vbroadcastsd	(%rdx), %ymm1
	addq	$2, %rcx
	addq	$32, %rdx
	addq	$16, %rsi
	vbroadcastsd	-8(%rsi), %ymm6
	vfmadd231pd	%ymm0, %ymm1, %ymm2
	vbroadcastsd	-24(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm3
	vbroadcastsd	-16(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm4
	vbroadcastsd	-8(%rdx), %ymm1
	vmovapd	%ymm2, 192(%rdi)	# store p1r every iteration (dead until loop exit)
	vfmadd231pd	%ymm0, %ymm1, %ymm5
	vbroadcastsd	-16(%rsi), %ymm1
	vmovapd	%ymm3, 224(%rdi)	# store p1i every iteration
	vfmadd132pd	%ymm7, %ymm6, %ymm1
	vmovapd	128(%rdi), %ymm6	# reload lam2 from memory ...
	vfmadd213pd	96(%rdi), %ymm1, %ymm0
	vmovapd	%ymm4, 256(%rdi)	# store p2r every iteration
	vmovapd	%ymm6, 96(%rdi)		# ... and store it to lam1: a memory
					# round-trip where version 2 uses a reg move
	vmovapd	%ymm5, 288(%rdi)	# store p2i every iteration
	vmovapd	%ymm0, 128(%rdi)	# store new lam2 every iteration
	cmpq	%rcx, %r9
	jnb	.L4
	vzeroupper
.L5:
	ret
	.cfi_endproc
.LFE5360:
	.size	_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm, .-_Z3fooI8TvsimpleEvR8s0data_sIT_EPKdS6_mmm
	.section	.text._Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm,"axG",@progbits,_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm,comdat
	.p2align 4
	.weak	_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm
	.type	_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm, @function
# foo<__m256d> -- the raw-builtin version; all state lives in registers for
# the whole loop and the stores are sunk after the loop exit (after .L10).
_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm:
.LFB5361:
	.cfi_startproc
	cmpq	%r9, %rcx
	ja	.L11
	movq	%rcx, %rax
	salq	$4, %r8
	vmovapd	192(%rdi), %ymm5	# p1r
	vmovapd	128(%rdi), %ymm0	# lam2
	salq	$4, %rax
	vmovapd	224(%rdi), %ymm4	# p1i
	vmovapd	256(%rdi), %ymm3	# p2r
	addq	%r8, %rsi
	vmovapd	288(%rdi), %ymm2	# p2i
	vmovapd	160(%rdi), %ymm8	# csq
	addq	%rax, %rdx
	vmovapd	96(%rdi), %ymm6		# lam1
	jmp	.L9
	.p2align 4,,10
	.p2align 3
.L10:
	vmovapd	%ymm1, %ymm0		# lam2 <- new value: register move only,
					# no memory traffic (contrast version 1)
.L9:
	vbroadcastsd	(%rdx), %ymm1
	addq	$2, %rcx
	addq	$32, %rdx
	addq	$16, %rsi
	vbroadcastsd	-8(%rsi), %ymm7
	vfmadd231pd	%ymm0, %ymm1, %ymm5
	vbroadcastsd	-24(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm4
	vbroadcastsd	-16(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm3
	vbroadcastsd	-8(%rdx), %ymm1
	vfmadd231pd	%ymm0, %ymm1, %ymm2
	vbroadcastsd	-16(%rsi), %ymm1
	vfmadd132pd	%ymm8, %ymm7, %ymm1
	vfmadd132pd	%ymm0, %ymm6, %ymm1
	vmovapd	%ymm0, %ymm6		# lam1 <- lam2, also register-to-register
	cmpq	%rcx, %r9
	jnb	.L10
	vmovapd	%ymm5, 192(%rdi)	# all stores sunk here, executed once
	vmovapd	%ymm1, 128(%rdi)
	vmovapd	%ymm4, 224(%rdi)
	vmovapd	%ymm3, 256(%rdi)
	vmovapd	%ymm2, 288(%rdi)
	vmovapd	%ymm0, 96(%rdi)
	vzeroupper
.L11:
	ret
	.cfi_endproc
.LFE5361:
	.size	_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm, .-_Z3fooIDv4_dEvR8s0data_sIT_EPKdS6_mmm
	.ident	"GCC: (Debian 10.2.1-6) 10.2.1 20210110"
	.section	.note.GNU-stack,"",@progbits