On 06/01/2020 15:19, Marc Glisse wrote:
On Mon, 6 Jan 2020, Chris Hall wrote:
[description of NRVO]
Is there any way in which I can persuade the compiler that a function
returning a struct does not need to worry about preserving the value
of the final destination (ie the struct at %rdi) ?
Compile the file as C++ instead of C. Not that it would be forbidden in
C, but the optimization happens to be in the C++ front-end. There is
also an optimization pass called nrv, but it does not trigger that often.
The idea of trying to write as-standard-as-possible C11, and then
compiling it as C++ makes me queasy :-(
As far as I can see, tree-nrv is enabled. Some of the functions do
write directly to %rdi, so I guess that's being done by tree-nrv. But,
as you say, this optimization does not seem to be applied often.
I was puzzled by why the optimization would not be forbidden in C...
...clearly, the problem is that 'z = f(...) ;' is defined such that the
result must be as if f() creates a temporary struct t, which is copied
to z when f() completes. If f() writes directly to z it must not then
read from z -- which looks hard to guarantee, not least because only the
*caller* knows what z is !!
Now, as I noted before, I found (after a little effort):
typedef struct { char s[64] ; } qerrst_t ;

/* Build a qerrst_t from 'err' and five bytes read from 'foo', and return it
 * by value.
 *
 *   err -- stored, converted to char, in s[0]
 *   foo -- must point at at least 9 readable bytes; foo[8], foo[6], foo[4],
 *          foo[2] and foo[0] are copied to s[2], s[4], s[6], s[8] and s[10]
 *          respectively (i.e. in reverse order)
 *
 * Only s[0], s[2], s[4], s[6], s[8] and s[10] are written; every other byte
 * of the returned struct is left unset.
 *
 * Every store below goes to the local 'st', so at the C level it makes no
 * difference whether 'foo' points into the object the caller assigns the
 * result to.
 */
extern qerrst_t
qerrst5(int err, char* foo)
{
  qerrst_t st ;         /* the type declared above (not 'qerr_str_t')   */

  st.s[ 0] = (char)err ;
  st.s[ 2] = foo[ 8] ;
  st.s[ 4] = foo[ 6] ;
  st.s[ 6] = foo[ 4] ;
  st.s[ 8] = foo[ 2] ;
  st.s[10] = foo[ 0] ;

  return st ;
}
qerrst5:
movq %rdi, %rax
movzbl 8(%rdx), %r9d
movzbl 6(%rdx), %r8d
movzbl 4(%rdx), %edi
movzbl 2(%rdx), %ecx
movb %sil, (%rax) -- BUG iff %rax ==
movzbl (%rdx), %edx -- %rdx !
movb %r9b, 2(%rax)
movb %r8b, 4(%rax)
movb %dil, 6(%rax)
movb %cl, 8(%rax)
movb %dl, 10(%rax)
ret
which does *not* do a copy in the function, and which is very nearly
correct... except as noted, if foo points at the final destination !!
HOWEVER... when I tried this function the bug did NOT appear. It turns
out that the *caller* passes a pointer to a hidden struct and the
*caller* copies that to the final destination !!!
The test I ran is:
typedef struct { char s[64] ; } qerrst_t ;
extern qerrst_t qerrst5(int err, char* foo) ;
/* Print 'name' followed by bytes s[0]..s[10] as unsigned decimal values,
 * each in a field of width 3 preceded by a space, then a newline.
 *
 *   name -- label printed verbatim at the start of the line
 *   s    -- must point at at least 11 readable bytes
 *
 * Marked noinline so that the call stays visible as a call in the
 * caller's disassembly.
 */
static void __attribute__((noinline))
show(const char* name, char* s)
{
  /* 'name' is data, not a format: never pass it as printf's format string,
   * or any '%' in it would be interpreted as a conversion.
   */
  printf("%s", name) ;

  for (int i = 0 ; i <= 10 ; ++i)
    {
      printf(" %3d", (unsigned char)s[i]) ;
    }

  printf("\n") ;
}
/* Test driver for qerrst5().
 *
 * Fills x.s and y.s with the same pattern, then calls qerrst5() twice with
 * foo == x.s: first assigning the result to y (destination distinct from
 * what foo points at), then assigning it to x (destination is the very
 * object foo points into).  The first 11 bytes of each result are printed
 * with show().
 *
 * NOTE(review): 'Unused' is not defined in this snippet -- presumably a
 * project macro for an "unused" attribute; confirm.  argc is in fact used.
 */
int
main(Unused int argc, Unused char* argv[])
{
/* err comes from argc so its value is not a compile-time constant. */
int err = argc ;
qerrst_t x, y ;
/* Both arrays get s[i] = 100 + i, so s[0..10] start out as 100..110. */
for (int i = 0 ; i < (int)sizeof(x.s) ; ++i)
y.s[i] = x.s[i] = (char)(100 + i) ;
/* Non-aliasing case: result goes to y, input is read from x. */
y = qerrst5(err, x.s) ;
show("y", y.s) ;
/* Aliasing case: foo points into the assignment's destination. */
x = qerrst5(err, x.s) ;
show("x", x.s) ;
return 0 ;
}
and in a separate compilation unit (for completeness):
/* Build a qerrst_t in the local 'st' and return it by value.
 *
 *   s[0]                 = err converted to char
 *   s[2], s[4], ... s[10] = foo[8], foo[6], foo[4], foo[2], foo[0]
 *   s[1], s[3], ... s[9]  = constant negative marker bytes
 *
 * Bytes s[11] onwards are left unset.  The reads from foo are interleaved
 * with the stores to st.s, and foo[0] is read last, after s[0] has been
 * stored.
 */
extern qerrst_t
qerrst5(int err, char* foo)
{
qerrst_t st ;
st.s[ 0] = (char)err ;
st.s[ 1] = -1 ;
st.s[ 2] = foo[ 8] ;
st.s[ 3] = -3 ;
st.s[ 4] = foo[ 6] ;
st.s[ 5] = -5 ;
st.s[ 6] = foo[ 4] ;
st.s[ 7] = -7 ;
st.s[ 8] = foo[ 2] ;
/* NOTE(review): -8 breaks the -1, -3, -5, -7 sequence (-9 would follow);
 * it does match the 0xf8 store in the disassembly and the 248 in the
 * printed output, so this is the code that was actually compiled.
 */
st.s[ 9] = -8 ;
st.s[10] = foo[ 0] ;
return st ;
}
which compiles to:
Dump of assembler code for function qerrst5:
0x47fdd0 <+0>: mov %rdi,%rax
0x47fdd3 <+3>: movzbl 0x8(%rdx),%r8d # read foo[8]
0x47fdd8 <+8>: movzbl 0x6(%rdx),%edi
0x47fddc <+12>: mov %esi,%r9d
0x47fddf <+15>: movzbl 0x2(%rdx),%ecx
0x47fde3 <+19>: movzbl 0x4(%rdx),%esi
0x47fde7 <+23>: mov %r9b,(%rax) # write st.s[0]
0x47fdea <+26>: movb $0xff,0x1(%rax)
0x47fdee <+30>: movb $0xfd,0x3(%rax)
0x47fdf2 <+34>: movb $0xfb,0x5(%rax)
0x47fdf6 <+38>: movb $0xf9,0x7(%rax)
0x47fdfa <+42>: movb $0xf8,0x9(%rax)
0x47fdfe <+46>: mov %r8b,0x2(%rax)
0x47fe02 <+50>: mov %dil,0x4(%rax)
0x47fe06 <+54>: mov %sil,0x6(%rax)
0x47fe0a <+58>: mov %cl,0x8(%rax)
0x47fe0d <+61>: movzbl (%rdx),%edx # read foo[0]
# -- BUG if foo == st.s
0x47fe10 <+64>: mov %dl,0xa(%rax) # write st.s[10]
0x47fe13 <+67>: retq
And the result was:
y 1 255 108 253 106 251 104 249 102 248 100
x 1 255 108 253 106 251 104 249 102 248 100
SURPRISE ! expected to see:
x 1 255 108 253 106 251 104 249 102 248 1 <<< BUG
Looking at main() we see:
Dump of assembler code for function main:
0x4012a0 <+0>: push %rbp
0x4012a1 <+1>: mov %edi,%esi
0x4012a3 <+3>: mov %rsp,%rbp
0x4012a6 <+6>: push %r12
0x4012a8 <+8>: mov %edi,%r12d
0x4012ab <+11>: and $0xffffffffffffffe0,%rsp
0x4012af <+15>: sub $0xc0,%rsp
0x4012b6 <+22>: vmovaps 0x94642(%rip),%xmm0 # 0x495900
0x4012be <+30>: lea 0x40(%rsp),%rdx # ->x
0x4012c3 <+35>: mov %rsp,%rdi # ->t
0x4012c6 <+38>: vmovaps %xmm0,0x40(%rsp) # x0
0x4012cc <+44>: vmovaps %xmm0,0x80(%rsp) # y0
0x4012d5 <+53>: vmovaps 0x94633(%rip),%xmm0 # 0x495910
0x4012dd <+61>: vmovaps %xmm0,0x50(%rsp) # x1
0x4012e3 <+67>: vmovaps %xmm0,0x90(%rsp) # y1
0x4012ec <+76>: vmovaps 0x9462c(%rip),%xmm0 # 0x495920
0x4012f4 <+84>: vmovaps %xmm0,0x60(%rsp) # x2
0x4012fa <+90>: vmovaps %xmm0,0xa0(%rsp) # y2
0x401303 <+99>: vmovaps 0x94625(%rip),%xmm0 # 0x495930
0x40130b <+107>: vmovaps %xmm0,0x70(%rsp) # x3
0x401311 <+113>: vmovaps %xmm0,0xb0(%rsp) # y3
0x40131a <+122>: callq 0x47fdd0 <qerrst5>
0x40131f <+127>: vmovups (%rsp),%xmm1 # t0
0x401324 <+132>: lea 0x80(%rsp),%rsi # ->y
0x40132c <+140>: mov $0x494b3e,%edi
0x401331 <+145>: vmovups 0x10(%rsp),%xmm2 # t1
0x401337 <+151>: vmovups 0x20(%rsp),%xmm3 # t2
0x40133d <+157>: vmovups 0x30(%rsp),%xmm4 # t3
0x401343 <+163>: vmovaps %xmm1,0x80(%rsp) # y0
0x40134c <+172>: vmovaps %xmm2,0x90(%rsp) # y1
0x401355 <+181>: vmovaps %xmm3,0xa0(%rsp) # y2
0x40135e <+190>: vmovaps %xmm4,0xb0(%rsp) # y3
0x401367 <+199>: callq 0x47fb60 <show>
0x40136c <+204>: lea 0x40(%rsp),%rdx # ->x
0x401371 <+209>: mov %r12d,%esi # err
0x401374 <+212>: mov %rsp,%rdi # ->t
0x401377 <+215>: callq 0x47fdd0 <qerrst5>
0x40137c <+220>: vmovups (%rsp),%xmm5 # t0
0x401381 <+225>: lea 0x40(%rsp),%rsi # ->x
0x401386 <+230>: mov $0x4937c4,%edi
0x40138b <+235>: vmovups 0x10(%rsp),%xmm6 # t1
0x401391 <+241>: vmovups 0x20(%rsp),%xmm7 # t2
0x401397 <+247>: vmovups 0x30(%rsp),%xmm1 # t3
0x40139d <+253>: vmovaps %xmm5,0x40(%rsp) # x0
0x4013a3 <+259>: vmovaps %xmm6,0x50(%rsp) # x1
0x4013a9 <+265>: vmovaps %xmm7,0x60(%rsp) # x2
0x4013af <+271>: vmovaps %xmm1,0x70(%rsp) # x3
0x4013b5 <+277>: callq 0x47fb60 <show>
0x4013ba <+282>: xor %eax,%eax
0x4013bc <+284>: mov -0x8(%rbp),%r12
0x4013c0 <+288>: leaveq
0x4013c1 <+289>: retq
The caller is passing a pointer to a hidden 't' and then *itself*
copying the result to the destination of the assignment !!
It looks like the caller is taking care of the problem, so a function
returning a struct does not need to... surely ?
So I also tried:
typedef struct { char s[64] ; } qerrst_t ;
extern qerrst_t qerrst0(int err) ;
/* Test driver for qerrst0(): calls it twice with the same err and prints
 * the resulting string each time using the same format.
 *
 * NOTE(review): 'Unused' is not defined in this snippet -- presumably a
 * project macro for an "unused" attribute; confirm.  argc is in fact used.
 */
int
main(Unused int argc, Unused char* argv[])
{
int err = argc ;
qerrst_t z ;
/* Case 1: the returned struct is used directly in the call expression;
 * it is never assigned to a named object.
 * NOTE(review): using .s of a returned struct this way relies on C11
 * temporary lifetime (C11 6.2.4p8) -- confirm for the targeted standard.
 */
printf("qerrst0()='%s'\n", qerrst0(err).s) ;
/* Case 2: the returned struct is first assigned to the local z. */
z = qerrst0(err) ;
printf("qerrst0()='%s'\n", z.s) ;
return 0 ;
}
and in a separate compilation unit (for completeness):
/* Format "errno=<err>" (err in decimal) into the s[] buffer of a local
 * qerrst_t and return that struct by value.
 *
 * The write is bounded by sizeof(st.s) via snprintf().  The address of
 * st.s is passed to snprintf(), i.e. the local escapes to an external
 * function before being returned.
 */
extern qerrst_t
qerrst0(int err)
{
qerrst_t st ;
snprintf(st.s, sizeof(st.s), "errno=%d", err) ;
return st ;
}
which compiles to:
Dump of assembler code for function qerrst0:
0x47fc00 <+0>: push %r12
0x47fc02 <+2>: mov %esi,%ecx
0x47fc04 <+4>: mov %rdi,%r12
0x47fc07 <+7>: mov $0x495910,%edx
0x47fc0c <+12>: sub $0x40,%rsp
0x47fc10 <+16>: mov $0x40,%esi
0x47fc15 <+21>: xor %eax,%eax
0x47fc17 <+23>: mov %rsp,%rdi
0x47fc1a <+26>: callq 0x4010b0 <snprintf@plt>
0x47fc1f <+31>: vmovaps (%rsp),%xmm0
0x47fc24 <+36>: mov %r12,%rax
0x47fc27 <+39>: vmovaps 0x10(%rsp),%xmm1
0x47fc2d <+45>: vmovaps 0x20(%rsp),%xmm2
0x47fc33 <+51>: vmovaps 0x30(%rsp),%xmm3
0x47fc39 <+57>: vmovups %xmm0,(%r12)
0x47fc3f <+63>: vmovups %xmm1,0x10(%r12)
0x47fc46 <+70>: vmovups %xmm2,0x20(%r12)
0x47fc4d <+77>: vmovups %xmm3,0x30(%r12)
0x47fc54 <+84>: add $0x40,%rsp
0x47fc58 <+88>: pop %r12
0x47fc5a <+90>: retq
which, as before, creates a temporary, local struct which is copied to
the return struct pointed to by %rdi.
And now we see:
Dump of assembler code for function main:
0x401280 <+0>: push %rbp
0x401281 <+1>: mov %edi,%esi
0x401283 <+3>: mov %rsp,%rbp
0x401286 <+6>: push %r12
0x401288 <+8>: mov %edi,%r12d
0x40128b <+11>: and $0xffffffffffffffe0,%rsp
0x40128f <+15>: sub $0xc0,%rsp
0x401296 <+22>: lea 0x80(%rsp),%rdi # ->t
0x40129e <+30>: callq 0x47fc00 <qerrst0> # qerrst0(err).s
0x4012a3 <+35>: lea 0x80(%rsp),%rsi
0x4012ab <+43>: mov $0x495882,%edi
0x4012b0 <+48>: xor %eax,%eax
0x4012b2 <+50>: callq 0x4010a0 <printf@plt> # printf(..., t)
0x4012b7 <+55>: mov %r12d,%esi
0x4012ba <+58>: mov %rsp,%rdi # ->t
0x4012bd <+61>: callq 0x47fc00 <qerrst0> # z = qerrst0(err) ;
0x4012c2 <+66>: vmovups (%rsp),%xmm0 # t0
0x4012c7 <+71>: lea 0x40(%rsp),%rsi # ->z
0x4012cc <+76>: mov $0x495882,%edi
0x4012d1 <+81>: vmovups 0x10(%rsp),%xmm1 # t1
0x4012d7 <+87>: vmovups 0x20(%rsp),%xmm2 # t2
0x4012dd <+93>: xor %eax,%eax
0x4012df <+95>: vmovups 0x30(%rsp),%xmm3 # t3
0x4012e5 <+101>: vmovaps %xmm0,0x40(%rsp) # z0 )
0x4012eb <+107>: vmovaps %xmm1,0x50(%rsp) # z1 ) copied from t
0x4012f1 <+113>: vmovaps %xmm2,0x60(%rsp) # z2 )
0x4012f7 <+119>: vmovaps %xmm3,0x70(%rsp) # z3 )
0x4012fd <+125>: callq 0x4010a0 <printf@plt> # printf(..., z.s)
0x401302 <+130>: xor %eax,%eax
0x401304 <+132>: mov -0x8(%rbp),%r12
0x401308 <+136>: leaveq
0x401309 <+137>: retq
So for:
printf("qerrst0()='%s'\n", qerrst0(err).s) ;
there is one (spurious) copy in qerrst0().
And for:
z = qerrst0(err) ;
printf("qerrst0()='%s'\n", z.s) ;
there is one (spurious) copy in qerrst0() AND a *second* copy in main().
Is it just me, or is this broken ?
So, I looked at the AMD64 ABI (Draft 0.99.7 – November 17, 2014 –
15:08), Section 3.2.3 Parameter Passing, p22:
Returning of Values: ....
2. If the type has class MEMORY, then the caller provides space
for the return value and passes the address of this storage
in %rdi as if it were the first argument to the function.
In effect, this address becomes a “hidden” first argument.
This storage must not overlap any data visible to the callee
through other names than this argument.
So... the ABI appears to say that the callee does *not* need to do any
copying *ever*.
This pushes the problem back to the caller. If the caller can be sure
that the final destination is not visible to the callee, it too can
avoid copying.
So... why is the qerrst0() function doing a copy ?
Chris