Function returning struct on x86_64 (at least)

Chris Hall <gcc@xxxxxxx> · Mon, 6 Jan 2020 15:09:32 +0000

I hoped to do something "clever" with a function of the form:

  /* 64-byte character buffer wrapped in a struct so that it can be
   * returned from a function by value. */
  typedef struct { char s[64] ; } qerr_str_t ;

  /* Format "errno=<err>" into a local qerr_str_t and return the whole
   * struct by value.  The snprintf() output is bounded by sizeof(st.s). */
  extern qerr_str_t
  qerrst0(int err)
  {
    qerr_str_t st ;

    snprintf(st.s, sizeof(st.s), "errno=%d", err) ;

    return st ;
  }

but was disappointed to find that this compiles (gcc 8.3 and others, 
-O2) to this:

  .LC0:
        .string "errno=%d"
  qerrst0:
        pushq   %rbx
        movl    %esi, %ecx
        movq    %rdi, %rbx
        movl    $.LC0, %edx
        movl    $64, %esi
        xorl    %eax, %eax
        subq    $64, %rsp
        movq    %rsp, %rdi
        call    snprintf
        movdqa  (%rsp), %xmm0
        movq    %rbx, %rax
        movdqa  16(%rsp), %xmm1
        movdqa  32(%rsp), %xmm2
        movdqa  48(%rsp), %xmm3
        movups  %xmm0, (%rbx)
        movups  %xmm1, 16(%rbx)
        movups  %xmm2, 32(%rbx)
        movups  %xmm3, 48(%rbx)
        addq    $64, %rsp
        popq    %rbx
        ret

On reflection, the compiler is playing safe: it does not write to whatever 
the "hidden" pointer %rdi points at until the implicit assignment on return, 
so I have no right to be disappointed.

The object of the exercise is to create temporary strings for use like this:

  /* Demo caller: the struct returned by qerrst0() is never given a name;
   * its .s member is passed straight to printf() as a string argument. */
  int
  main(int argc, char* argv[])
  {
    printf("%s: %s\n", argv[0], qerrst0(argc).s) ;
  }

where the "hidden" pointer passed to qerrst0() does not, in fact, point 
to anything accessible.  Sadly, even when qerrst0() is inlined, I find:

  .LC0:
        .string "errno=%d"
  .LC1:
        .string "%s: %s\n"
  main:
        pushq   %rbx
        movl    %edi, %ecx
        movq    %rsi, %rbx
        movl    $.LC0, %edx
        movl    $64, %esi
        xorl    %eax, %eax
        addq    $-128, %rsp
        leaq    64(%rsp), %rdi
        call    snprintf
        movdqa  64(%rsp), %xmm0
        movq    (%rbx), %rsi
        xorl    %eax, %eax
        movdqa  80(%rsp), %xmm1
        movdqa  96(%rsp), %xmm2
        movq    %rsp, %rdx
        movl    $.LC1, %edi
        movdqa  112(%rsp), %xmm3
        movaps  %xmm0, (%rsp)
        movaps  %xmm1, 16(%rsp)
        movaps  %xmm2, 32(%rsp)
        movaps  %xmm3, 48(%rsp)
        call    printf
        subq    $-128, %rsp
        xorl    %eax, %eax
        popq    %rbx
        ret

where there is still an (unnecessary) assignment going on !

I tried something simpler:

  /* Simplest case: store err (converted to char) into the first byte of a
   * local struct and return it by value.  No other byte of st.s is written. */
  extern qerr_str_t
  qerrst1(int err)
  {
    qerr_str_t st ;

    st.s[0] = err ;

    return st ;
  }

which compiles to:

  qerrst1:
        movq    %rdi, %rax
        movb    %sil, (%rdi)
        ret

...so a trivial case optimises as one might hope.

As does:

  /* As qerrst1(), but writes through a pointer to the buffer rather than
   * indexing st.s directly, and sets both the first and the last byte. */
  extern qerr_str_t
  qerrst2(int err)
  {
    qerr_str_t st ;
    char* q = st.s ;          /* q points at the start of the local buffer */

    q[0]  = err ;
    q[63] = err ;             /* last element of the 64-byte array */

    return st ;
  }

  qerrst2:
        movq    %rdi, %rax
        movb    %sil, (%rdi)
        movb    %sil, 63(%rdi)
        ret

The following are also optimised:

  /* Return a struct whose buffer is initialised from the empty string
   * literal (the quoted output below stores zeros to all 64 bytes).
   * The err parameter is not used. */
  extern qerr_str_t
  qerrst3a(int err)
  {
    qerr_str_t st = { "" } ;

    return st ;
  }

  /* Return a struct whose whole buffer has been cleared with memset(),
   * called through a pointer to the local buffer.  The err parameter is
   * not used. */
  extern qerr_str_t
  qerrst3b(int err)
  {
    qerr_str_t st ;
    char* q = st.s ;          /* q points at the start of the local buffer */

    memset(q, 0, sizeof(st.s)) ;

    return st ;
  }

to the same code:

  qerrst3a/b:
        pxor    %xmm0, %xmm0
        movq    %rdi, %rax
        movups  %xmm0, (%rdi)
        movups  %xmm0, 16(%rdi)
        movups  %xmm0, 32(%rdi)
        movups  %xmm0, 48(%rdi)
        ret

However, ever so slightly more complicated:

  /* Fill the first (err & 63) bytes of a local struct with err - i and
   * return it by value.  Bytes from index (err & 63) onward are never
   * written; when (err & 63) is 0 the loop body does not run at all. */
  extern qerr_str_t
  qerrst4(int err)
  {
    qerr_str_t st ;

    for (int i = 0 ; i < (err & 63) ; ++i)
      st.s[i] = err - i ;

    return st ;
  }

  qerrst4:
        movl    %esi, %edx
        movq    %rdi, %rax
        andl    $63, %edx
        je      .L12
        subl    $1, %edx
        leaq    -71(%rsp,%rdx), %r8
        leaq    -72(%rsp), %rdx
        addl    %edx, %esi
  .L11:
        movl    %esi, %ecx
        subl    %edx, %ecx
        addq    $1, %rdx
        movb    %cl, -1(%rdx)
        cmpq    %r8, %rdx
        jne     .L11
  .L12:
        movdqa  -72(%rsp), %xmm0
        movdqa  -56(%rsp), %xmm1
        movdqa  -40(%rsp), %xmm2
        movdqa  -24(%rsp), %xmm3
        movups  %xmm0, (%rax)
        movups  %xmm1, 16(%rax)
        movups  %xmm2, 32(%rax)
        movups  %xmm3, 48(%rax)
        ret

Which is a puzzle :-(

Interestingly, I also found (after a little effort):

  /* Store err in st.s[0], then copy five bytes from fred into st.s:
   * fred[8], fred[6], fred[4], fred[2], fred[0] go to st.s[2], st.s[4],
   * st.s[6], st.s[8], st.s[10] respectively.  fred is only read.  In source
   * order the store to st.s[0] comes before every read of fred, and st is
   * an object local to this function. */
  extern qerr_str_t
  qerrst5(int err, char* fred)
  {
    qerr_str_t st ;

    st.s[ 0] = err ;
    st.s[ 2] = fred[ 8] ;
    st.s[ 4] = fred[ 6] ;
    st.s[ 6] = fred[ 4] ;
    st.s[ 8] = fred[ 2] ;
    st.s[10] = fred[ 0] ;

    return st ;
  }

  qerrst5:
        movq    %rdi, %rax
        movzbl  8(%rdx), %r9d
        movzbl  6(%rdx), %r8d
        movzbl  4(%rdx), %edi
        movzbl  2(%rdx), %ecx
        movb    %sil, (%rax)	-- BUG iff %rax ==
        movzbl  (%rdx), %edx	--                 %rdx !
        movb    %r9b, 2(%rax)
        movb    %r8b, 4(%rax)
        movb    %dil, 6(%rax)
        movb    %cl, 8(%rax)
        movb    %dl, 10(%rax)
        ret

which is very nearly correct... except, as noted, if fred points at the 
final destination !!

For this to do what I had hoped (and I imagine is the majority case), 
what is needed is a way to mark the declaration of 'qerr_str_t st' in 
the function as a "clone" of the final destination 'qerr_str_t' in the 
caller -- so that the compiler could Just Do It.

I looked for an __attribute__(()) for this... but could not find one.

Is there any way in which I can persuade the compiler that a function 
returning a struct does not need to worry about preserving the value of 
the final destination (i.e. the struct at %rdi)?

Chris