Re: [PATCH v2] staging: greybus: use inline function for macros

Julia Lawall <julia.lawall@xxxxxxxx> · Thu, 23 Mar 2023 10:52:35 +0100 (CET)

On Thu, 23 Mar 2023, Greg KH wrote:

> On Wed, Mar 22, 2023 at 11:00:41AM +0100, Julia Lawall wrote:
> > Greg raised the question of whether the inline function is really as
> > efficient as a macro.
> >
> > I tried the following definitions:
> >
> > #define to_gbphy_dev(d) container_of(d, struct gbphy_device, dev)
> >
> > static inline struct gbphy_device *extra_to_gbphy_dev(const struct device *_dev)
> > {
> >        return container_of(_dev, struct gbphy_device, dev);
> > }
> >
> > And the following uses:
> >
> > ssize_t macro_protocol_id_show(struct device *dev,
> >                                 struct device_attribute *attr, char *buf)
> > {
> >         struct gbphy_device *gbphy_dev = to_gbphy_dev(dev);
> >
> >         return sprintf(buf, "%c macro 0x%02x\n", *buf, gbphy_dev->cport_desc->protocol_id);
> > }
> > ssize_t extra_protocol_id_show(struct device *dev,
> > 				struct device_attribute *attr, char *buf)
> > {
> >         struct gbphy_device *gbphy_dev = extra_to_gbphy_dev(dev);
> >
> >         return sprintf(buf, "extra 0x%02x %c\n", gbphy_dev->cport_desc->protocol_id, *buf);
> > }
> >
> > They are a little bit different to avoid too much compiler optimization.
> >
> > After doing make drivers/staging/greybus/gbphy.s, I get similar looking
> > code in both cases:
> >
> > Macro version:
> >
> >         .type   macro_protocol_id_show, @function
> > macro_protocol_id_show:
> >         endbr64
> > 1:      call    __fentry__
> >         .section __mcount_loc, "a",@progbits
> >         .quad 1b
> >         .previous
> >         pushq   %rbp    #
> >         movq    %rdx, %rbp      # tmp96, buf
> >         pushq   %rbx    #
> > # drivers/staging/greybus/gbphy.c:40: {
> >         movq    %rdi, %rbx      # tmp95, dev
> > # drivers/staging/greybus/gbphy.c:43:   return sprintf(buf, "%c macro 0x%02x\n", *buf, gbphy_dev->cport_desc->protocol_id);
> >         call    __sanitizer_cov_trace_pc        #
> > # drivers/staging/greybus/gbphy.c:43:   return sprintf(buf, "%c macro 0x%02x\n", *buf, gbphy_dev->cport_desc->protocol_id);
> >         movq    -32(%rbx), %rax # MEM[(struct gbphy_device *)dev_7(D) + -40B].cport_desc, MEM[(struct gbphy_device *)dev_7(D) + -40B].cport_desc
> > # drivers/staging/greybus/gbphy.c:43:   return sprintf(buf, "%c macro 0x%02x\n", *buf, gbphy_dev->cport_desc->protocol_id);
> >         movzbl  0(%rbp), %edx   # *buf_9(D), *buf_9(D)
> >         movq    %rbp, %rdi      # buf,
> >         movq    $.LC18, %rsi    #,
> >         movzbl  3(%rax), %ecx   # _1->protocol_id, _1->protocol_id
> >         call    sprintf #
> > # drivers/staging/greybus/gbphy.c:44: }
> >         movl    $13, %eax       #,
> >         popq    %rbx    #
> >         popq    %rbp    #
> >         jmp     __x86_return_thunk
> >         .size   macro_protocol_id_show, .-macro_protocol_id_show
> >
> > Function version:
> >
> >         .type   extra_protocol_id_show, @function
> > extra_protocol_id_show:
> >         endbr64
> > 1:      call    __fentry__
> >         .section __mcount_loc, "a",@progbits
> >         .quad 1b
> >         .previous
> >         pushq   %rbp    #
> >         movq    %rdx, %rbp      # tmp96, buf
> >         pushq   %rbx    #
> > # drivers/staging/greybus/gbphy.c:47: {
> >         movq    %rdi, %rbx      # tmp95, dev
> > # drivers/staging/greybus/gbphy.c:50:   return sprintf(buf, "extra 0x%02x %c\n", gbphy_dev->cport_desc->protocol_id, *buf);
> >         call    __sanitizer_cov_trace_pc        #
> > # drivers/staging/greybus/gbphy.c:50:   return sprintf(buf, "extra 0x%02x %c\n", gbphy_dev->cport_desc->protocol_id, *buf);
> >         movq    -32(%rbx), %rax # MEM[(struct gbphy_device *)dev_8(D) + -40B].cport_desc, MEM[(struct gbphy_device *)dev_8(D) + -40B].cport_desc
> > # drivers/staging/greybus/gbphy.c:50:   return sprintf(buf, "extra 0x%02x %c\n", gbphy_dev->cport_desc->protocol_id, *buf);
> >         movzbl  0(%rbp), %ecx   # *buf_9(D), *buf_9(D)
> >         movq    %rbp, %rdi      # buf,
> >         movq    $.LC19, %rsi    #,
> >         movzbl  3(%rax), %edx   # _3->protocol_id, _3->protocol_id
> >         call    sprintf #
> > # drivers/staging/greybus/gbphy.c:51: }
> >         movl    $13, %eax       #,
> >         popq    %rbx    #
> >         popq    %rbp    #
> >         jmp     __x86_return_thunk
> >         .size   extra_protocol_id_show, .-extra_protocol_id_show
> >
> > Both seem to access the memory directly.  Maybe the example is too simple,
> > and the compiler is more likely to optimize aggressively?
>
> Nice, that shows that it is the same both ways as the compiler version
> you are using is smart enough
>
> Which compiler and version is this?  Does it work the same for all of
> the supported versions we have to support (i.e. really old gcc?)

gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0

I got a similar result for gcc-5:

macro_protocol_id_show:
1:      call    __fentry__
        .section __mcount_loc, "a",@progbits
        .quad 1b
        .previous
        movq    %rdx, %rax      # buf, buf
        movq    -32(%rdi), %rdx # MEM[(struct gbphy_device *)dev_1(D) + -40B].cport_desc, MEM[(struct gbphy_device *)dev_1(D) + -40B].cport_desc
        movq    $.LC19, %rsi    #,
        movq    %rax, %rdi      # buf,
        movzbl  3(%rdx), %ecx   # _3->protocol_id, D.44996
        movzbl  (%rax), %edx    # *buf_6(D), D.44996
        call    sprintf #
        cltq
        jmp     __x86_return_thunk
        .size   macro_protocol_id_show, .-macro_protocol_id_show
        .section        .text.unlikely
.LCOLDE20:
        .text
.LHOTE20:
        .section        .rodata.str1.1
.LC21:
        .string "extra 0x%02x %c\n"
        .section        .text.unlikely
.LCOLDB22:
        .text
.LHOTB22:
        .p2align 6,,63
        .globl  extra_protocol_id_show
        .type   extra_protocol_id_show, @function
extra_protocol_id_show:
1:      call    __fentry__
        .section __mcount_loc, "a",@progbits
        .quad 1b
        .previous
        movq    %rdx, %rax      # buf, buf
        movzbl  (%rdx), %ecx    # *buf_3(D), D.45003
        movq    -32(%rdi), %rdx # MEM[(struct gbphy_device *)dev_2(D) + -40B].cport_desc, MEM[(struct gbphy_device *)dev_2(D) + -40B].cport_desc
        movq    $.LC21, %rsi    #,
        movq    %rax, %rdi      # buf,
        movzbl  3(%rdx), %edx   # _6->protocol_id, D.45003
        call    sprintf #
        cltq
        jmp     __x86_return_thunk
        .size   extra_protocol_id_show, .-extra_protocol_id_show
        .section        .text.unlikely

>
> For the most part, sysfs files are not on any sort of "fast path" so a
> function call is fine, but as I mentioned before, sometimes we are
> forced to move calls to container_of() to container_of_const() and that
> can not be an inline function, but must remain a macro :(

It seems that this is because there is not a unique return type, but not a
performance issue?

julia