Re: x86-64: Maintain 16-byte stack alignment

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10 January 2017 at 14:33, Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx> wrote:
> I recently applied the patch
>
>         https://patchwork.kernel.org/patch/9468391/
>
> and ended up with a boot crash when it tried to run the x86 chacha20
> code.  It turned out that the patch changed a manually aligned
> stack buffer to one that is aligned by gcc.  The problem is that
> gcc can realign the stack to any value on x86-64 except 16, because
> it assumes that the stack is always 16-byte aligned already, which
> is not actually the case in the kernel.
>

Apologies for introducing this breakage. It seemed like an obvious and
simple cleanup, so I didn't even bother to mention it in the commit
log, but if the kernel does not guarantee 16-byte alignment, I guess
we should revert to the old method. If SSE instructions are the only
ones that require this alignment, then I suppose not having an
ABI-conforming stack pointer should not be an issue in general.

> The x86-64 CPU actually tries to keep the stack 16-byte aligned,
> e.g., it'll do so when an IRQ comes in.  So the reason it doesn't
> work in the kernel mostly comes down to the fact that the struct
> pt_regs, which lives near the top of the stack, is 168 bytes, which
> is not a multiple of 16.
>
> This patch tries to fix this by adding an 8-byte padding at the
> top of the call-chain involving pt_regs so that when we call a C
> function later we do so with an aligned stack.
>
> The same problem probably exists on i386 too, since gcc also assumes
> 16-byte alignment there.  It's harder to fix, however, as the CPU
> doesn't help us in the IRQ case.
>
> Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
>
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index 05ed3d3..29d3bcb 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -59,39 +59,42 @@
>  /*
>   * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
>   * unless syscall needs a complete, fully filled "struct pt_regs".
> + *
> + * Note we add 8 extra bytes at the beginning to preserve stack alignment.
>   */
> -#define R15            0*8
> -#define R14            1*8
> -#define R13            2*8
> -#define R12            3*8
> -#define RBP            4*8
> -#define RBX            5*8
> +#define R15            1*8
> +#define R14            2*8
> +#define R13            3*8
> +#define R12            4*8
> +#define RBP            5*8
> +#define RBX            6*8
>  /* These regs are callee-clobbered. Always saved on kernel entry. */
> -#define R11            6*8
> -#define R10            7*8
> -#define R9             8*8
> -#define R8             9*8
> -#define RAX            10*8
> -#define RCX            11*8
> -#define RDX            12*8
> -#define RSI            13*8
> -#define RDI            14*8
> +#define R11            7*8
> +#define R10            8*8
> +#define R9             9*8
> +#define R8             10*8
> +#define RAX            11*8
> +#define RCX            12*8
> +#define RDX            13*8
> +#define RSI            14*8
> +#define RDI            15*8
>  /*
>   * On syscall entry, this is syscall#. On CPU exception, this is error code.
>   * On hw interrupt, it's IRQ number:
>   */
> -#define ORIG_RAX       15*8
> +#define ORIG_RAX       16*8
>  /* Return frame for iretq */
> -#define RIP            16*8
> -#define CS             17*8
> -#define EFLAGS         18*8
> -#define RSP            19*8
> -#define SS             20*8
> +#define RIP            17*8
> +#define CS             18*8
> +#define EFLAGS         19*8
> +#define RSP            20*8
> +#define SS             21*8
>
> +/* Note that this excludes the 8-byte padding. */
>  #define SIZEOF_PTREGS  21*8
>
>         .macro ALLOC_PT_GPREGS_ON_STACK
> -       addq    $-(15*8), %rsp
> +       addq    $-(16*8), %rsp
>         .endm
>
>         .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
> @@ -114,7 +117,7 @@
>         movq %rdi, 14*8+\offset(%rsp)
>         .endm
>         .macro SAVE_C_REGS offset=0
> -       SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
> +       SAVE_C_REGS_HELPER 8+\offset, 1, 1, 1, 1
>         .endm
>         .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
>         SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
> @@ -130,43 +133,43 @@
>         .endm
>
>         .macro SAVE_EXTRA_REGS offset=0
> -       movq %r15, 0*8+\offset(%rsp)
> -       movq %r14, 1*8+\offset(%rsp)
> -       movq %r13, 2*8+\offset(%rsp)
> -       movq %r12, 3*8+\offset(%rsp)
> -       movq %rbp, 4*8+\offset(%rsp)
> -       movq %rbx, 5*8+\offset(%rsp)
> +       movq %r15, 1*8+\offset(%rsp)
> +       movq %r14, 2*8+\offset(%rsp)
> +       movq %r13, 3*8+\offset(%rsp)
> +       movq %r12, 4*8+\offset(%rsp)
> +       movq %rbp, 5*8+\offset(%rsp)
> +       movq %rbx, 6*8+\offset(%rsp)
>         .endm
>
>         .macro RESTORE_EXTRA_REGS offset=0
> -       movq 0*8+\offset(%rsp), %r15
> -       movq 1*8+\offset(%rsp), %r14
> -       movq 2*8+\offset(%rsp), %r13
> -       movq 3*8+\offset(%rsp), %r12
> -       movq 4*8+\offset(%rsp), %rbp
> -       movq 5*8+\offset(%rsp), %rbx
> +       movq 1*8+\offset(%rsp), %r15
> +       movq 2*8+\offset(%rsp), %r14
> +       movq 3*8+\offset(%rsp), %r13
> +       movq 4*8+\offset(%rsp), %r12
> +       movq 5*8+\offset(%rsp), %rbp
> +       movq 6*8+\offset(%rsp), %rbx
>         .endm
>
>         .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
>         .if \rstor_r11
> -       movq 6*8(%rsp), %r11
> +       movq 7*8(%rsp), %r11
>         .endif
>         .if \rstor_r8910
> -       movq 7*8(%rsp), %r10
> -       movq 8*8(%rsp), %r9
> -       movq 9*8(%rsp), %r8
> +       movq 8*8(%rsp), %r10
> +       movq 9*8(%rsp), %r9
> +       movq 10*8(%rsp), %r8
>         .endif
>         .if \rstor_rax
> -       movq 10*8(%rsp), %rax
> +       movq 11*8(%rsp), %rax
>         .endif
>         .if \rstor_rcx
> -       movq 11*8(%rsp), %rcx
> +       movq 12*8(%rsp), %rcx
>         .endif
>         .if \rstor_rdx
> -       movq 12*8(%rsp), %rdx
> +       movq 13*8(%rsp), %rdx
>         .endif
> -       movq 13*8(%rsp), %rsi
> -       movq 14*8(%rsp), %rdi
> +       movq 14*8(%rsp), %rsi
> +       movq 15*8(%rsp), %rdi
>         .endm
>         .macro RESTORE_C_REGS
>         RESTORE_C_REGS_HELPER 1,1,1,1,1
> @@ -185,7 +188,7 @@
>         .endm
>
>         .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
> -       subq $-(15*8+\addskip), %rsp
> +       subq $-(16*8+\addskip), %rsp
>         .endm
>
>         .macro icebp
> @@ -203,11 +206,7 @@
>   */
>  .macro ENCODE_FRAME_POINTER ptregs_offset=0
>  #ifdef CONFIG_FRAME_POINTER
> -       .if \ptregs_offset
> -               leaq \ptregs_offset(%rsp), %rbp
> -       .else
> -               mov %rsp, %rbp
> -       .endif
> +       leaq    8+\ptregs_offset(%rsp), %rbp
>         orq     $0x1, %rbp
>  #endif
>  .endm
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 5b21970..880bbb8 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -168,7 +168,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
>         pushq   %r9                             /* pt_regs->r9 */
>         pushq   %r10                            /* pt_regs->r10 */
>         pushq   %r11                            /* pt_regs->r11 */
> -       sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
> +       sub     $(7*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
>
>         /*
>          * If we need to do entry work or if we guess we'll need to do
> @@ -234,14 +234,14 @@ entry_SYSCALL_64_fastpath:
>         TRACE_IRQS_ON
>         ENABLE_INTERRUPTS(CLBR_NONE)
>         SAVE_EXTRA_REGS
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         call    syscall_return_slowpath /* returns with IRQs disabled */
>         jmp     return_from_SYSCALL_64
>
>  entry_SYSCALL64_slow_path:
>         /* IRQs are off. */
>         SAVE_EXTRA_REGS
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         call    do_syscall_64           /* returns with IRQs disabled */
>
>  return_from_SYSCALL_64:
> @@ -342,9 +342,9 @@ ENTRY(stub_ptregs_64)
>          * Called from fast path -- disable IRQs again, pop return address
>          * and jump to slow path
>          */
> +       popq    %rax
>         DISABLE_INTERRUPTS(CLBR_NONE)
>         TRACE_IRQS_OFF
> -       popq    %rax
>         jmp     entry_SYSCALL64_slow_path
>
>  1:
> @@ -409,13 +409,14 @@ END(__switch_to_asm)
>   */
>  ENTRY(ret_from_fork)
>         movq    %rax, %rdi
> +       subq    $8, %rsp
>         call    schedule_tail                   /* rdi: 'prev' task parameter */
>
>         testq   %rbx, %rbx                      /* from kernel_thread? */
>         jnz     1f                              /* kernel threads are uncommon */
>
>  2:
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         call    syscall_return_slowpath /* returns with IRQs disabled */
>         TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
>         SWAPGS
> @@ -494,10 +495,12 @@ END(irq_entries_start)
>          * a little cheaper to use a separate counter in the PDA (short of
>          * moving irq_enter into assembly, which would be too much work)
>          */
> -       movq    %rsp, %rdi
> +       movq    %rsp, %rax
> +       leaq    8(%rsp), %rdi
>         incl    PER_CPU_VAR(irq_count)
>         cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
> -       pushq   %rdi
> +       sub     $8, %rsp
> +       pushq   %rax
>         /* We entered an interrupt context - irqs are off: */
>         TRACE_IRQS_OFF
>
> @@ -527,7 +530,7 @@ ret_from_intr:
>
>         /* Interrupt came from user space */
>  GLOBAL(retint_user)
> -       mov     %rsp,%rdi
> +       leaq    8(%rsp), %rdi
>         call    prepare_exit_to_usermode
>         TRACE_IRQS_IRETQ
>         SWAPGS
> @@ -774,7 +777,7 @@ ENTRY(\sym)
>         .endif
>         .endif
>
> -       movq    %rsp, %rdi                      /* pt_regs pointer */
> +       leaq    8(%rsp), %rdi                   /* pt_regs pointer */
>
>         .if \has_error_code
>         movq    ORIG_RAX(%rsp), %rsi            /* get error code */
> @@ -810,11 +813,11 @@ ENTRY(\sym)
>         call    error_entry
>
>
> -       movq    %rsp, %rdi                      /* pt_regs pointer */
> +       leaq    8(%rsp), %rdi                   /* pt_regs pointer */
>         call    sync_regs
> -       movq    %rax, %rsp                      /* switch stack */
> +       leaq    -8(%rax), %rsp                  /* switch stack */
>
> -       movq    %rsp, %rdi                      /* pt_regs pointer */
> +       movq    %rax, %rdi                      /* pt_regs pointer */
>
>         .if \has_error_code
>         movq    ORIG_RAX(%rsp), %rsi            /* get error code */
> @@ -895,6 +898,7 @@ ENTRY(do_softirq_own_stack)
>         mov     %rsp, %rbp
>         incl    PER_CPU_VAR(irq_count)
>         cmove   PER_CPU_VAR(irq_stack_ptr), %rsp
> +       sub     $8, %rsp
>         push    %rbp                            /* frame pointer backlink */
>         call    __do_softirq
>         leaveq
> @@ -924,10 +928,11 @@ ENTRY(xen_do_hypervisor_callback)         /* do_hypervisor_callback(struct *pt_regs) */
>   * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
>   * see the correct pointer to the pt_regs
>   */
> -       movq    %rdi, %rsp                      /* we don't return, adjust the stack frame */
> +       leaq    -8(%rdi), %rsp                  /* we don't return, adjust the stack frame */
>  11:    incl    PER_CPU_VAR(irq_count)
>         movq    %rsp, %rbp
>         cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
> +       subq    $8, %rsp
>         pushq   %rbp                            /* frame pointer backlink */
>         call    xen_evtchn_do_upcall
>         popq    %rsp
> @@ -1264,6 +1269,7 @@ ENTRY(nmi)
>          */
>
>         movq    %rsp, %rdi
> +       subq    $8, %rsp
>         movq    $-1, %rsi
>         call    do_nmi
>
> @@ -1475,7 +1481,7 @@ end_repeat_nmi:
>         call    paranoid_entry
>
>         /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         movq    $-1, %rsi
>         call    do_nmi
>
> @@ -1519,7 +1525,7 @@ ENTRY(rewind_stack_do_exit)
>         xorl    %ebp, %ebp
>
>         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rax
> -       leaq    -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
> +       leaq    -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE-8(%rax), %rsp
>
>         call    do_exit
>  1:     jmp 1b
> diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
> index e1721da..7d3f1e3 100644
> --- a/arch/x86/entry/entry_64_compat.S
> +++ b/arch/x86/entry/entry_64_compat.S
> @@ -89,6 +89,8 @@ ENTRY(entry_SYSENTER_compat)
>         pushq   $0                      /* pt_regs->r13 = 0 */
>         pushq   $0                      /* pt_regs->r14 = 0 */
>         pushq   $0                      /* pt_regs->r15 = 0 */
> +
> +       subq    $8, %rsp
>         cld
>
>         /*
> @@ -120,7 +122,7 @@ ENTRY(entry_SYSENTER_compat)
>          */
>         TRACE_IRQS_OFF
>
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         call    do_fast_syscall_32
>         /* XEN PV guests always use IRET path */
>         ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
> @@ -215,13 +217,15 @@ ENTRY(entry_SYSCALL_compat)
>         pushq   $0                      /* pt_regs->r14 = 0 */
>         pushq   $0                      /* pt_regs->r15 = 0 */
>
> +       subq    $8, %rsp
> +
>         /*
>          * User mode is traced as though IRQs are on, and SYSENTER
>          * turned them off.
>          */
>         TRACE_IRQS_OFF
>
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         call    do_fast_syscall_32
>         /* XEN PV guests always use IRET path */
>         ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
> @@ -324,6 +328,8 @@ ENTRY(entry_INT80_compat)
>         pushq   %r13                    /* pt_regs->r13 */
>         pushq   %r14                    /* pt_regs->r14 */
>         pushq   %r15                    /* pt_regs->r15 */
> +
> +       subq    $8, %rsp
>         cld
>
>         /*
> @@ -332,7 +338,7 @@ ENTRY(entry_INT80_compat)
>          */
>         TRACE_IRQS_OFF
>
> -       movq    %rsp, %rdi
> +       leaq    8(%rsp), %rdi
>         call    do_int80_syscall_32
>  .Lsyscall_32_done:
>
> diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
> index be36bf4..3c80aac 100644
> --- a/arch/x86/entry/thunk_64.S
> +++ b/arch/x86/entry/thunk_64.S
> @@ -33,6 +33,7 @@
>         movq 8(%rbp), %rdi
>         .endif
>
> +       sub $8, %rsp
>         call \func
>         jmp  .L_restore
>         _ASM_NOKPROBE(\name)
> @@ -58,6 +59,7 @@
>   || defined(CONFIG_DEBUG_LOCK_ALLOC) \
>   || defined(CONFIG_PREEMPT)
>  .L_restore:
> +       add $8, %rsp
>         popq %r11
>         popq %r10
>         popq %r9
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index b467b14..d03ab72 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -384,6 +384,8 @@ early_idt_handler_common:
>         pushq %r14                              /* pt_regs->r14 */
>         pushq %r15                              /* pt_regs->r15 */
>
> +       sub $8, %rsp
> +
>         cmpq $14,%rsi           /* Page fault? */
>         jnz 10f
>         GET_CR2_INTO(%rdi)      /* Can clobber any volatile register if pv */
> @@ -392,7 +394,7 @@ early_idt_handler_common:
>         jz 20f                  /* All good */
>
>  10:
> -       movq %rsp,%rdi          /* RDI = pt_regs; RSI is already trapnr */
> +       leaq 8(%rsp), %rdi      /* RDI = pt_regs; RSI is already trapnr */
>         call early_fixup_exception
>
>  20:
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index bf0c6d0..2af9f81 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -590,6 +590,7 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
>
>  struct bad_iret_stack {
>         void *error_entry_ret;
> +       void *padding;
>         struct pt_regs regs;
>  };
>
> --
> Email: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Kernel]     [Gnu Classpath]     [Gnu Crypto]     [DM Crypt]     [Netfilter]     [Bugtraq]

  Powered by Linux