I recently applied the patch https://patchwork.kernel.org/patch/9468391/ and ended up with a boot crash when it tried to run the x86 chacha20 code. It turned out that the patch changed a manually aligned stack buffer to one that is aligned by gcc.

What was happening is that gcc can align the stack to any boundary on x86-64 except 16: for 16-byte alignment it simply assumes the stack is already 16-byte aligned, which is not actually the case in the kernel. The x86-64 CPU does try to keep the stack 16-byte aligned, e.g. it will do so when an IRQ comes in, but in the kernel the assumption still breaks, mostly because struct pt_regs, which lives near the top of the stack, is 168 bytes, which is not a multiple of 16.

This patch tries to fix that by adding 8 bytes of padding at the top of the call chain involving pt_regs, so that when we later call a C function we do so with an aligned stack.

The same problem probably exists on i386 too, since gcc also assumes 16-byte alignment there. It is harder to fix, however, as the CPU doesn't help us in the IRQ case.
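For illustration only (this snippet is not part of the patch, and the function name is made up), here is a minimal userspace sketch of the kind of buffer involved. gcc places the 16-byte-aligned local at a fixed offset from the frame and does not emit an explicit "andq $-16, %rsp", because it assumes %rsp was already 16-byte aligned at the call. In userspace the psABI guarantees that, so this always prints 0; in the kernel the entry stack was only 8-byte aligned, so the same assumption left the buffer misaligned, which is how aligned SSE accesses such as the ones in the chacha20 code can fault:

#include <stdint.h>
#include <stdio.h>

static void chacha_like_user(void)
{
	/* gcc aligns this relative to an assumed 16-byte-aligned %rsp
	 * rather than realigning the stack itself. */
	uint32_t state[16] __attribute__((aligned(16)));

	printf("state %% 16 = %lu\n", (unsigned long)((uintptr_t)state % 16));
}

int main(void)
{
	chacha_like_user();
	return 0;
}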
Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d3..29d3bcb 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -59,39 +59,42 @@
 /*
  * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
  * unless syscall needs a complete, fully filled "struct pt_regs".
+ *
+ * Note we add 8 extra bytes at the beginning to preserve stack alignment.
  */
-#define R15		0*8
-#define R14		1*8
-#define R13		2*8
-#define R12		3*8
-#define RBP		4*8
-#define RBX		5*8
+#define R15		1*8
+#define R14		2*8
+#define R13		3*8
+#define R12		4*8
+#define RBP		5*8
+#define RBX		6*8
 
 /* These regs are callee-clobbered. Always saved on kernel entry. */
-#define R11		6*8
-#define R10		7*8
-#define R9		8*8
-#define R8		9*8
-#define RAX		10*8
-#define RCX		11*8
-#define RDX		12*8
-#define RSI		13*8
-#define RDI		14*8
+#define R11		7*8
+#define R10		8*8
+#define R9		9*8
+#define R8		10*8
+#define RAX		11*8
+#define RCX		12*8
+#define RDX		13*8
+#define RSI		14*8
+#define RDI		15*8
 
 /*
  * On syscall entry, this is syscall#. On CPU exception, this is error code.
  * On hw interrupt, it's IRQ number:
  */
-#define ORIG_RAX	15*8
+#define ORIG_RAX	16*8
 /* Return frame for iretq */
-#define RIP		16*8
-#define CS		17*8
-#define EFLAGS		18*8
-#define RSP		19*8
-#define SS		20*8
+#define RIP		17*8
+#define CS		18*8
+#define EFLAGS		19*8
+#define RSP		20*8
+#define SS		21*8
+/* Note that this excludes the 8-byte padding. */
 #define SIZEOF_PTREGS	21*8
 
 .macro ALLOC_PT_GPREGS_ON_STACK
-	addq	$-(15*8), %rsp
+	addq	$-(16*8), %rsp
 .endm
 .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
@@ -114,7 +117,7 @@
 	movq %rdi, 14*8+\offset(%rsp)
 	.endm
 	.macro SAVE_C_REGS offset=0
-	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
+	SAVE_C_REGS_HELPER 8+\offset, 1, 1, 1, 1
 	.endm
 	.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
 	SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
@@ -130,43 +133,43 @@
 	.endm
 	.macro SAVE_EXTRA_REGS offset=0
-	movq %r15, 0*8+\offset(%rsp)
-	movq %r14, 1*8+\offset(%rsp)
-	movq %r13, 2*8+\offset(%rsp)
-	movq %r12, 3*8+\offset(%rsp)
-	movq %rbp, 4*8+\offset(%rsp)
-	movq %rbx, 5*8+\offset(%rsp)
+	movq %r15, 1*8+\offset(%rsp)
+	movq %r14, 2*8+\offset(%rsp)
+	movq %r13, 3*8+\offset(%rsp)
+	movq %r12, 4*8+\offset(%rsp)
+	movq %rbp, 5*8+\offset(%rsp)
+	movq %rbx, 6*8+\offset(%rsp)
 	.endm
 	.macro RESTORE_EXTRA_REGS offset=0
-	movq 0*8+\offset(%rsp), %r15
-	movq 1*8+\offset(%rsp), %r14
-	movq 2*8+\offset(%rsp), %r13
-	movq 3*8+\offset(%rsp), %r12
-	movq 4*8+\offset(%rsp), %rbp
-	movq 5*8+\offset(%rsp), %rbx
+	movq 1*8+\offset(%rsp), %r15
+	movq 2*8+\offset(%rsp), %r14
+	movq 3*8+\offset(%rsp), %r13
+	movq 4*8+\offset(%rsp), %r12
+	movq 5*8+\offset(%rsp), %rbp
+	movq 6*8+\offset(%rsp), %rbx
 	.endm
 	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
 	.if \rstor_r11
-	movq 6*8(%rsp), %r11
+	movq 7*8(%rsp), %r11
 	.endif
 	.if \rstor_r8910
-	movq 7*8(%rsp), %r10
-	movq 8*8(%rsp), %r9
-	movq 9*8(%rsp), %r8
+	movq 8*8(%rsp), %r10
+	movq 9*8(%rsp), %r9
+	movq 10*8(%rsp), %r8
 	.endif
 	.if \rstor_rax
-	movq 10*8(%rsp), %rax
+	movq 11*8(%rsp), %rax
 	.endif
 	.if \rstor_rcx
-	movq 11*8(%rsp), %rcx
+	movq 12*8(%rsp), %rcx
 	.endif
 	.if \rstor_rdx
-	movq 12*8(%rsp), %rdx
+	movq 13*8(%rsp), %rdx
 	.endif
-	movq 13*8(%rsp), %rsi
-	movq 14*8(%rsp), %rdi
+	movq 14*8(%rsp), %rsi
+	movq 15*8(%rsp), %rdi
 	.endm
 	.macro RESTORE_C_REGS
 	RESTORE_C_REGS_HELPER 1,1,1,1,1
@@ -185,7 +188,7 @@
 	.endm
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
-	subq $-(15*8+\addskip), %rsp
+	subq $-(16*8+\addskip), %rsp
 	.endm
 	.macro icebp
@@ -203,11 +206,7 @@
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
 #ifdef CONFIG_FRAME_POINTER
-	.if \ptregs_offset
-	leaq \ptregs_offset(%rsp), %rbp
-	.else
-	mov %rsp, %rbp
-	.endif
+	leaq 8+\ptregs_offset(%rsp), %rbp
 	orq	$0x1, %rbp
 #endif
 .endm
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 5b21970..880bbb8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -168,7 +168,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r9			/* pt_regs->r9 */
 	pushq	%r10			/* pt_regs->r10 */
 	pushq	%r11			/* pt_regs->r11 */
-	sub	$(6*8), %rsp		/* pt_regs->bp, bx, r12-15 not saved */
+	sub	$(7*8), %rsp		/* pt_regs->bp, bx, r12-15 not saved */
 
 	/*
 	 * If we need to do entry work or if we guess we'll need to do
@@ -234,14 +234,14 @@ entry_SYSCALL_64_fastpath:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	jmp	return_from_SYSCALL_64
 
 entry_SYSCALL64_slow_path:
 	/* IRQs are off. */
 	SAVE_EXTRA_REGS
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	call	do_syscall_64		/* returns with IRQs disabled */
 
 return_from_SYSCALL_64:
@@ -342,9 +342,9 @@ ENTRY(stub_ptregs_64)
 	 * Called from fast path -- disable IRQs again, pop return address
 	 * and jump to slow path
 	 */
+	popq	%rax
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	popq	%rax
 	jmp	entry_SYSCALL64_slow_path
 
 1:
@@ -409,13 +409,14 @@ END(__switch_to_asm)
  */
 ENTRY(ret_from_fork)
 	movq	%rax, %rdi
+	subq	$8, %rsp
 	call	schedule_tail		/* rdi: 'prev' task parameter */
 
 	testq	%rbx, %rbx		/* from kernel_thread? */
 	jnz	1f			/* kernel threads are uncommon */
 
 2:
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
 	SWAPGS
@@ -494,10 +495,12 @@ END(irq_entries_start)
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-	movq	%rsp, %rdi
+	movq	%rsp, %rax
+	leaq	8(%rsp), %rdi
 	incl	PER_CPU_VAR(irq_count)
 	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rdi
+	sub	$8, %rsp
+	pushq	%rax
 
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
@@ -527,7 +530,7 @@ ret_from_intr:
 
 	/* Interrupt came from user space */
 GLOBAL(retint_user)
-	mov	%rsp,%rdi
+	leaq	8(%rsp), %rdi
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
 	SWAPGS
@@ -774,7 +777,7 @@ ENTRY(\sym)
 	.endif
 	.endif
 
-	movq	%rsp, %rdi		/* pt_regs pointer */
+	leaq	8(%rsp), %rdi		/* pt_regs pointer */
 
 	.if \has_error_code
 	movq	ORIG_RAX(%rsp), %rsi	/* get error code */
@@ -810,11 +813,11 @@ ENTRY(\sym)
 	call	error_entry
 
-	movq	%rsp, %rdi		/* pt_regs pointer */
+	leaq	8(%rsp), %rdi		/* pt_regs pointer */
 	call	sync_regs
-	movq	%rax, %rsp		/* switch stack */
+	leaq	-8(%rax), %rsp		/* switch stack */
 
-	movq	%rsp, %rdi		/* pt_regs pointer */
+	movq	%rax, %rdi		/* pt_regs pointer */
 
 	.if \has_error_code
 	movq	ORIG_RAX(%rsp), %rsi	/* get error code */
@@ -895,6 +898,7 @@ ENTRY(do_softirq_own_stack)
 	mov	%rsp, %rbp
 	incl	PER_CPU_VAR(irq_count)
 	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
+	sub	$8, %rsp
 	push	%rbp			/* frame pointer backlink */
 	call	__do_softirq
 	leaveq
@@ -924,10 +928,11 @@ ENTRY(xen_do_hypervisor_callback)	/* do_hypervisor_callback(struct *pt_regs) */
 	 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
 	 * see the correct pointer to the pt_regs
 	 */
-	movq	%rdi, %rsp		/* we don't return, adjust the stack frame */
+	leaq	-8(%rdi), %rsp		/* we don't return, adjust the stack frame */
 11:	incl	PER_CPU_VAR(irq_count)
 	movq	%rsp, %rbp
 	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
+	subq	$8, %rsp
 	pushq	%rbp			/* frame pointer backlink */
 	call	xen_evtchn_do_upcall
 	popq	%rsp
@@ -1264,6 +1269,7 @@ ENTRY(nmi)
 	 */
 
 	movq	%rsp, %rdi
+	subq	$8, %rsp
 	movq	$-1, %rsi
 	call	do_nmi
@@ -1475,7 +1481,7 @@ end_repeat_nmi:
 	call	paranoid_entry
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	movq	$-1, %rsi
 	call	do_nmi
@@ -1519,7 +1525,7 @@ ENTRY(rewind_stack_do_exit)
 	xorl	%ebp, %ebp
 
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
-	leaq	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+	leaq	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE-8(%rax), %rsp
 
 	call	do_exit
 1:	jmp 1b
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721da..7d3f1e3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -89,6 +89,8 @@ ENTRY(entry_SYSENTER_compat)
 	pushq	$0			/* pt_regs->r13 = 0 */
 	pushq	$0			/* pt_regs->r14 = 0 */
 	pushq	$0			/* pt_regs->r15 = 0 */
+
+	subq	$8, %rsp
 	cld
 
 	/*
@@ -120,7 +122,7 @@ ENTRY(entry_SYSENTER_compat)
 	 */
 	TRACE_IRQS_OFF
 
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
 	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
@@ -215,13 +217,15 @@ ENTRY(entry_SYSCALL_compat)
 	pushq	$0			/* pt_regs->r14 = 0 */
 	pushq	$0			/* pt_regs->r15 = 0 */
 
+	subq	$8, %rsp
+
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
 	 */
 	TRACE_IRQS_OFF
 
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	call	do_fast_syscall_32
 	/* XEN PV guests always use IRET path */
 	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
@@ -324,6 +328,8 @@ ENTRY(entry_INT80_compat)
 	pushq	%r13			/* pt_regs->r13 */
 	pushq	%r14			/* pt_regs->r14 */
 	pushq	%r15			/* pt_regs->r15 */
+
+	subq	$8, %rsp
 	cld
 
 	/*
@@ -332,7 +338,7 @@ ENTRY(entry_INT80_compat)
 	 */
 	TRACE_IRQS_OFF
 
-	movq	%rsp, %rdi
+	leaq	8(%rsp), %rdi
 	call	do_int80_syscall_32
 .Lsyscall_32_done:
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index be36bf4..3c80aac 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -33,6 +33,7 @@
 	movq 8(%rbp), %rdi
 	.endif
 
+	sub	$8, %rsp
 	call \func
 	jmp  .L_restore
 	_ASM_NOKPROBE(\name)
@@ -58,6 +59,7 @@
  || defined(CONFIG_DEBUG_LOCK_ALLOC) \
  || defined(CONFIG_PREEMPT)
 .L_restore:
+	add	$8, %rsp
 	popq %r11
 	popq %r10
 	popq %r9
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b467b14..d03ab72 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -384,6 +384,8 @@ early_idt_handler_common:
 	pushq %r14				/* pt_regs->r14 */
 	pushq %r15				/* pt_regs->r15 */
 
+	sub	$8, %rsp
+
 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
 	GET_CR2_INTO(%rdi)	/* Can clobber any volatile register if pv */
@@ -392,7 +394,7 @@ early_idt_handler_common:
 	jz 20f			/* All good */
 
 10:
-	movq %rsp,%rdi		/* RDI = pt_regs; RSI is already trapnr */
+	leaq 8(%rsp), %rdi	/* RDI = pt_regs; RSI is already trapnr */
 	call early_fixup_exception
 
 20:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf0c6d0..2af9f81 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -590,6 +590,7 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 
 struct bad_iret_stack {
 	void *error_entry_ret;
+	void *padding;
 	struct pt_regs regs;
 };
-- 
Email: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt