On Tue, Aug 03, 2021 at 09:32:21PM -0700, 'Ira Weiny' wrote:
> From: Ira Weiny <ira.weiny@xxxxxxxxx>
>
> The PKRS MSR is not managed by XSAVE.  It is preserved through a context
> switch, but this support leaves exception handling code open to memory
> accesses during exceptions.
>
> Two possible places for preserving this state were considered,
> irqentry_state_t or pt_regs.[1]  pt_regs was much more complicated and
> was potentially fraught with unintended consequences.[2]  However, Andy
> came up with a way to hide additional values on the stack which could be
> accessed as "extended_pt_regs".[3]

Andy,

I'm preparing to send V8 of this PKS work.  But I have not seen any
feedback since I originally implemented this in V4[1].

Does this meet your expectations?  Are there any issues you can see with
this code?

I would appreciate your feedback.

Thanks,
Ira

[1] https://lore.kernel.org/lkml/20210322053020.2287058-9-ira.weiny@xxxxxxxxx/

> This method allows for: any place
> which has struct pt_regs can get access to the extra information; no
> extra information is added to irq_state; and pt_regs is left intact for
> compatibility with outside tools like BPF.
>
> To simplify, the assembly code only adds space on the stack.  The
> setting or use of any needed values is left to the C code.  While some
> entry points may not use this space, it is still added wherever pt_regs
> is passed to the C code, for consistency.
>
> Each nested exception gets another copy of this extended space, allowing
> for any number of levels of exception handling.
>
> In the assembly, a macro is defined to allow a central place to add
> space for other uses should the need arise.
>
> Finally, export pkrs_{save|restore}_irq to the common code to allow
> it to preserve the current task's PKRS in the new extended pt_regs if
> enabled.
>
> Peter, Thomas, Andy, Dave, and Dan all suggested parts of the patch or
> aided in the development of the patch.
>
> [1] https://lore.kernel.org/lkml/CALCETrVe1i5JdyzD_BcctxQJn+ZE3T38EFPgjxN1F577M36g+w@xxxxxxxxxxxxxx/
> [2] https://lore.kernel.org/lkml/874kpxx4jf.fsf@xxxxxxxxxxxxxxxxxxxxxxx/#t
> [3] https://lore.kernel.org/lkml/CALCETrUHwZPic89oExMMe-WyDY8-O3W68NcZvse3=PGW+iW5=w@xxxxxxxxxxxxxx/
>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Suggested-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Suggested-by: Andy Lutomirski <luto@xxxxxxxxxx>
> Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx>
>
> ---
> Changes for V7:
>         Rebased to 5.14 entry code
>         declare write_pkrs() in pks.h
>         s/INIT_PKRS_VALUE/pkrs_init_value
>         Remove unnecessary INIT_PKRS_VALUE def
>         s/pkrs_save_set_irq/pkrs_save_irq/
>         The initial value for exceptions is best managed
>         completely within the pkey code
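(A quick reference while reviewing: the hunks below lay the extra state
out as sketched here.  This is only a restatement of the patch, using its
own names; nothing in it is new code.)

        /*
         * Stack layout once the entry asm has reserved
         * EXTENDED_PT_REGS_SIZE (8 bytes) below the normal frame:
         *
         *        (higher addresses)
         *     +--------------------------+
         *     |         pt_regs          |  <-- regs handed to C handlers
         *     +--------------------------+
         *     | u32 pad                  |
         *     | u32 thread_pkrs          |  <-- extended_pt_regs(regs)
         *     +--------------------------+
         *        (lower addresses)
         */
        struct extended_pt_regs {
                u32 thread_pkrs;        /* PKRS value saved for this level */
                u32 pad;                /* keep the stack 8 byte aligned */
                struct pt_regs pt_regs; /* the regular frame, unchanged */
        };

        /* Any code holding a struct pt_regs * can reach the extra slot: */
        static inline struct extended_pt_regs *extended_pt_regs(struct pt_regs *regs)
        {
                return container_of(regs, struct extended_pt_regs, pt_regs);
        }

The constant used by the asm and the struct are tied together by
EXTENDED_PT_REGS_SIZE == sizeof(struct extended_pt_regs) - sizeof(struct pt_regs),
which the BUILD_BUG_ON() in pkrs_save_irq() enforces.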
> ---
>  arch/x86/entry/calling.h               | 26 +++++++++++++
>  arch/x86/entry/common.c                | 54 ++++++++++++++++++++++++++
>  arch/x86/entry/entry_64.S              | 22 ++++++-----
>  arch/x86/entry/entry_64_compat.S       |  6 +--
>  arch/x86/include/asm/pks.h             | 18 +++++++++
>  arch/x86/include/asm/processor-flags.h |  2 +
>  arch/x86/kernel/head_64.S              |  7 ++--
>  arch/x86/mm/fault.c                    |  3 ++
>  include/linux/pkeys.h                  | 11 +++++-
>  kernel/entry/common.c                  | 14 ++++++-
>  10 files changed, 143 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index a4c061fb7c6e..a2f94677c3fd 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -63,6 +63,32 @@ For 32-bit we have the following conventions - kernel is built with
>   * for assembly code:
>   */
>
> +/*
> + * __call_ext_ptregs - Helper macro to call into C with extended pt_regs
> + * @cfunc: C function to be called
> + *
> + * This will ensure that extended_ptregs is added and removed as needed during
> + * a call into C code.
> + */
> +.macro __call_ext_ptregs cfunc annotate_retpoline_safe:req
> +#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
> +        /* add space for extended_pt_regs */
> +        subq    $EXTENDED_PT_REGS_SIZE, %rsp
> +#endif
> +        .if \annotate_retpoline_safe == 1
> +                ANNOTATE_RETPOLINE_SAFE
> +        .endif
> +        call    \cfunc
> +#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
> +        /* remove space for extended_pt_regs */
> +        addq    $EXTENDED_PT_REGS_SIZE, %rsp
> +#endif
> +.endm
> +
> +.macro call_ext_ptregs cfunc
> +        __call_ext_ptregs \cfunc, annotate_retpoline_safe=0
> +.endm
> +
>  .macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0
>          .if \save_ret
>          pushq   %rsi            /* pt_regs->si */
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 6c2826417b33..a0d1d5519dba 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -19,6 +19,7 @@
>  #include <linux/nospec.h>
>  #include <linux/syscalls.h>
>  #include <linux/uaccess.h>
> +#include <linux/pkeys.h>
>
>  #ifdef CONFIG_XEN_PV
>  #include <xen/xen-ops.h>
> @@ -34,6 +35,7 @@
>  #include <asm/io_bitmap.h>
>  #include <asm/syscall.h>
>  #include <asm/irq_stack.h>
> +#include <asm/pks.h>
>
>  #ifdef CONFIG_X86_64
>
> @@ -252,6 +254,56 @@ SYSCALL_DEFINE0(ni_syscall)
>          return -ENOSYS;
>  }
>
> +#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
> +
> +void show_extended_regs_oops(struct pt_regs *regs, unsigned long error_code)
> +{
> +        struct extended_pt_regs *ept_regs = extended_pt_regs(regs);
> +
> +        if (cpu_feature_enabled(X86_FEATURE_PKS) && (error_code & X86_PF_PK))
> +                pr_alert("PKRS: 0x%x\n", ept_regs->thread_pkrs);
> +}
> +
> +/*
> + * PKRS is a per-logical-processor MSR which overlays additional protection for
> + * pages which have been mapped with a protection key.
> + *
> + * Context switches save the MSR in the task struct thus taking that value to
> + * other processors if necessary.
> + *
> + * To protect against exceptions having access to this memory save the current
> + * thread value and set the PKRS value to be used during the exception.
> + */
> +void pkrs_save_irq(struct pt_regs *regs)
> +{
> +        struct extended_pt_regs *ept_regs;
> +
> +        BUILD_BUG_ON(sizeof(struct extended_pt_regs)
> +                     != EXTENDED_PT_REGS_SIZE
> +                        + sizeof(struct pt_regs));
> +
> +        if (!cpu_feature_enabled(X86_FEATURE_PKS))
> +                return;
> +
> +        ept_regs = extended_pt_regs(regs);
> +        ept_regs->thread_pkrs = current->thread.saved_pkrs;
> +        write_pkrs(pkrs_init_value);
> +}
> +
> +void pkrs_restore_irq(struct pt_regs *regs)
> +{
> +        struct extended_pt_regs *ept_regs;
> +
> +        if (!cpu_feature_enabled(X86_FEATURE_PKS))
> +                return;
> +
> +        ept_regs = extended_pt_regs(regs);
> +        write_pkrs(ept_regs->thread_pkrs);
> +        current->thread.saved_pkrs = ept_regs->thread_pkrs;
> +}
> +
> +#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> +
>  #ifdef CONFIG_XEN_PV
>  #ifndef CONFIG_PREEMPTION
>  /*
> @@ -309,6 +361,8 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
>
>          inhcall = get_and_clear_inhcall();
>          if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
> +                /* Normally called by irqentry_exit, restore pkrs here */
> +                pkrs_restore_irq(regs);
>                  irqentry_exit_cond_resched();
>                  instrumentation_end();
>                  restore_inhcall(inhcall);
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index e38a4cf795d9..1c390975a3de 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -332,7 +332,7 @@ SYM_CODE_END(ret_from_fork)
>                  movq    $-1, ORIG_RAX(%rsp)     /* no syscall to restart */
>          .endif
>
> -        call    \cfunc
> +        call_ext_ptregs \cfunc
>
>          jmp     error_return
>  .endm
> @@ -435,7 +435,7 @@ SYM_CODE_START(\asmsym)
>
>          movq    %rsp, %rdi              /* pt_regs pointer */
>
> -        call    \cfunc
> +        call_ext_ptregs \cfunc
>
>          jmp     paranoid_exit
>
> @@ -496,7 +496,7 @@ SYM_CODE_START(\asmsym)
>           * stack.
>           */
>          movq    %rsp, %rdi              /* pt_regs pointer */
> -        call    vc_switch_off_ist
> +        call_ext_ptregs vc_switch_off_ist
>          movq    %rax, %rsp              /* Switch to new stack */
>
>          UNWIND_HINT_REGS
> @@ -507,7 +507,7 @@ SYM_CODE_START(\asmsym)
>
>          movq    %rsp, %rdi              /* pt_regs pointer */
>
> -        call    kernel_\cfunc
> +        call_ext_ptregs kernel_\cfunc
>
>          /*
>           * No need to switch back to the IST stack. The current stack is either
> @@ -542,7 +542,7 @@ SYM_CODE_START(\asmsym)
>          movq    %rsp, %rdi              /* pt_regs pointer into first argument */
>          movq    ORIG_RAX(%rsp), %rsi    /* get error code into 2nd argument*/
>          movq    $-1, ORIG_RAX(%rsp)     /* no syscall to restart */
> -        call    \cfunc
> +        call_ext_ptregs \cfunc
>
>          jmp     paranoid_exit
>
> @@ -781,7 +781,7 @@ SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
>          movq    %rdi, %rsp                      /* we don't return, adjust the stack frame */
>          UNWIND_HINT_REGS
>
> -        call    xen_pv_evtchn_do_upcall
> +        call_ext_ptregs xen_pv_evtchn_do_upcall
>
>          jmp     error_return
>  SYM_CODE_END(exc_xen_hypervisor_callback)
> @@ -987,7 +987,7 @@ SYM_CODE_START_LOCAL(error_entry)
>          /* Put us onto the real thread stack. */
>          popq    %r12                            /* save return addr in %12 */
>          movq    %rsp, %rdi                      /* arg0 = pt_regs pointer */
> -        call    sync_regs
> +        call_ext_ptregs sync_regs
>          movq    %rax, %rsp                      /* switch stack */
>          ENCODE_FRAME_POINTER
>          pushq   %r12
> @@ -1042,7 +1042,7 @@ SYM_CODE_START_LOCAL(error_entry)
>           * as if we faulted immediately after IRET.
>           */
>          mov     %rsp, %rdi
> -        call    fixup_bad_iret
> +        call_ext_ptregs fixup_bad_iret
>          mov     %rax, %rsp
>          jmp     .Lerror_entry_from_usermode_after_swapgs
>  SYM_CODE_END(error_entry)
> @@ -1148,7 +1148,7 @@ SYM_CODE_START(asm_exc_nmi)
>
>          movq    %rsp, %rdi
>          movq    $-1, %rsi
> -        call    exc_nmi
> +        call_ext_ptregs exc_nmi
>
>          /*
>           * Return back to user mode.  We must *not* do the normal exit
> @@ -1184,6 +1184,8 @@ SYM_CODE_START(asm_exc_nmi)
>           * +---------------------------------------------------------+
>           * | pt_regs                                                  |
>           * +---------------------------------------------------------+
> +         * | (Optionally) extended_pt_regs                            |
> +         * +---------------------------------------------------------+
>           *
>           * The "original" frame is used by hardware.  Before re-enabling
>           * NMIs, we need to be done with it, and we need to leave enough
> @@ -1360,7 +1362,7 @@ end_repeat_nmi:
>
>          movq    %rsp, %rdi
>          movq    $-1, %rsi
> -        call    exc_nmi
> +        call_ext_ptregs exc_nmi
>
>          /* Always restore stashed CR3 value (see paranoid_entry) */
>          RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
> diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
> index 0051cf5c792d..53254d29d5c7 100644
> --- a/arch/x86/entry/entry_64_compat.S
> +++ b/arch/x86/entry/entry_64_compat.S
> @@ -136,7 +136,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
>  .Lsysenter_flags_fixed:
>
>          movq    %rsp, %rdi
> -        call    do_SYSENTER_32
> +        call_ext_ptregs do_SYSENTER_32
>          /* XEN PV guests always use IRET path */
>          ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
>                      "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
> @@ -253,7 +253,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
>          UNWIND_HINT_REGS
>
>          movq    %rsp, %rdi
> -        call    do_fast_syscall_32
> +        call_ext_ptregs do_fast_syscall_32
>          /* XEN PV guests always use IRET path */
>          ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
>                      "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
> @@ -410,6 +410,6 @@ SYM_CODE_START(entry_INT80_compat)
>          cld
>
>          movq    %rsp, %rdi
> -        call    do_int80_syscall_32
> +        call_ext_ptregs do_int80_syscall_32
>          jmp     swapgs_restore_regs_and_return_to_usermode
>  SYM_CODE_END(entry_INT80_compat)
> diff --git a/arch/x86/include/asm/pks.h b/arch/x86/include/asm/pks.h
> index e7727086cec2..76960ec71b4b 100644
> --- a/arch/x86/include/asm/pks.h
> +++ b/arch/x86/include/asm/pks.h
> @@ -4,15 +4,33 @@
>
>  #ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
>
> +struct extended_pt_regs {
> +        u32 thread_pkrs;
> +        /* Keep stack 8 byte aligned */
> +        u32 pad;
> +        struct pt_regs pt_regs;
> +};
> +
>  void setup_pks(void);
>  void pkrs_write_current(void);
>  void pks_init_task(struct task_struct *task);
> +void write_pkrs(u32 new_pkrs);
> +
> +static inline struct extended_pt_regs *extended_pt_regs(struct pt_regs *regs)
> +{
> +        return container_of(regs, struct extended_pt_regs, pt_regs);
> +}
> +
> +void show_extended_regs_oops(struct pt_regs *regs, unsigned long error_code);
>
>  #else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
>
>  static inline void setup_pks(void) { }
>  static inline void pkrs_write_current(void) { }
>  static inline void pks_init_task(struct task_struct *task) { }
> +static inline void write_pkrs(u32 new_pkrs) { }
> +static inline void show_extended_regs_oops(struct pt_regs *regs,
> +                                           unsigned long error_code) { }
>
>  #endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
>
> diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
> index 02c2cbda4a74..4a41fc4cf028 100644
> --- a/arch/x86/include/asm/processor-flags.h
> +++ b/arch/x86/include/asm/processor-flags.h
> @@ -53,4 +53,6 @@
>  # define X86_CR3_PTI_PCID_USER_BIT      11
>  #endif
>
> +#define EXTENDED_PT_REGS_SIZE   8
> +
>  #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index d8b3ebd2bb85..90e76178b6b4 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -319,8 +319,7 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
>          movq    %rsp, %rdi
>          movq    ORIG_RAX(%rsp), %rsi
>          movq    initial_vc_handler(%rip), %rax
> -        ANNOTATE_RETPOLINE_SAFE
> -        call    *%rax
> +        __call_ext_ptregs *%rax, annotate_retpoline_safe=1
>
>          /* Unwind pt_regs */
>          POP_REGS
> @@ -397,7 +396,7 @@ SYM_CODE_START_LOCAL(early_idt_handler_common)
>          UNWIND_HINT_REGS
>
>          movq    %rsp,%rdi               /* RDI = pt_regs; RSI is already trapnr */
> -        call    do_early_exception
> +        call_ext_ptregs do_early_exception
>
>          decl    early_recursion_flag(%rip)
>          jmp     restore_regs_and_return_to_kernel
> @@ -421,7 +420,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
>          /* Call C handler */
>          movq    %rsp, %rdi
>          movq    ORIG_RAX(%rsp), %rsi
> -        call    do_vc_no_ghcb
> +        call_ext_ptregs do_vc_no_ghcb
>
>          /* Unwind pt_regs */
>          POP_REGS
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index e133c0ed72a0..a4ce7cef0260 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -32,6 +32,7 @@
>  #include <asm/pgtable_areas.h>          /* VMALLOC_START, ...           */
>  #include <asm/kvm_para.h>               /* kvm_handle_async_pf          */
>  #include <asm/vdso.h>                   /* fixup_vdso_exception()       */
> +#include <asm/pks.h>
>
>  #define CREATE_TRACE_POINTS
>  #include <asm/trace/exceptions.h>
> @@ -547,6 +548,8 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
>                   (error_code & X86_PF_PK)    ? "protection keys violation" :
>                                                 "permissions violation");
>
> +        show_extended_regs_oops(regs, error_code);
> +
>          if (!(error_code & X86_PF_USER) && user_mode(regs)) {
>                  struct desc_ptr idt, gdt;
>                  u16 ldtr, tr;
> diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
> index 580238388f0c..76eb19a37942 100644
> --- a/include/linux/pkeys.h
> +++ b/include/linux/pkeys.h
> @@ -52,6 +52,15 @@ enum pks_pkey_consumers {
>          PKS_KEY_NR_CONSUMERS
>  };
>  extern u32 pkrs_init_value;
> -#endif
> +
> +void pkrs_save_irq(struct pt_regs *regs);
> +void pkrs_restore_irq(struct pt_regs *regs);
> +
> +#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> +
> +static inline void pkrs_save_irq(struct pt_regs *regs) { }
> +static inline void pkrs_restore_irq(struct pt_regs *regs) { }
> +
> +#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
>
>  #endif /* _LINUX_PKEYS_H */
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index bf16395b9e13..aa0b1e8dd742 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -6,6 +6,7 @@
>  #include <linux/livepatch.h>
>  #include <linux/audit.h>
>  #include <linux/tick.h>
> +#include <linux/pkeys.h>
>
>  #include "common.h"
>
> @@ -364,7 +365,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
>                  instrumentation_end();
>
>                  ret.exit_rcu = true;
> -                return ret;
> +                goto done;
>          }
>
>          /*
> @@ -379,6 +380,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
>          trace_hardirqs_off_finish();
>          instrumentation_end();
>
> +done:
> +        pkrs_save_irq(regs);
>          return ret;
>  }
>
> @@ -404,7 +407,12 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
>          /* Check whether this returns to user mode */
>          if (user_mode(regs)) {
>                  irqentry_exit_to_user_mode(regs);
> -        } else if (!regs_irqs_disabled(regs)) {
> +                return;
> +        }
> +
> +        pkrs_restore_irq(regs);
> +
> +        if (!regs_irqs_disabled(regs)) {
>                  /*
>                   * If RCU was not watching on entry this needs to be done
>                   * carefully and needs the same ordering of lockdep/tracing
> @@ -458,11 +466,13 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
>          ftrace_nmi_enter();
>          instrumentation_end();
>
> +        pkrs_save_irq(regs);
>          return irq_state;
>  }
>
>  void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
>  {
> +        pkrs_restore_irq(regs);
>          instrumentation_begin();
>          ftrace_nmi_exit();
>          if (irq_state.lockdep) {
> --
> 2.28.0.rc0.12.gb6a658bd00c9
>
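To restate how the pieces are meant to pair up (again, only a sketch of
the flow the hunks above already implement; the nesting depth is
arbitrary):

        /*
         * exception A arrives while task T runs:
         *   irqentry_enter(regsA)
         *     pkrs_save_irq(regsA):
         *       extended_pt_regs(regsA)->thread_pkrs = T->thread.saved_pkrs;
         *       write_pkrs(pkrs_init_value);
         *   ... handler runs; a nested exception B repeats the same steps
         *       against its own extended frame for regsB ...
         *   irqentry_exit(regsA)
         *     pkrs_restore_irq(regsA):
         *       write_pkrs(extended_pt_regs(regsA)->thread_pkrs);
         *       T->thread.saved_pkrs = extended_pt_regs(regsA)->thread_pkrs;
         */

Because every nesting level reserves its own extended_pt_regs, unwinding
always restores the PKRS value that was live when that level was entered.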