Re: [PATCH V7 08/18] x86/entry: Preserve PKRS MSR across exceptions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Aug 03, 2021 at 09:32:21PM -0700, 'Ira Weiny' wrote:
> From: Ira Weiny <ira.weiny@xxxxxxxxx>
> 
> The PKRS MSR is not managed by XSAVE.  It is preserved through a context
> switch but this support leaves exception handling code open to memory
> accesses during exceptions.
> 
> 2 possible places for preserving this state were considered,
> irqentry_state_t or pt_regs.[1]  pt_regs was much more complicated and
> was potentially fraught with unintended consequences.[2]  However, Andy
> came up with a way to hide additional values on the stack which could be
> accessed as "extended_pt_regs".[3]

Andy,

I'm preparing to send V8 of this PKS work.  But I have not seen any feed back
since I originally implemented this in V4[1].

Does this meets your expectations?  Are there any issues you can see with this
code?

I would appreciate your feedback.

Thanks,
Ira

[1] https://lore.kernel.org/lkml/20210322053020.2287058-9-ira.weiny@xxxxxxxxx/

> This method allows for; any place
> which has struct pt_regs can get access to the extra information; no
> extra information is added to irq_state; and pt_regs is left intact for
> compatibility with outside tools like BPF.
> 
> To simplify, the assembly code only adds space on the stack.  The
> setting or use of any needed values are left to the C code.  While some
> entry points may not use this space it is still added where ever pt_regs
> is passed to the C code for consistency.
> 
> Each nested exception gets another copy of this extended space allowing
> for any number of levels of exception handling.
> 
> In the assembly, a macro is defined to allow a central place to add
> space for other uses should the need arise.
> 
> Finally export pkrs_{save|restore}_irq to the common code to allow
> it to preserve the current task's PKRS in the new extended pt_regs if
> enabled.
> 
> Peter, Thomas, Andy, Dave, and Dan all suggested parts of the patch or
> aided in the development of the patch..
> 
> [1] https://lore.kernel.org/lkml/CALCETrVe1i5JdyzD_BcctxQJn+ZE3T38EFPgjxN1F577M36g+w@xxxxxxxxxxxxxx/
> [2] https://lore.kernel.org/lkml/874kpxx4jf.fsf@xxxxxxxxxxxxxxxxxxxxxxx/#t
> [3] https://lore.kernel.org/lkml/CALCETrUHwZPic89oExMMe-WyDY8-O3W68NcZvse3=PGW+iW5=w@xxxxxxxxxxxxxx/
> 
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Suggested-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Suggested-by: Andy Lutomirski <luto@xxxxxxxxxx>
> Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> 
> ---
> Changes for V7:
> 	Rebased to 5.14 entry code
> 	declare write_pkrs() in pks.h
> 	s/INIT_PKRS_VALUE/pkrs_init_value
> 	Remove unnecessary INIT_PKRS_VALUE def
> 	s/pkrs_save_set_irq/pkrs_save_irq/
> 		The inital value for exceptions is best managed
> 		completely within the pkey code.
> ---
>  arch/x86/entry/calling.h               | 26 +++++++++++++
>  arch/x86/entry/common.c                | 54 ++++++++++++++++++++++++++
>  arch/x86/entry/entry_64.S              | 22 ++++++-----
>  arch/x86/entry/entry_64_compat.S       |  6 +--
>  arch/x86/include/asm/pks.h             | 18 +++++++++
>  arch/x86/include/asm/processor-flags.h |  2 +
>  arch/x86/kernel/head_64.S              |  7 ++--
>  arch/x86/mm/fault.c                    |  3 ++
>  include/linux/pkeys.h                  | 11 +++++-
>  kernel/entry/common.c                  | 14 ++++++-
>  10 files changed, 143 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index a4c061fb7c6e..a2f94677c3fd 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -63,6 +63,32 @@ For 32-bit we have the following conventions - kernel is built with
>   * for assembly code:
>   */
>  
> +/*
> + * __call_ext_ptregs - Helper macro to call into C with extended pt_regs
> + * @cfunc:		C function to be called
> + *
> + * This will ensure that extended_ptregs is added and removed as needed during
> + * a call into C code.
> + */
> +.macro __call_ext_ptregs cfunc annotate_retpoline_safe:req
> +#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
> +	/* add space for extended_pt_regs */
> +	subq    $EXTENDED_PT_REGS_SIZE, %rsp
> +#endif
> +	.if \annotate_retpoline_safe == 1
> +		ANNOTATE_RETPOLINE_SAFE
> +	.endif
> +	call	\cfunc
> +#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
> +	/* remove space for extended_pt_regs */
> +	addq    $EXTENDED_PT_REGS_SIZE, %rsp
> +#endif
> +.endm
> +
> +.macro call_ext_ptregs cfunc
> +	__call_ext_ptregs \cfunc, annotate_retpoline_safe=0
> +.endm
> +
>  .macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0
>  	.if \save_ret
>  	pushq	%rsi		/* pt_regs->si */
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 6c2826417b33..a0d1d5519dba 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -19,6 +19,7 @@
>  #include <linux/nospec.h>
>  #include <linux/syscalls.h>
>  #include <linux/uaccess.h>
> +#include <linux/pkeys.h>
>  
>  #ifdef CONFIG_XEN_PV
>  #include <xen/xen-ops.h>
> @@ -34,6 +35,7 @@
>  #include <asm/io_bitmap.h>
>  #include <asm/syscall.h>
>  #include <asm/irq_stack.h>
> +#include <asm/pks.h>
>  
>  #ifdef CONFIG_X86_64
>  
> @@ -252,6 +254,56 @@ SYSCALL_DEFINE0(ni_syscall)
>  	return -ENOSYS;
>  }
>  
> +#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
> +
> +void show_extended_regs_oops(struct pt_regs *regs, unsigned long error_code)
> +{
> +	struct extended_pt_regs *ept_regs = extended_pt_regs(regs);
> +
> +	if (cpu_feature_enabled(X86_FEATURE_PKS) && (error_code & X86_PF_PK))
> +		pr_alert("PKRS: 0x%x\n", ept_regs->thread_pkrs);
> +}
> +
> +/*
> + * PKRS is a per-logical-processor MSR which overlays additional protection for
> + * pages which have been mapped with a protection key.
> + *
> + * Context switches save the MSR in the task struct thus taking that value to
> + * other processors if necessary.
> + *
> + * To protect against exceptions having access to this memory save the current
> + * thread value and set the PKRS value to be used during the exception.
> + */
> +void pkrs_save_irq(struct pt_regs *regs)
> +{
> +	struct extended_pt_regs *ept_regs;
> +
> +	BUILD_BUG_ON(sizeof(struct extended_pt_regs)
> +			!= EXTENDED_PT_REGS_SIZE
> +				+ sizeof(struct pt_regs));
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_PKS))
> +		return;
> +
> +	ept_regs = extended_pt_regs(regs);
> +	ept_regs->thread_pkrs = current->thread.saved_pkrs;
> +	write_pkrs(pkrs_init_value);
> +}
> +
> +void pkrs_restore_irq(struct pt_regs *regs)
> +{
> +	struct extended_pt_regs *ept_regs;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_PKS))
> +		return;
> +
> +	ept_regs = extended_pt_regs(regs);
> +	write_pkrs(ept_regs->thread_pkrs);
> +	current->thread.saved_pkrs = ept_regs->thread_pkrs;
> +}
> +
> +#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> +
>  #ifdef CONFIG_XEN_PV
>  #ifndef CONFIG_PREEMPTION
>  /*
> @@ -309,6 +361,8 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
>  
>  	inhcall = get_and_clear_inhcall();
>  	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
> +		/* Normally called by irqentry_exit, restore pkrs here */
> +		pkrs_restore_irq(regs);
>  		irqentry_exit_cond_resched();
>  		instrumentation_end();
>  		restore_inhcall(inhcall);
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index e38a4cf795d9..1c390975a3de 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -332,7 +332,7 @@ SYM_CODE_END(ret_from_fork)
>  		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
>  	.endif
>  
> -	call	\cfunc
> +	call_ext_ptregs \cfunc
>  
>  	jmp	error_return
>  .endm
> @@ -435,7 +435,7 @@ SYM_CODE_START(\asmsym)
>  
>  	movq	%rsp, %rdi		/* pt_regs pointer */
>  
> -	call	\cfunc
> +	call_ext_ptregs \cfunc
>  
>  	jmp	paranoid_exit
>  
> @@ -496,7 +496,7 @@ SYM_CODE_START(\asmsym)
>  	 * stack.
>  	 */
>  	movq	%rsp, %rdi		/* pt_regs pointer */
> -	call	vc_switch_off_ist
> +	call_ext_ptregs vc_switch_off_ist
>  	movq	%rax, %rsp		/* Switch to new stack */
>  
>  	UNWIND_HINT_REGS
> @@ -507,7 +507,7 @@ SYM_CODE_START(\asmsym)
>  
>  	movq	%rsp, %rdi		/* pt_regs pointer */
>  
> -	call	kernel_\cfunc
> +	call_ext_ptregs kernel_\cfunc
>  
>  	/*
>  	 * No need to switch back to the IST stack. The current stack is either
> @@ -542,7 +542,7 @@ SYM_CODE_START(\asmsym)
>  	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
>  	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
>  	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
> -	call	\cfunc
> +	call_ext_ptregs \cfunc
>  
>  	jmp	paranoid_exit
>  
> @@ -781,7 +781,7 @@ SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
>  	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
>  	UNWIND_HINT_REGS
>  
> -	call	xen_pv_evtchn_do_upcall
> +	call_ext_ptregs xen_pv_evtchn_do_upcall
>  
>  	jmp	error_return
>  SYM_CODE_END(exc_xen_hypervisor_callback)
> @@ -987,7 +987,7 @@ SYM_CODE_START_LOCAL(error_entry)
>  	/* Put us onto the real thread stack. */
>  	popq	%r12				/* save return addr in %12 */
>  	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
> -	call	sync_regs
> +	call_ext_ptregs sync_regs
>  	movq	%rax, %rsp			/* switch stack */
>  	ENCODE_FRAME_POINTER
>  	pushq	%r12
> @@ -1042,7 +1042,7 @@ SYM_CODE_START_LOCAL(error_entry)
>  	 * as if we faulted immediately after IRET.
>  	 */
>  	mov	%rsp, %rdi
> -	call	fixup_bad_iret
> +	call_ext_ptregs fixup_bad_iret
>  	mov	%rax, %rsp
>  	jmp	.Lerror_entry_from_usermode_after_swapgs
>  SYM_CODE_END(error_entry)
> @@ -1148,7 +1148,7 @@ SYM_CODE_START(asm_exc_nmi)
>  
>  	movq	%rsp, %rdi
>  	movq	$-1, %rsi
> -	call	exc_nmi
> +	call_ext_ptregs exc_nmi
>  
>  	/*
>  	 * Return back to user mode.  We must *not* do the normal exit
> @@ -1184,6 +1184,8 @@ SYM_CODE_START(asm_exc_nmi)
>  	 * +---------------------------------------------------------+
>  	 * | pt_regs                                                 |
>  	 * +---------------------------------------------------------+
> +	 * | (Optionally) extended_pt_regs                           |
> +	 * +---------------------------------------------------------+
>  	 *
>  	 * The "original" frame is used by hardware.  Before re-enabling
>  	 * NMIs, we need to be done with it, and we need to leave enough
> @@ -1360,7 +1362,7 @@ end_repeat_nmi:
>  
>  	movq	%rsp, %rdi
>  	movq	$-1, %rsi
> -	call	exc_nmi
> +	call_ext_ptregs exc_nmi
>  
>  	/* Always restore stashed CR3 value (see paranoid_entry) */
>  	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
> diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
> index 0051cf5c792d..53254d29d5c7 100644
> --- a/arch/x86/entry/entry_64_compat.S
> +++ b/arch/x86/entry/entry_64_compat.S
> @@ -136,7 +136,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
>  .Lsysenter_flags_fixed:
>  
>  	movq	%rsp, %rdi
> -	call	do_SYSENTER_32
> +	call_ext_ptregs do_SYSENTER_32
>  	/* XEN PV guests always use IRET path */
>  	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
>  		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
> @@ -253,7 +253,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
>  	UNWIND_HINT_REGS
>  
>  	movq	%rsp, %rdi
> -	call	do_fast_syscall_32
> +	call_ext_ptregs do_fast_syscall_32
>  	/* XEN PV guests always use IRET path */
>  	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
>  		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
> @@ -410,6 +410,6 @@ SYM_CODE_START(entry_INT80_compat)
>  	cld
>  
>  	movq	%rsp, %rdi
> -	call	do_int80_syscall_32
> +	call_ext_ptregs do_int80_syscall_32
>  	jmp	swapgs_restore_regs_and_return_to_usermode
>  SYM_CODE_END(entry_INT80_compat)
> diff --git a/arch/x86/include/asm/pks.h b/arch/x86/include/asm/pks.h
> index e7727086cec2..76960ec71b4b 100644
> --- a/arch/x86/include/asm/pks.h
> +++ b/arch/x86/include/asm/pks.h
> @@ -4,15 +4,33 @@
>  
>  #ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
>  
> +struct extended_pt_regs {
> +	u32 thread_pkrs;
> +	/* Keep stack 8 byte aligned */
> +	u32 pad;
> +	struct pt_regs pt_regs;
> +};
> +
>  void setup_pks(void);
>  void pkrs_write_current(void);
>  void pks_init_task(struct task_struct *task);
> +void write_pkrs(u32 new_pkrs);
> +
> +static inline struct extended_pt_regs *extended_pt_regs(struct pt_regs *regs)
> +{
> +	return container_of(regs, struct extended_pt_regs, pt_regs);
> +}
> +
> +void show_extended_regs_oops(struct pt_regs *regs, unsigned long error_code);
>  
>  #else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
>  
>  static inline void setup_pks(void) { }
>  static inline void pkrs_write_current(void) { }
>  static inline void pks_init_task(struct task_struct *task) { }
> +static inline void write_pkrs(u32 new_pkrs) { }
> +static inline void show_extended_regs_oops(struct pt_regs *regs,
> +					   unsigned long error_code) { }
>  
>  #endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
>  
> diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
> index 02c2cbda4a74..4a41fc4cf028 100644
> --- a/arch/x86/include/asm/processor-flags.h
> +++ b/arch/x86/include/asm/processor-flags.h
> @@ -53,4 +53,6 @@
>  # define X86_CR3_PTI_PCID_USER_BIT	11
>  #endif
>  
> +#define EXTENDED_PT_REGS_SIZE 8
> +
>  #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index d8b3ebd2bb85..90e76178b6b4 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -319,8 +319,7 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
>  	movq    %rsp, %rdi
>  	movq	ORIG_RAX(%rsp), %rsi
>  	movq	initial_vc_handler(%rip), %rax
> -	ANNOTATE_RETPOLINE_SAFE
> -	call	*%rax
> +	__call_ext_ptregs *%rax, annotate_retpoline_safe=1
>  
>  	/* Unwind pt_regs */
>  	POP_REGS
> @@ -397,7 +396,7 @@ SYM_CODE_START_LOCAL(early_idt_handler_common)
>  	UNWIND_HINT_REGS
>  
>  	movq %rsp,%rdi		/* RDI = pt_regs; RSI is already trapnr */
> -	call do_early_exception
> +	call_ext_ptregs do_early_exception
>  
>  	decl early_recursion_flag(%rip)
>  	jmp restore_regs_and_return_to_kernel
> @@ -421,7 +420,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
>  	/* Call C handler */
>  	movq    %rsp, %rdi
>  	movq	ORIG_RAX(%rsp), %rsi
> -	call    do_vc_no_ghcb
> +	call_ext_ptregs do_vc_no_ghcb
>  
>  	/* Unwind pt_regs */
>  	POP_REGS
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index e133c0ed72a0..a4ce7cef0260 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -32,6 +32,7 @@
>  #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
>  #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
>  #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
> +#include <asm/pks.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <asm/trace/exceptions.h>
> @@ -547,6 +548,8 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
>  		 (error_code & X86_PF_PK)    ? "protection keys violation" :
>  					       "permissions violation");
>  
> +	show_extended_regs_oops(regs, error_code);
> +
>  	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
>  		struct desc_ptr idt, gdt;
>  		u16 ldtr, tr;
> diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
> index 580238388f0c..76eb19a37942 100644
> --- a/include/linux/pkeys.h
> +++ b/include/linux/pkeys.h
> @@ -52,6 +52,15 @@ enum pks_pkey_consumers {
>  	PKS_KEY_NR_CONSUMERS
>  };
>  extern u32 pkrs_init_value;
> -#endif
> +
> +void pkrs_save_irq(struct pt_regs *regs);
> +void pkrs_restore_irq(struct pt_regs *regs);
> +
> +#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> +
> +static inline void pkrs_save_irq(struct pt_regs *regs) { }
> +static inline void pkrs_restore_irq(struct pt_regs *regs) { }
> +
> +#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
>  
>  #endif /* _LINUX_PKEYS_H */
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index bf16395b9e13..aa0b1e8dd742 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -6,6 +6,7 @@
>  #include <linux/livepatch.h>
>  #include <linux/audit.h>
>  #include <linux/tick.h>
> +#include <linux/pkeys.h>
>  
>  #include "common.h"
>  
> @@ -364,7 +365,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
>  		instrumentation_end();
>  
>  		ret.exit_rcu = true;
> -		return ret;
> +		goto done;
>  	}
>  
>  	/*
> @@ -379,6 +380,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
>  	trace_hardirqs_off_finish();
>  	instrumentation_end();
>  
> +done:
> +	pkrs_save_irq(regs);
>  	return ret;
>  }
>  
> @@ -404,7 +407,12 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
>  	/* Check whether this returns to user mode */
>  	if (user_mode(regs)) {
>  		irqentry_exit_to_user_mode(regs);
> -	} else if (!regs_irqs_disabled(regs)) {
> +		return;
> +	}
> +
> +	pkrs_restore_irq(regs);
> +
> +	if (!regs_irqs_disabled(regs)) {
>  		/*
>  		 * If RCU was not watching on entry this needs to be done
>  		 * carefully and needs the same ordering of lockdep/tracing
> @@ -458,11 +466,13 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
>  	ftrace_nmi_enter();
>  	instrumentation_end();
>  
> +	pkrs_save_irq(regs);
>  	return irq_state;
>  }
>  
>  void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
>  {
> +	pkrs_restore_irq(regs);
>  	instrumentation_begin();
>  	ftrace_nmi_exit();
>  	if (irq_state.lockdep) {
> -- 
> 2.28.0.rc0.12.gb6a658bd00c9
> 




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux