On Tue, Dec 27, 2022 at 11:10:31AM -0800, Linus Torvalds wrote:
> On Mon, Dec 26, 2022 at 7:08 PM Kirill A. Shutemov
> <kirill.shutemov@xxxxxxxxxxxxxxx> wrote:
> >
> > --- a/arch/x86/include/asm/uaccess.h
> > +++ b/arch/x86/include/asm/uaccess.h
> > @@ -21,6 +22,37 @@ static inline bool pagefault_disabled(void);
> >  # define WARN_ON_IN_IRQ()
> >  #endif
> >
> > +#ifdef CONFIG_X86_64
>
> I think this should be CONFIG_ADDRESS_MASKING or something like that.
>
> This is not a "64 vs 32-bit feature". This is something else.
>
> Even if you then were to select it unconditionally for 64-bit kernels
> (but why would you?) it reads better if the #ifdef's make sense.

I hoped to get away without a new option. It leads to more ifdeffery,
but well...

> > +#define __untagged_addr(mm, addr)	({				\
> > +	u64 __addr = (__force u64)(addr);				\
> > +	s64 sign = (s64)__addr >> 63;					\
> > +	__addr &= READ_ONCE((mm)->context.untag_mask) | sign;		\
>
> Now the READ_ONCE() doesn't make much sense. There shouldn't be any
> data races on that thing.

True. Removed.

> Plus:
>
> > +#define untagged_addr(addr) __untagged_addr(current->mm, addr)
>
> I think this should at least allow caching it in 'current' without the
> mm indirection.
>
> In fact, it might be even better off as a per-cpu variable.
>
> Because it is now in somewhat critical code sections:
>
> > -#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })
> > +#define get_user(x,ptr)						\
> > +({									\
> > +	might_fault();							\
> > +	do_get_user_call(get_user,x,untagged_ptr(ptr));			\
> > +})
>
> This is disgusting and wrong.
>
> The whole reason we do do_get_user_call() as a function call is
> because we *don't* want to do this kind of stuff at the call sites. We
> used to inline it all, but with all the clac/stac and access_ok
> checks, it all just ended up ballooning so much that it was much
> better to make it a special function call with particular calling
> conventions.
>
> That untagged_ptr() should be done in that asm function, not in every
> call site.
>
> Now, the sad part is that we got *rid* of all this kind of crap not
> that long ago when Christoph cleaned up the old legacy set_fs() mess,
> and we were able to make the task limit be a constant (ok, be _two_
> constants, depending on LA57). So we'd have to re-introduce that nasty
> "look up task size dynamically". See commit 47058bb54b57 ("x86: remove
> address space overrides using set_fs()") for the removal that would
> have to be re-instated.
>
> But see above about "maybe it should be a per-cpu variable" - and
> making that ALTERNATIVE thing even nastier.

I made it a per-cpu variable (outside struct tlb_state, so that it is
visible in modules). __get/put_user_X() now have a single instruction
to untag the address, gated by X86_FEATURE_LAM. Seems reasonable to me.

BTW, am I blind, or do we have no infrastructure to hook up static
branches from assembly? It would be a better fit than ALTERNATIVE here:
it would allow deferring the overhead until the first user of the
feature. Is there a fundamental reason for this, or just no demand?

> Another alternative might be to *only* test the sign bit in the
> get_user/put_user functions, and just take the fault instead. Right
> now we warn about non-canonical addresses because it implies somebody
> might have missed an access_ok(), but we'd just mark those
> get_user/put_user accesses special.
>
> That would get this all entirely off the critical path. Most other
> address masking is for relatively rare things (ie mmap/munmap), but
> the user accesses are hot.
>
> Hmm?

Below is a fixup that is supposed to address your concerns. I will also
extend the selftests to cover get/put_user().
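For anyone following along, the 'sign' trick in __untagged_addr() works
like this (a standalone userspace illustration I wrote for this mail,
not kernel code; the mask assumes LAM_U57's six tag bits in 62:57):

	#include <stdint.h>
	#include <stdio.h>

	/* LAM_U57: bits 62:57 carry the tag, bit 63 stays significant */
	#define UNTAG_MASK	(~(0x3fULL << 57))

	static uint64_t untag(uint64_t addr)
	{
		/* 0 for userspace addresses, all-ones for kernel addresses */
		int64_t sign = (int64_t)addr >> 63;

		/* user: mask | sign == mask, so the tag bits get cleared;
		   kernel: mask | sign == ~0, so the address passes intact */
		return addr & (UNTAG_MASK | sign);
	}

	int main(void)
	{
		printf("%llx\n", (unsigned long long)untag(0x2a00007f12345678ULL));
		printf("%llx\n", (unsigned long long)untag(0xffff888012345678ULL));
		return 0;
	}

The first call strips the tag (prints 7f12345678); the second leaves the
kernel address untouched. No branches anywhere, which is the point.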
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3604074a878b..211869aa618d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2290,6 +2290,17 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
 
 	  If unsure, leave at the default value.
 
+config ADDRESS_MASKING
+	bool "Linear Address Masking support"
+	depends on X86_64
+	help
+	  Linear Address Masking (LAM) modifies the checking that is applied
+	  to 64-bit linear addresses, allowing software to use the
+	  untranslated address bits for metadata.
+
+	  The capability can be used for efficient address sanitizers (ASAN)
+	  implementation and for optimizations in JITs.
+
 config HOTPLUG_CPU
 	def_bool y
 	depends on SMP
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c44b56f7ffba..66be8acabe92 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -99,6 +99,12 @@
 # define DISABLE_TDX_GUEST	(1 << (X86_FEATURE_TDX_GUEST & 31))
 #endif
 
+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM		0
+#else
+# define DISABLE_LAM		(1 << (X86_FEATURE_LAM & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -115,7 +121,7 @@
 #define DISABLED_MASK10	0
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
 			 DISABLE_CALL_DEPTH_TRACKING)
-#define DISABLED_MASK12	0
+#define DISABLED_MASK12	(DISABLE_LAM)
 #define DISABLED_MASK13	0
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 90d20679e4d7..0da5c227f490 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -44,7 +44,9 @@ typedef struct {
 
 #ifdef CONFIG_X86_64
 	unsigned long flags;
+#endif
 
+#ifdef CONFIG_ADDRESS_MASKING
 	/* Active LAM mode:  X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
 	unsigned long lam_cr3_mask;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4bc95c35cbd3..6ffc42dfd59d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -91,7 +91,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 }
 #endif
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ADDRESS_MASKING
 static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
 {
 	return READ_ONCE(mm->context.lam_cr3_mask);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 662598dea937..75bfaa421030 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_X86_TLBFLUSH_H
 #define _ASM_X86_TLBFLUSH_H
 
-#include <linux/mm.h>
+#include <linux/mm_types.h>
 #include <linux/sched.h>
 
 #include <asm/processor.h>
@@ -12,6 +12,7 @@
 #include <asm/invpcid.h>
 #include <asm/pti.h>
 #include <asm/processor-flags.h>
+#include <asm/pgtable.h>
 
 void __flush_tlb_all(void);
 
@@ -53,6 +54,15 @@ static inline void cr4_clear_bits(unsigned long mask)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_ADDRESS_MASKING
+DECLARE_PER_CPU(u64, tlbstate_untag_mask);
+
+static inline u64 current_untag_mask(void)
+{
+	return this_cpu_read(tlbstate_untag_mask);
+}
+#endif
+
 #ifndef MODULE
 /*
  * 6 because 6 should be plenty and struct tlb_state will fit in two cache
@@ -101,7 +111,7 @@ struct tlb_state {
 	 */
 	bool invalidate_other;
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ADDRESS_MASKING
 	/*
	 * Active LAM mode.
	 *
@@ -367,27 +377,29 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
 }
 #define huge_pmd_needs_flush huge_pmd_needs_flush
 
-#ifdef CONFIG_X86_64
-static inline unsigned long tlbstate_lam_cr3_mask(void)
+#ifdef CONFIG_ADDRESS_MASKING
+static inline u64 tlbstate_lam_cr3_mask(void)
 {
-	unsigned long lam = this_cpu_read(cpu_tlbstate.lam);
+	u64 lam = this_cpu_read(cpu_tlbstate.lam);
 
 	return lam << X86_CR3_LAM_U57_BIT;
 }
 
-static inline void set_tlbstate_cr3_lam_mask(unsigned long mask)
+static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
 {
-	this_cpu_write(cpu_tlbstate.lam, mask >> X86_CR3_LAM_U57_BIT);
+	this_cpu_write(cpu_tlbstate.lam,
+		       mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
+	this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
 }
 
 #else
 
-static inline unsigned long tlbstate_lam_cr3_mask(void)
+static inline u64 tlbstate_lam_cr3_mask(void)
 {
 	return 0;
 }
 
-static inline void set_tlbstate_cr3_lam_mask(u64 mask)
+static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
 {
 }
 #endif
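To make the bookkeeping concrete, here is what a LAM_U57-enabled mm ends
up with, per my reading of the series (values spelled out for review,
not new code):

	/* set by prctl_enable_tagged_addr() for six tag bits: */
	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;	  /* bit 61, OR'ed into CR3 */
	mm->context.untag_mask   = ~GENMASK_ULL(62, 57); /* == 0x81ffffffffffffff */

	/* mirrored into per-CPU state by set_tlbstate_lam_mode(): */
	cpu_tlbstate.lam    = 1;	/* lam_cr3_mask >> X86_CR3_LAM_U57_BIT */
	tlbstate_untag_mask = 0x81ffffffffffffff;	/* read by UNTAG_ADDR */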
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1d931c7f6741..730649175191 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -13,6 +13,7 @@
 #include <asm/page.h>
 #include <asm/smap.h>
 #include <asm/extable.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline bool pagefault_disabled(void);
@@ -22,7 +23,7 @@ static inline bool pagefault_disabled(void);
 # define WARN_ON_IN_IRQ()
 #endif
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ADDRESS_MASKING
 DECLARE_STATIC_KEY_FALSE(tagged_addr_key);
 
 /*
@@ -31,31 +32,24 @@ DECLARE_STATIC_KEY_FALSE(tagged_addr_key);
  * Magic with the 'sign' allows to untag userspace pointer without any branches
  * while leaving kernel addresses intact.
  */
-#define __untagged_addr(mm, addr)	({				\
+#define __untagged_addr(untag_mask, addr)	({			\
 	u64 __addr = (__force u64)(addr);				\
 	if (static_branch_likely(&tagged_addr_key)) {			\
 		s64 sign = (s64)__addr >> 63;				\
-		u64 mask = READ_ONCE((mm)->context.untag_mask);		\
-		__addr &= mask | sign;					\
+		__addr &= untag_mask | sign;				\
 	}								\
 	(__force __typeof__(addr))__addr;				\
 })
 
-#define untagged_addr(addr) __untagged_addr(current->mm, addr)
+#define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr)
 
 #define untagged_addr_remote(mm, addr)	({				\
 	mmap_assert_locked(mm);						\
-	__untagged_addr(mm, addr);					\
+	__untagged_addr((mm)->context.untag_mask, addr);		\
 })
 
-#define untagged_ptr(ptr) ({						\
-	u64 __ptrval = (__force u64)(ptr);				\
-	__ptrval = untagged_addr(__ptrval);				\
-	(__force __typeof__(ptr))__ptrval;				\
-})
 #else
-#define untagged_addr(addr)	(addr)
-#define untagged_ptr(ptr)	(ptr)
+#define untagged_addr(addr)	(addr)
 #endif
 
 /**
@@ -167,7 +161,7 @@ extern int __get_user_bad(void);
 #define get_user(x,ptr)						\
 ({									\
 	might_fault();							\
-	do_get_user_call(get_user,x,untagged_ptr(ptr));		\
+	do_get_user_call(get_user,x,ptr);				\
 })
 
 /**
@@ -270,7 +264,7 @@ extern void __put_user_nocheck_8(void);
  */
 #define put_user(x, ptr) ({						\
 	might_fault();							\
-	do_put_user_call(put_user,x,untagged_ptr(ptr));		\
+	do_put_user_call(put_user,x,ptr);				\
 })
 
 /**
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index add85615d5ae..1f61e3a13b4f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -743,6 +743,7 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 }
 #endif
 
+#ifdef CONFIG_ADDRESS_MASKING
 DEFINE_STATIC_KEY_FALSE(tagged_addr_key);
 EXPORT_SYMBOL_GPL(tagged_addr_key);
 
@@ -775,7 +776,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
 	}
 
 	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
-	set_tlbstate_cr3_lam_mask(mm->context.lam_cr3_mask);
+	set_tlbstate_lam_mode(mm);
 	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
 
 	mmap_write_unlock(mm);
@@ -783,6 +784,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
 	static_branch_enable(&tagged_addr_key);
 	return 0;
 }
+#endif
 
 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 {
@@ -871,6 +873,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 	case ARCH_MAP_VDSO_64:
 		return prctl_map_vdso(&vdso_image_64, arg2);
 #endif
+#ifdef CONFIG_ADDRESS_MASKING
 	case ARCH_GET_UNTAG_MASK:
 		return put_user(task->mm->context.untag_mask,
 				(unsigned long __user *)arg2);
@@ -884,6 +887,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 		return put_user(0, (unsigned long __user *)arg2);
 	else
 		return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
+#endif
 	default:
 		ret = -EINVAL;
 		break;
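The selftest extension I have in mind looks roughly like this (a sketch;
the arch_prctl numbers are the ones from this series' uapi header, and
write() is just one convenient path into the user-copy code -- the real
test should also hit the get_user()/put_user() paths directly):

	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define ARCH_ENABLE_TAGGED_ADDR	0x4002
	#define LAM_U57_BITS		6

	int main(void)
	{
		char buf[] = "hello";
		int fd = open("/dev/null", O_WRONLY);

		if (fd < 0 || syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR,
				      LAM_U57_BITS))
			return 1;

		/* put a tag into bits 62:57; the kernel has to untag it */
		char *tagged = (char *)((uintptr_t)buf | (0x2aULL << 57));

		return write(fd, tagged, sizeof(buf)) == sizeof(buf) ? 0 : 1;
	}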
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index b70d98d79a9d..22e92236e8f6 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -35,6 +35,13 @@
 #include <asm/smap.h>
 #include <asm/export.h>
 
+#ifdef CONFIG_ADDRESS_MASKING
+#define UNTAG_ADDR \
+	ALTERNATIVE "", __stringify(and PER_CPU_VAR(tlbstate_untag_mask), %rax), X86_FEATURE_LAM
+#else
+#define UNTAG_ADDR
+#endif
+
 #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
 
 #ifdef CONFIG_X86_5LEVEL
@@ -48,6 +55,7 @@
 
 .text
 SYM_FUNC_START(__get_user_1)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(0)
 	cmp %_ASM_DX,%_ASM_AX
 	jae bad_get_user
@@ -62,6 +70,7 @@ SYM_FUNC_END(__get_user_1)
 EXPORT_SYMBOL(__get_user_1)
 
 SYM_FUNC_START(__get_user_2)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(1)
 	cmp %_ASM_DX,%_ASM_AX
 	jae bad_get_user
@@ -76,6 +85,7 @@ SYM_FUNC_END(__get_user_2)
 EXPORT_SYMBOL(__get_user_2)
 
 SYM_FUNC_START(__get_user_4)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(3)
 	cmp %_ASM_DX,%_ASM_AX
 	jae bad_get_user
@@ -91,6 +101,7 @@ EXPORT_SYMBOL(__get_user_4)
 
 SYM_FUNC_START(__get_user_8)
 #ifdef CONFIG_X86_64
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(7)
 	cmp %_ASM_DX,%_ASM_AX
 	jae bad_get_user
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 32125224fcca..9e0276c553a8 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -33,6 +33,13 @@
  * as they get called from within inline assembly.
  */
 
+#ifdef CONFIG_ADDRESS_MASKING
+#define UNTAG_ADDR \
+	ALTERNATIVE "", __stringify(and PER_CPU_VAR(tlbstate_untag_mask), %rcx), X86_FEATURE_LAM
+#else
+#define UNTAG_ADDR
+#endif
+
 #ifdef CONFIG_X86_5LEVEL
 #define LOAD_TASK_SIZE_MINUS_N(n) \
 	ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \
@@ -44,6 +51,7 @@
 
 .text
 SYM_FUNC_START(__put_user_1)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(0)
 	cmp %_ASM_BX,%_ASM_CX
 	jae .Lbad_put_user
@@ -66,6 +74,7 @@ SYM_FUNC_END(__put_user_nocheck_1)
 EXPORT_SYMBOL(__put_user_nocheck_1)
 
 SYM_FUNC_START(__put_user_2)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(1)
 	cmp %_ASM_BX,%_ASM_CX
 	jae .Lbad_put_user
@@ -88,6 +97,7 @@ SYM_FUNC_END(__put_user_nocheck_2)
 EXPORT_SYMBOL(__put_user_nocheck_2)
 
 SYM_FUNC_START(__put_user_4)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(3)
 	cmp %_ASM_BX,%_ASM_CX
 	jae .Lbad_put_user
@@ -110,6 +120,7 @@ SYM_FUNC_END(__put_user_nocheck_4)
 EXPORT_SYMBOL(__put_user_nocheck_4)
 
 SYM_FUNC_START(__put_user_8)
+	UNTAG_ADDR
 	LOAD_TASK_SIZE_MINUS_N(7)
 	cmp %_ASM_BX,%_ASM_CX
 	jae .Lbad_put_user
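To spell out what this does at runtime (my paraphrase, not part of the
fixup): the ALTERNATIVE leaves NOPs on !LAM hardware and gets patched to
a single AND of the per-CPU mask otherwise, so the __get_user_N() fast
path is roughly:

	/*
	 * Rough C equivalent of the __get_user_1() fast path, for
	 * illustration only. No sign trick is needed here: a kernel
	 * address keeps bit 63 after the AND, stays far above the task
	 * size limit, and is rejected by the existing range check.
	 */
	static inline int get_user_addr_ok(unsigned long addr,
					   unsigned long task_size_minus_n,
					   unsigned long untag_mask,
					   int have_lam)
	{
		if (have_lam)			/* patched by ALTERNATIVE */
			addr &= untag_mask;	/* andq %gs:tlbstate_untag_mask, %rax */

		return addr < task_size_minus_n;	/* cmp; jae bad_get_user */
	}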
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d3987359d441..be5c7d1c0265 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1044,6 +1044,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
 
+#ifdef CONFIG_ADDRESS_MASKING
+DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+#endif
+
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
 	/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9d1e7a5f141c..8c330a6d0ece 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -635,7 +635,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		barrier();
 	}
 
-	set_tlbstate_cr3_lam_mask(new_lam);
+	set_tlbstate_lam_mode(next);
 
 	if (need_flush) {
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -726,7 +726,7 @@ void initialize_tlbstate_and_flush(void)
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
 	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
-	set_tlbstate_cr3_lam_mask(0);
+	set_tlbstate_lam_mode(mm);
 
 	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
 		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);

-- 
 Kiryl Shutsemau / Kirill A. Shutemov