In switch_mm_irqs_off(), we read 'mm->context.lam_cr3_mask' into
'new_lam', which is later passed to load_new_mm_cr3(). However, there is
a call to set_tlbstate_lam_mode() in between which reads
'mm->context.lam_cr3_mask' again and sets 'cpu_tlbstate.lam'
accordingly. If we race with another thread updating
'mm->context.lam_cr3_mask', the value in 'cpu_tlbstate.lam' could end up
being different from CR3.

Fix the problem by updating set_tlbstate_lam_mode() to return the LAM
mask that it wrote to 'cpu_tlbstate.lam', and use that mask in
switch_mm_irqs_off() when writing CR3. Use READ_ONCE() to make sure we
read the mask once and use it consistently.

No practical problems have been observed from this, but it is a recipe
for future problems (e.g. the debug warnings in switch_mm_irqs_off() or
__get_current_cr3_fast() could fire). It is unlikely that this can cause
any real issues, since only a single-threaded process can update its own
LAM mask. The race can therefore only happen when context switching
between kthreads using a borrowed MM, in which case LAM is unlikely to
matter. If it did matter, we would need to make sure that all CPUs using
the mm are updated before returning to userspace when LAM is enabled --
but we don't do that.

While we are at it, remove the misleading comment stating that 'new_lam'
may not match tlbstate_lam_cr3_mask() if a race occurs. This can happen
without a race: a different thread could simply have enabled LAM since
the last context switch on the current CPU. Replace it with a hopefully
clearer comment above set_tlbstate_lam_mode().

Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
---
 arch/x86/include/asm/tlbflush.h | 11 +++++++----
 arch/x86/mm/tlb.c               | 17 ++++++++---------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4d..a4ddb20f84fe7 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -399,11 +399,13 @@ static inline u64 tlbstate_lam_cr3_mask(void)
 	return lam << X86_CR3_LAM_U57_BIT;
 }
 
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline unsigned long set_tlbstate_lam_mode(struct mm_struct *mm)
 {
-	this_cpu_write(cpu_tlbstate.lam,
-		       mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
+	unsigned long lam = READ_ONCE(mm->context.lam_cr3_mask);
+
+	this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
 	this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+	return lam;
 }
 
 #else
@@ -413,8 +415,9 @@ static inline u64 tlbstate_lam_cr3_mask(void)
 	return 0;
 }
 
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline unsigned long set_tlbstate_lam_mode(struct mm_struct *mm)
 {
+	return 0;
 }
 #endif
 #endif /* !MODULE */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 51f9f56941058..2975d3f89a5de 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -503,9 +503,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 {
 	struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	unsigned long new_lam = mm_lam_cr3_mask(next);
 	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
 	unsigned cpu = smp_processor_id();
+	unsigned long new_lam;
 	u64 next_tlb_gen;
 	bool need_flush;
 	u16 new_asid;
@@ -561,11 +561,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			   next->context.ctx_id);
 
-		/*
-		 * If this races with another thread that enables lam, 'new_lam'
-		 * might not match tlbstate_lam_cr3_mask().
-		 */
-
 		/*
 		 * Even in lazy TLB mode, the CPU should stay set in the
 		 * mm_cpumask. The TLB shootdown code can figure out from
@@ -633,7 +628,12 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 		barrier();
 	}
 
-	set_tlbstate_lam_mode(next);
+	/*
+	 * Even if we are not actually switching mm's, another thread could have
+	 * updated mm->context.lam_cr3_mask. Make sure tlbstate_lam_cr3_mask()
+	 * and the loaded CR3 use the up-to-date mask.
+	 */
+	new_lam = set_tlbstate_lam_mode(next);
 	if (need_flush) {
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -705,7 +705,6 @@ void initialize_tlbstate_and_flush(void)
 
 	/* LAM expected to be disabled */
 	WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
-	WARN_ON(mm_lam_cr3_mask(mm));
 
 	/*
 	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
@@ -724,7 +723,7 @@ void initialize_tlbstate_and_flush(void)
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
 	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
-	set_tlbstate_lam_mode(mm);
+	WARN_ON(set_tlbstate_lam_mode(mm));
 
 	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
 		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
-- 
2.44.0.278.ge034bb2e1d-goog
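As an aside for readers outside the kernel tree, the bug class fixed
here is easy to see in isolation: loading a shared word twice can
observe two different values, so two pieces of state derived from those
loads can silently disagree. The sketch below is plain C11 userspace
code, not part of the patch: 'lam_cr3_mask' stands in for
mm->context.lam_cr3_mask, 'tlbstate_lam' for cpu_tlbstate.lam, 'cr3'
for the loaded CR3 value, C11 atomics stand in for READ_ONCE() and the
per-CPU accessors, and all names (set_lam_once, switch_buggy, ...) are
illustrative.

/* lam_race.c - illustrative sketch only; build with: gcc -O2 -pthread lam_race.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long lam_cr3_mask;	/* models mm->context.lam_cr3_mask */
static unsigned long tlbstate_lam;		/* models cpu_tlbstate.lam */
static unsigned long cr3;			/* models the CR3 value we load */

/* Models a thread enabling LAM; X86_CR3_LAM_U57 is CR3 bit 62. */
static void *writer(void *arg)
{
	(void)arg;
	atomic_store(&lam_cr3_mask, 1UL << 62);	/* assumes 64-bit unsigned long */
	return NULL;
}

/* Buggy pattern: two independent loads of the shared mask. */
static void switch_buggy(void)
{
	unsigned long new_lam = atomic_load(&lam_cr3_mask);	/* load #1 */

	tlbstate_lam = atomic_load(&lam_cr3_mask);		/* load #2 */
	cr3 = new_lam;	/* may disagree with tlbstate_lam if writer ran in between */
}

/* Fixed pattern: one load, and the cached value is returned to the caller. */
static unsigned long set_lam_once(void)
{
	unsigned long lam = atomic_load(&lam_cr3_mask);	/* single READ_ONCE-style load */

	tlbstate_lam = lam;
	return lam;
}

static void switch_fixed(void)
{
	cr3 = set_lam_once();	/* cr3 and tlbstate_lam always agree */
}

int main(void)
{
	pthread_t t;

	if (pthread_create(&t, NULL, writer, NULL))
		return 1;
	switch_buggy();		/* window where tlbstate_lam != cr3 is possible */
	switch_fixed();		/* divergence is now impossible by construction */
	pthread_join(&t, NULL);
	printf("tlbstate_lam=%#lx cr3=%#lx\n", tlbstate_lam, cr3);
	return 0;
}

The design point mirrored from the patch: by making the function that
caches the value also return it, agreement between the two destinations
is enforced by construction rather than by a comment asking callers not
to re-read the mask.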