On Tue, Jan 21, 2025 at 10:55:07AM +0100, Peter Zijlstra wrote: > On Sun, Jan 19, 2025 at 09:40:17PM -0500, Rik van Riel wrote: > > +/* > > + * Figure out whether to assign a global ASID to a process. > > + * We vary the threshold by how empty or full global ASID space is. > > + * 1/4 full: >= 4 active threads > > + * 1/2 full: >= 8 active threads > > + * 3/4 full: >= 16 active threads > > + * 7/8 full: >= 32 active threads > > + * etc > > + * > > + * This way we should never exhaust the global ASID space, even on very > > + * large systems, and the processes with the largest number of active > > + * threads should be able to use broadcast TLB invalidation. > > + */ > > +#define HALFFULL_THRESHOLD 8 > > +static bool meets_global_asid_threshold(struct mm_struct *mm) > > +{ > > + int avail = global_asid_available; > > + int threshold = HALFFULL_THRESHOLD; > > + > > + if (!avail) > > + return false; > > + > > + if (avail > MAX_ASID_AVAILABLE * 3 / 4) { > > + threshold = HALFFULL_THRESHOLD / 4; > > + } else if (avail > MAX_ASID_AVAILABLE / 2) { > > + threshold = HALFFULL_THRESHOLD / 2; > > + } else if (avail < MAX_ASID_AVAILABLE / 3) { > > + do { > > + avail *= 2; > > + threshold *= 2; > > + } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2); > > + } > > + > > + return mm_active_cpus_exceeds(mm, threshold); > > +} > > I'm still very much disliking this. Why do we need this? Yes, running > out of ASID space is a pain, but this increasing threshold also makes > things behave weird. > > Suppose our most used process starts slow, and ends up not getting an > ASID because too much irrelevant crap gets started before it spawns > enough threads and then no longer qualifies. > > Can't we just start with a very simple constant test and poke at things > if/when it's found to not work? Something like so perhaps? 
--- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -268,7 +268,7 @@ static inline u16 mm_global_asid(struct if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) return 0; - asid = READ_ONCE(mm->context.global_asid); + asid = smp_load_acquire(&mm->context.global_asid); /* mm->context.global_asid is either 0, or a global ASID */ VM_WARN_ON_ONCE(is_dyn_asid(asid)); --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -308,13 +308,18 @@ static void reset_global_asid_space(void static u16 get_global_asid(void) { lockdep_assert_held(&global_asid_lock); + bool done_reset = false; do { u16 start = last_global_asid; u16 asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, start); - if (asid >= MAX_ASID_AVAILABLE) { + if (asid > MAX_ASID_AVAILABLE) { + if (done_reset) + return asid; + reset_global_asid_space(); + done_reset = true; continue; } @@ -392,6 +398,12 @@ static bool mm_active_cpus_exceeds(struc */ static void use_global_asid(struct mm_struct *mm) { + u16 asid; + + /* This process is already using broadcast TLB invalidation. */ + if (mm->context.global_asid) + return; + guard(raw_spinlock_irqsave)(&global_asid_lock); /* This process is already using broadcast TLB invalidation. */ @@ -402,58 +414,25 @@ static void use_global_asid(struct mm_st if (!global_asid_available) return; + asid = get_global_asid(); + if (asid > MAX_ASID_AVAILABLE) + return; + /* - * The transition from IPI TLB flushing, with a dynamic ASID, - * and broadcast TLB flushing, using a global ASID, uses memory - * ordering for synchronization. - * - * While the process has threads still using a dynamic ASID, - * TLB invalidation IPIs continue to get sent. - * - * This code sets asid_transition first, before assigning the - * global ASID. - * - * The TLB flush code will only verify the ASID transition - * after it has seen the new global ASID for the process. 
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> + * finish_asid_transition() needs to observe asid_transition == true + * once it observes global_asid. */ - WRITE_ONCE(mm->context.asid_transition, true); - WRITE_ONCE(mm->context.global_asid, get_global_asid()); + mm->context.asid_transition = true; + smp_store_release(&mm->context.global_asid, asid); } -/* - * Figure out whether to assign a global ASID to a process. - * We vary the threshold by how empty or full global ASID space is. - * 1/4 full: >= 4 active threads - * 1/2 full: >= 8 active threads - * 3/4 full: >= 16 active threads - * 7/8 full: >= 32 active threads - * etc - * - * This way we should never exhaust the global ASID space, even on very - * large systems, and the processes with the largest number of active - * threads should be able to use broadcast TLB invalidation. - */ -#define HALFFULL_THRESHOLD 8 static bool meets_global_asid_threshold(struct mm_struct *mm) { - int avail = global_asid_available; - int threshold = HALFFULL_THRESHOLD; - - if (!avail) + if (!global_asid_available) return false; - if (avail > MAX_ASID_AVAILABLE * 3 / 4) { - threshold = HALFFULL_THRESHOLD / 4; - } else if (avail > MAX_ASID_AVAILABLE / 2) { - threshold = HALFFULL_THRESHOLD / 2; - } else if (avail < MAX_ASID_AVAILABLE / 3) { - do { - avail *= 2; - threshold *= 2; - } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2); - } - - return mm_active_cpus_exceeds(mm, threshold); + return mm_active_cpus_exceeds(mm, 4); } static void consider_global_asid(struct mm_struct *mm)