Re: [PATCH v5 10/12] x86,tlb: do targeted broadcast flushing from tlbbatch code

Nadav Amit <nadav.amit@xxxxxxxxx> · Mon, 20 Jan 2025 11:56:29 +0200

On 16/01/2025 4:30, Rik van Riel wrote:
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.


[snip]

--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1659,9 +1659,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
  	 * a local TLB flush is needed. Optimize this use-case by calling
  	 * flush_tlb_func_local() directly in this case.
  	 */
-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
-		invlpgb_flush_all_nonglobals();
-	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
  		flush_tlb_multi(&batch->cpumask, info);
  	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
  		lockdep_assert_irqs_enabled();
@@ -1670,12 +1668,62 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
  		local_irq_enable();
  	}
  
+	/*
+	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+	 * The cpumask above contains only CPUs that were running tasks
+	 * not using broadcast TLB flushing.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+		tlbsync();
+		migrate_enable();

Maybe someone mentioned it before, but I would emphasize that I do not 
think that preventing migration for a potentially long time is a good idea.

One alternative solution would be to set a bit in cpu_tlbstate that, 
when set, causes a tlbsync to be issued on context switch.

(I can think of other solutions, but I think the one I just mentioned 
is the cleanest one).

+		batch->used_invlpgb = false;
+	}
+
  	cpumask_clear(&batch->cpumask);
  
  	put_flush_tlb_info();
  	put_cpu();
  }
  
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
+{
+	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
+		u16 asid = mm_global_asid(mm);
+		/*
+		 * Queue up an asynchronous invalidation. The corresponding
+		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+		 * on the same CPU.
+		 */
+		if (!batch->used_invlpgb) {
+			batch->used_invlpgb = true;
+			migrate_disable();

See my comment above...

+		}
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+		/*
+		 * Some CPUs might still be using a local ASID for this
+		 * process, and require IPIs, while others are using the
+		 * global ASID.
+		 *
+		 * In this corner case we need to do both the broadcast
+		 * TLB invalidation, and send IPIs. The IPIs will help
+		 * stragglers transition to the broadcast ASID.
+		 */
+		if (READ_ONCE(mm->context.asid_transition))
+			goto also_send_ipi;
+	} else {
+also_send_ipi:

I really think you should avoid such gotos. A simple bool variable such as 
"need_ipi" would suffice.

+		inc_mm_tlb_gen(mm);
+		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	}
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
  /*
   * Blindly accessing user memory from NMI context can be dangerous
   * if we're in the middle of switching the current user task or