Excerpts from Nicholas Piggin's message of January 10, 2022 2:56 pm:
> Excerpts from Linus Torvalds's message of January 10, 2022 7:51 am:
>> [ Ugh, I actually went back and looked at Nick's patches again, to
>> just verify my memory, and they weren't as pretty as I thought they
>> were ]
>>
>> On Sun, Jan 9, 2022 at 12:48 PM Linus Torvalds
>> <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>>>
>>> I'd much rather have a *much* smaller patch that says "on x86 and
>>> powerpc, we don't need this overhead at all".
>>
>> For some reason I thought Nick's patch worked at "last mmput" time and
>> the TLB flush IPIs that happen at that point anyway would then make
>> sure any lazy TLB is cleaned up.
>>
>> But that's not actually what it does. It ties the
>> MMU_LAZY_TLB_REFCOUNT to an explicit TLB shootdown triggered by the
>> last mmdrop() instead. Because it really tied the whole logic to the
>> mm_count logic (and made lazy tlb to not do mm_count) rather than the
>> mm_users thing I mis-remembered it doing.
>
> It does this because on powerpc with the hash MMU, we can't use IPIs
> for TLB shootdowns.
>
>> So at least some of my arguments were based on me just mis-remembering
>> what Nick's patch actually did (mainly because I mentally recreated
>> the patch from "Nick did something like this" and what I thought would
>> be the way to do it on x86).
>
> On powerpc with the radix MMU, which uses IPI based shootdowns, we can
> actually do the switch-away-from-lazy on the final TLB flush, and the
> final broadcast shootdown then becomes a no-op. I didn't post that
> additional patch because it's powerpc-specific and I didn't want to
> post more code so widely.

This is the patch that goes on top of the series I posted. It's not
very clean at the moment; it was just a proof of concept, and by the
looks of it I wrote it a year ago, so no guarantees. But it exits all
other lazies as part of the final exit_mmap TLB flush, so there should
not be additional IPIs at drop-time. You could possibly get preempted
and move CPUs in between, but the point is that the vast majority of
the time you won't require more IPIs.

With powerpc it's not _quite_ that simple: we could possibly use the
broadcast TLBIE instruction rather than IPIs for this, but in practice
I don't think that's enough faster for the IPIs to be a problem, and on
highly threaded jobs where you might have hundreds of other CPUs in the
mask, you'd rather avoid the cacheline bouncing in context switch
anyway.

Thanks,
Nick
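To spell out what "exits all other lazies as part of the final
exit_mmap TLB flush" amounts to, here is a simplified sketch with
made-up names (do_exit_lazy_tlb, exit_lazies_on_final_flush) -- it is
not code from the series; on radix the real work is the
exit_flush_lazy_tlbs() call visible in the patch below. The IPIs we
already send for the final flush also switch every lazy user of the mm
over to init_mm, so by the time the last mmdrop() runs there are no
lazy references left to shoot down:

/*
 * Illustrative sketch only -- not taken from the posted series.
 * do_exit_lazy_tlb() and exit_lazies_on_final_flush() are invented
 * names for this example.
 */
#include <linux/bug.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
#include <linux/smp.h>

/* Runs on each CPU that may still be using @mm lazily. */
static void do_exit_lazy_tlb(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm != mm)
		return;

	/*
	 * A kernel thread is borrowing this mm lazily; point it at
	 * init_mm instead. With shootdown-based lazy tlb handling the
	 * lazy reference is not refcounted, so there is no
	 * mmgrab()/mmdrop() pair to fix up here -- that is the point
	 * of the scheme.
	 */
	WARN_ON_ONCE(current->mm);
	current->active_mm = &init_mm;
	switch_mm(mm, &init_mm, current);
}

/* Called as part of the final exit_mmap() flush, before the last mmdrop(). */
static void exit_lazies_on_final_flush(struct mm_struct *mm)
{
	/*
	 * The same round of IPIs that invalidates the TLBs kicks every
	 * lazy user off the mm, so the eventual mmdrop() needs no
	 * additional shootdown.
	 */
	on_each_cpu_mask(mm_cpumask(mm), do_exit_lazy_tlb, mm, 1);
}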
>From 1f7fd5de284fab6b94bf49f55ce691ae22538473 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@xxxxxxxxx>
Date: Tue, 23 Feb 2021 14:11:21 +1000
Subject: [PATCH] powerpc/64s/radix: TLB flush optimization for lazy mm
 shootdown refcounting

XXX: could also clear lazy at exit, perhaps?
XXX: doesn't really matter AFAIKS because it will soon go away with mmput
XXX: must audit exit flushes for nest MMU
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 45 ++++++++++++++--------------
 kernel/fork.c                        | 16 ++++++++--
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 59156c2d2ebe..b64cd0d22b8b 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -723,7 +723,7 @@ void radix__local_flush_all_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(radix__local_flush_all_mm);
 
-static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+static void radix__flush_all_mm(struct mm_struct *mm)
 {
 	radix__local_flush_all_mm(mm);
 }
@@ -862,7 +862,7 @@ enum tlb_flush_type {
 	FLUSH_TYPE_GLOBAL,
 };
 
-static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm)
+static enum tlb_flush_type flush_type_needed(struct mm_struct *mm)
 {
 	int active_cpus = atomic_read(&mm->context.active_cpus);
 	int cpu = smp_processor_id();
@@ -888,14 +888,6 @@ static enum tlb_flush_type flush_type_needed(struct mm_struct *mm, bool fullmm)
 	if (atomic_read(&mm->context.copros) > 0)
 		return FLUSH_TYPE_GLOBAL;
 
-	/*
-	 * In the fullmm case there's no point doing the exit_flush_lazy_tlbs
-	 * because the mm is being taken down anyway, and a TLBIE tends to
-	 * be faster than an IPI+TLBIEL.
-	 */
-	if (fullmm)
-		return FLUSH_TYPE_GLOBAL;
-
 	/*
 	 * If we are running the only thread of a single-threaded process,
 	 * then we should almost always be able to trim off the rest of the
@@ -947,7 +939,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 	 * switch_mm_irqs_off
 	 */
 	smp_mb();
-	type = flush_type_needed(mm, false);
+	type = flush_type_needed(mm);
 	if (type == FLUSH_TYPE_LOCAL) {
 		_tlbiel_pid(pid, RIC_FLUSH_TLB);
 	} else if (type == FLUSH_TYPE_GLOBAL) {
@@ -971,7 +963,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
 
-static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+void radix__flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 	enum tlb_flush_type type;
@@ -982,7 +974,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	type = flush_type_needed(mm, fullmm);
+	type = flush_type_needed(mm);
 	if (type == FLUSH_TYPE_LOCAL) {
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 	} else if (type == FLUSH_TYPE_GLOBAL) {
@@ -1002,11 +994,6 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 	}
 	preempt_enable();
 }
-
-void radix__flush_all_mm(struct mm_struct *mm)
-{
-	__flush_all_mm(mm, false);
-}
 EXPORT_SYMBOL(radix__flush_all_mm);
 
 void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
@@ -1021,7 +1008,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	type = flush_type_needed(mm, false);
+	type = flush_type_needed(mm);
 	if (type == FLUSH_TYPE_LOCAL) {
 		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	} else if (type == FLUSH_TYPE_GLOBAL) {
@@ -1127,7 +1114,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	type = flush_type_needed(mm, fullmm);
+	type = flush_type_needed(mm);
 	if (type == FLUSH_TYPE_NONE)
 		goto out;
 
@@ -1295,7 +1282,18 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	 * See the comment for radix in arch_exit_mmap().
 	 */
 	if (tlb->fullmm || tlb->need_flush_all) {
-		__flush_all_mm(mm, true);
+		/*
+		 * Shootdown based lazy tlb mm refcounting means we have to
+		 * IPI everyone in the mm_cpumask anyway soon when the mm goes
+		 * away, so might as well do it as part of the final flush now.
+		 *
+		 * If lazy shootdown was improved to reduce IPIs (e.g., by
+		 * batching), then it may end up being better to use tlbies
+		 * here instead.
+		 */
+		smp_mb(); /* see radix__flush_tlb_mm */
+		exit_flush_lazy_tlbs(mm);
+		_tlbiel_pid(mm->context.id, RIC_FLUSH_ALL);
 	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
 		if (!tlb->freed_tables)
 			radix__flush_tlb_mm(mm);
@@ -1326,10 +1324,11 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 		return;
 
 	fullmm = (end == TLB_FLUSH_ALL);
+	WARN_ON_ONCE(fullmm); /* XXX: this shouldn't get fullmm? */
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	type = flush_type_needed(mm, fullmm);
+	type = flush_type_needed(mm);
 	if (type == FLUSH_TYPE_NONE)
 		goto out;
 
@@ -1412,7 +1411,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	type = flush_type_needed(mm, false);
+	type = flush_type_needed(mm);
 	if (type == FLUSH_TYPE_LOCAL) {
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else if (type == FLUSH_TYPE_GLOBAL) {