From: Kairui Song <kasong@xxxxxxxxxxx>

If the arch-specific code can provide helpers to bind the RSS cache to
mm_cpumask, then the syncing code can rely on that instead of doing a
full synchronization across all possible CPUs. This speeds up reading
the counters and mm_exit by a lot.

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
 arch/Kconfig        |  3 ++
 kernel/sched/core.c |  3 +-
 mm/memory.c         | 94 ++++++++++++++++++++++++++++-----------------
 3 files changed, 64 insertions(+), 36 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 71b9272acb28..8df45b6346ae 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1403,6 +1403,9 @@ config ARCH_HAS_ELFCORE_COMPAT
 config ARCH_HAS_PARANOID_L1D_FLUSH
 	bool
 
+config ARCH_PCP_RSS_USE_CPUMASK
+	bool
+
 config DYNAMIC_SIGFRAME
 	bool
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11df67bb52ee..6f7991caf24b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5143,7 +5143,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	prepare_lock_switch(rq, next, rf);
 
 	/* Cache new active_mm */
-	switch_pcp_rss_cache_no_irq(next->active_mm);
+	if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK))
+		switch_pcp_rss_cache_no_irq(next->active_mm);
 
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
diff --git a/mm/memory.c b/mm/memory.c
index 09d7d193da51..a819009aa3e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -188,9 +188,16 @@ unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
 	int cpu;
 	long ret, update, sync_count;
+	const struct cpumask *mm_mask;
 
 	ret = atomic_long_read(&mm->rss_stat.count[member]);
-	for_each_possible_cpu(cpu) {
+
+	if (IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK))
+		mm_mask = mm_cpumask(mm);
+	else
+		mm_mask = cpu_possible_mask;
+
+	for_each_cpu(cpu, mm_mask) {
 		if (READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) != mm)
 			continue;
 		sync_count = READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu));
@@ -217,12 +224,18 @@ unsigned long get_mm_rss(struct mm_struct *mm)
 {
 	int cpu;
 	long ret, update, sync_count;
+	const struct cpumask *mm_mask;
 
 	ret = atomic_long_read(&mm->rss_stat.count[MM_FILEPAGES]) +
 		atomic_long_read(&mm->rss_stat.count[MM_ANONPAGES]) +
 		atomic_long_read(&mm->rss_stat.count[MM_SHMEMPAGES]);
 
-	for_each_possible_cpu(cpu) {
+	if (IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK))
+		mm_mask = mm_cpumask(mm);
+	else
+		mm_mask = cpu_possible_mask;
+
+	for_each_cpu(cpu, mm_mask) {
 		if (READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) != mm)
 			continue;
 		sync_count = READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu));
@@ -266,10 +279,13 @@ void switch_pcp_rss_cache_no_irq(struct mm_struct *next_mm)
 	if (cpu_mm == NULL)
 		goto commit_done;
 
-	/* Race with check_discard_rss_cache */
-	if (cpu_mm != cmpxchg(this_cpu_ptr(&cpu_rss_cache.mm), cpu_mm,
-			      __pcp_rss_mm_mark(cpu_mm)))
-		goto commit_done;
+	/* Arch code will take care of cache invalidation */
+	if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK)) {
+		/* Race with check_discard_rss_cache */
+		if (cpu_mm != cmpxchg(this_cpu_ptr(&cpu_rss_cache.mm), cpu_mm,
+				      __pcp_rss_mm_mark(cpu_mm)))
+			goto commit_done;
+	}
 
 	for (int i = 0; i < NR_MM_COUNTERS; i++) {
 		count = this_cpu_read(cpu_rss_cache.count[i]);
@@ -328,46 +344,54 @@ static void check_discard_rss_cache(struct mm_struct *mm)
 	long cached_count[NR_MM_COUNTERS] = { 0 };
 	struct mm_struct *cpu_mm;
 
-	/* Invalidate the RSS cache on every CPU */
-	for_each_possible_cpu(cpu) {
-		cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
-		if (__pcp_rss_mm_unmark(cpu_mm) != mm)
-			continue;
-
-		/*
-		 * If not being flusehd, try read-in the counter and mark it NULL,
-		 * once cache's mm is set NULL, counter are considered invalided
-		 */
-		if (cpu_mm != __pcp_rss_mm_mark(cpu_mm)) {
-			long count[NR_MM_COUNTERS];
-
-			for (int i = 0; i < NR_MM_COUNTERS; i++)
-				count[i] = READ_ONCE(per_cpu(cpu_rss_cache.count[i], cpu));
+	/* Arch code will take care of cache invalidation */
+	if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK)) {
+		/* Invalidate the RSS cache on every CPU */
+		for_each_possible_cpu(cpu) {
+			cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
+			if (__pcp_rss_mm_unmark(cpu_mm) != mm)
+				continue;
 
 			/*
-			 * If successfully set to NULL, the owner CPU is not flushing it, counters
-			 * are uncommiteed and untouched during this period, since a dying mm won't
-			 * be accouted anymore
+			 * If not being flushed, try to read in the counters and mark it
+			 * NULL; once the cache's mm is set NULL, the counters are invalidated.
 			 */
-			cpu_mm = cmpxchg(&per_cpu(cpu_rss_cache.mm, cpu), mm, NULL);
-			if (cpu_mm == mm) {
+			if (cpu_mm != __pcp_rss_mm_mark(cpu_mm)) {
+				long count[NR_MM_COUNTERS];
+
 				for (int i = 0; i < NR_MM_COUNTERS; i++)
-					cached_count[i] += count[i];
-				continue;
+					count[i] = READ_ONCE(per_cpu(cpu_rss_cache.count[i], cpu));
+
+				/*
+				 * If successfully set to NULL, the owner CPU is not flushing it;
+				 * the counters are uncommitted and untouched during this period,
+				 * since a dying mm won't be accounted anymore.
+				 */
+				cpu_mm = cmpxchg(&per_cpu(cpu_rss_cache.mm, cpu), mm, NULL);
+				if (cpu_mm == mm) {
+					for (int i = 0; i < NR_MM_COUNTERS; i++)
+						cached_count[i] += count[i];
+					continue;
+				}
 			}
-		}
 
-		/* It's being flushed, just busy wait as the critial section is really short */
-		do {
-			cpu_relax();
-			cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
-		} while (cpu_mm == __pcp_rss_mm_mark(mm));
+			/*
+			 * It's being flushed; just busy wait, as the critical section
+			 * is really short.
+			 */
+			do {
+				cpu_relax();
+				cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
+			} while (cpu_mm == __pcp_rss_mm_mark(mm));
+		}
 	}
 
 	for (int i = 0; i < NR_MM_COUNTERS; i++) {
 		long val = atomic_long_read(&mm->rss_stat.count[i]);
 
-		val += cached_count[i];
+		if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK)) {
+			val += cached_count[i];
+		}
 
 		if (unlikely(val)) {
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
-- 
2.35.2
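
Note: to illustrate the contract behind ARCH_PCP_RSS_USE_CPUMASK, here is a
rough sketch of what the arch side might look like. The hook name and call
site below are hypothetical and not part of this patch: an arch selecting
this option must keep a CPU set in mm_cpumask(mm) for as long as that CPU
may hold uncommitted RSS deltas for the mm in cpu_rss_cache, and must commit
the cache before the CPU can drop out of the mask, so that the
for_each_cpu() readers above never miss a live cache.

	/*
	 * Hypothetical arch hook (sketch only): called from the arch's
	 * switch_mm() path, where mm_cpumask is already maintained.
	 */
	static inline void arch_switch_rss_cache(struct mm_struct *next)
	{
		/* Commit any cached deltas of the outgoing mm first */
		switch_pcp_rss_cache_no_irq(next);

		/*
		 * Once this CPU is set in mm_cpumask(next), readers walking
		 * the mask will see its cpu_rss_cache again.
		 */
		cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
	}

With that invariant, clearing a CPU from the mask doubles as cache
invalidation, which is why the cmpxchg marking in
switch_pcp_rss_cache_no_irq() and the invalidation loop in
check_discard_rss_cache() can be compiled out above.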