+ memory-tiering-rate-limit-numa-migration-throughput.patch added to mm-unstable branch

Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> · Sun, 14 Aug 2022 17:39:42 -0700

The patch titled
     Subject: memory tiering: rate limit NUMA migration throughput
has been added to the -mm mm-unstable branch.  Its filename is
     memory-tiering-rate-limit-numa-migration-throughput.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/memory-tiering-rate-limit-numa-migration-throughput.patch

This patch will later appear in the mm-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Huang Ying <ying.huang@xxxxxxxxx>
Subject: memory tiering: rate limit NUMA migration throughput
Date: Wed, 13 Jul 2022 16:39:52 +0800

In NUMA balancing memory tiering mode, if there are hot pages in slow
memory node and cold pages in fast memory node, we need to promote/demote
hot/cold pages between the fast and cold memory nodes.

A choice is to promote/demote as fast as possible.  But the CPU cycles and
memory bandwidth consumed by the high promoting/demoting throughput will
hurt the latency of some workload because of accessing inflating and slow
memory bandwidth contention.

A way to resolve this issue is to restrict the max promoting/demoting
throughput.  It will take longer to finish the promoting/demoting.  But
the workload latency will be better.  This is implemented in this patch as
the page promotion rate limit mechanism.

The number of the candidate pages to be promoted to the fast memory node
via NUMA balancing is counted, if the count exceeds the limit specified by
the users, the NUMA balancing promotion will be stopped until the next
second.

A new sysctl knob kernel.numa_balancing_promote_rate_limit_MBps is added
for the users to specify the limit.

Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@xxxxxxxxx
Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
Reviewed-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
Tested-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: osalvador <osalvador@xxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxxx>
Cc: Shakeel Butt <shakeelb@xxxxxxxxxx>
Cc: Wei Xu <weixugc@xxxxxxxxxx>
Cc: Yang Shi <shy828301@xxxxxxxxx>
Cc: Zhong Jiang <zhongjiang-ali@xxxxxxxxxxxxxxxxx>
Cc: Zi Yan <ziy@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 Documentation/admin-guide/sysctl/kernel.rst |   11 ++++++
 include/linux/mmzone.h                      |    7 +++
 include/linux/sched/sysctl.h                |    1 
 kernel/sched/fair.c                         |   33 ++++++++++++++++--
 kernel/sysctl.c                             |    8 ++++
 mm/vmstat.c                                 |    1 
 6 files changed, 59 insertions(+), 2 deletions(-)

--- a/Documentation/admin-guide/sysctl/kernel.rst~memory-tiering-rate-limit-numa-migration-throughput
+++ a/Documentation/admin-guide/sysctl/kernel.rst
@@ -635,6 +635,17 @@ different types of memory (represented a
 place the hot pages in the fast memory.  This is implemented based on
 unmapping and page fault too.
 
+numa_balancing_promote_rate_limit_MBps
+======================================
+
+Too high promotion/demotion throughput between different memory types
+may hurt application latency.  This can be used to rate limit the
+promotion throughput.  The per-node max promotion throughput in MB/s
+will be limited to be no more than the set value.
+
+A rule of thumb is to set this to less than 1/10 of the PMEM node
+write bandwidth.
+
 oops_all_cpu_backtrace
 ======================
 
--- a/include/linux/mmzone.h~memory-tiering-rate-limit-numa-migration-throughput
+++ a/include/linux/mmzone.h
@@ -221,6 +221,7 @@ enum node_stat_item {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	PGPROMOTE_SUCCESS,	/* promote successfully */
+	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
 #endif
 	NR_VM_NODE_STAT_ITEMS
 };
@@ -998,6 +999,12 @@ typedef struct pglist_data {
 	struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+	/* start time in ms of current promote rate limit period */
+	unsigned int nbp_rl_start;
+	/* number of promote candidate pages at start time of current rate limit period */
+	unsigned long nbp_rl_nr_cand;
+#endif
 	/* Fields commonly accessed by the page reclaim scanner */
 
 	/*
--- a/include/linux/sched/sysctl.h~memory-tiering-rate-limit-numa-migration-throughput
+++ a/include/linux/sched/sysctl.h
@@ -27,6 +27,7 @@ enum sched_tunable_scaling {
 
 #ifdef CONFIG_NUMA_BALANCING
 extern int sysctl_numa_balancing_mode;
+extern unsigned int sysctl_numa_balancing_promote_rate_limit;
 #else
 #define sysctl_numa_balancing_mode	0
 #endif
--- a/kernel/sched/fair.c~memory-tiering-rate-limit-numa-migration-throughput
+++ a/kernel/sched/fair.c
@@ -1097,6 +1097,9 @@ unsigned int sysctl_numa_balancing_scan_
 /* The page with hint page fault latency < threshold in ms is considered hot */
 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
 
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
 struct numa_group {
 	refcount_t refcount;
 
@@ -1501,6 +1504,29 @@ static int numa_hint_fault_latency(struc
 	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
 
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency.  So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+				      unsigned long rate_limit, int nr)
+{
+	unsigned long nr_cand;
+	unsigned int now, start;
+
+	now = jiffies_to_msecs(jiffies);
+	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+	start = pgdat->nbp_rl_start;
+	if (now - start > MSEC_PER_SEC &&
+	    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+		pgdat->nbp_rl_nr_cand = nr_cand;
+	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+		return true;
+	return false;
+}
+
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 				int src_nid, int dst_cpu)
 {
@@ -1515,7 +1541,7 @@ bool should_numa_migrate_memory(struct t
 	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
 	    !node_is_toptier(src_nid)) {
 		struct pglist_data *pgdat;
-		unsigned long latency, th;
+		unsigned long rate_limit, latency, th;
 
 		pgdat = NODE_DATA(dst_nid);
 		if (pgdat_free_space_enough(pgdat))
@@ -1526,7 +1552,10 @@ bool should_numa_migrate_memory(struct t
 		if (latency >= th)
 			return false;
 
-		return true;
+		rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+			(20 - PAGE_SHIFT);
+		return !numa_promotion_rate_limit(pgdat, rate_limit,
+						  thp_nr_pages(page));
 	}
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
--- a/kernel/sysctl.c~memory-tiering-rate-limit-numa-migration-throughput
+++ a/kernel/sysctl.c
@@ -1641,6 +1641,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_FOUR,
 	},
+	{
+		.procname	= "numa_balancing_promote_rate_limit_MBps",
+		.data		= &sysctl_numa_balancing_promote_rate_limit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 #endif /* CONFIG_NUMA_BALANCING */
 	{
 		.procname	= "panic",
--- a/mm/vmstat.c~memory-tiering-rate-limit-numa-migration-throughput
+++ a/mm/vmstat.c
@@ -1252,6 +1252,7 @@ const char * const vmstat_text[] = {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	"pgpromote_success",
+	"pgpromote_candidate",
 #endif
 
 	/* enum writeback_stat_item counters */
_

Patches currently in -mm which might be from ying.huang@xxxxxxxxx are

memory-tiering-hot-page-selection-with-hint-page-fault-latency.patch
memory-tiering-rate-limit-numa-migration-throughput.patch
memory-tiering-adjust-hot-threshold-automatically.patch