This patch adds a new prctl API: PR_NUMA_BALANCING.

Numa balancing works by triggering page faults, and while it is running this burst of faults can cause a performance loss. Processes that care about worst-case performance therefore need numa balancing disabled. Others, on the contrary, can accept a temporary performance loss in exchange for higher average performance, so enabling numa balancing is better for them. However, numa balancing can currently only be controlled globally via /proc/sys/kernel/numa_balancing. For the above case, we want to disable/enable numa balancing per process instead: add a numa_balancing field under mm_struct, then consult it in task_tick_numa().

Disable/enable per-process numa balancing:

	prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0/1);

Get the numa_balancing state:

	prctl(PR_NUMA_BALANCING, PR_GET_NUMA_BALANCING, &ret);
	cat /proc/<pid>/status | grep NumaBalancing_enabled

mm->numa_balancing only takes effect while global numa balancing is enabled. When global numa balancing is disabled, mm->numa_balancing keeps its value, but reading the per-process state always returns 0 and setting it via prctl fails with an error.
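For example, a process can opt out of numa balancing and read the state back like this (a minimal userspace sketch; the PR_* constants are defined locally because they are introduced by this patch and are not in released uapi headers):

	#include <stdio.h>
	#include <sys/prctl.h>

	/* Values taken from this patch's include/uapi/linux/prctl.h hunk. */
	#ifndef PR_NUMA_BALANCING
	#define PR_NUMA_BALANCING	63
	#define PR_SET_NUMA_BALANCING	0
	#define PR_GET_NUMA_BALANCING	1
	#endif

	int main(void)
	{
		int state = -1;

		/* Opt this process out of numa balancing. */
		if (prctl(PR_NUMA_BALANCING, PR_SET_NUMA_BALANCING, 0))
			perror("PR_SET_NUMA_BALANCING");

		/* Read the per-process state back into 'state'. */
		if (prctl(PR_NUMA_BALANCING, PR_GET_NUMA_BALANCING, &state))
			perror("PR_GET_NUMA_BALANCING");

		printf("numa_balancing: %d\n", state);
		return 0;
	}

On a kernel whose global /proc/sys/kernel/numa_balancing switch is off, the SET call above fails and the GET call reports 0, matching the behavior described above.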
Signed-off-by: Gang Li <ligang.bdlg@xxxxxxxxxxxxx>
---
 Documentation/filesystems/proc.rst |  2 ++
 fs/proc/task_mmu.c                 | 16 ++++++++++++
 include/linux/mm_types.h           |  3 +++
 include/uapi/linux/prctl.h         |  5 ++++
 kernel/fork.c                      |  3 +++
 kernel/sched/fair.c                |  3 +++
 kernel/sys.c                       | 39 ++++++++++++++++++++++++++++++
 7 files changed, 71 insertions(+)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 8d7f141c6fc7..b90f43ed0668 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -192,6 +192,7 @@ read the file /proc/PID/status::
   VmLib:                    1412 kB
   VmPTE:                      20 kb
   VmSwap:                      0 kB
+  NumaBalancing_enabled:       1
   HugetlbPages:                0 kB
   CoreDumping:                 0
   THP_enabled:                 1
@@ -273,6 +274,7 @@ It's slow but very precise.
 VmPTE                       size of page table entries
 VmSwap                      amount of swap used by anonymous private data
                             (shmem swap usage is not included)
+NumaBalancing_enabled       numa balancing state, use prctl(PR_NUMA_BALANCING, ...)
 HugetlbPages                size of hugetlb memory portions
 CoreDumping                 process's memory is currently being dumped
                             (killing the process may lead to a corrupted core)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad667dbc96f5..161295e027d2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/sched/numa_balancing.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -27,14 +28,23 @@
 #define SEQ_PUT_DEC(str, val) \
 		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
+
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
 	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_balancing;
+#endif
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
 	file = get_mm_counter(mm, MM_FILEPAGES);
 	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+#ifdef CONFIG_NUMA_BALANCING
+	numa_balancing = READ_ONCE(mm->numa_balancing);
+#endif
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -75,6 +85,12 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
 	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
 	seq_puts(m, " kB\n");
+#ifdef CONFIG_NUMA_BALANCING
+	if (!static_branch_unlikely(&sched_numa_balancing))
+		numa_balancing = 0;
+
+	seq_printf(m, "NumaBalancing_enabled:\t%d\n", numa_balancing);
+#endif
 	hugetlb_report_usage(m, mm);
 }
 #undef SEQ_PUT_DEC
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb8c6f5f19bc..feeb6f639f87 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -612,6 +612,9 @@ struct mm_struct {
 
 		/* numa_scan_seq prevents two threads setting pte_numa */
 		int numa_scan_seq;
+
+		/* numa_balancing controls the numa balancing of this mm */
+		int numa_balancing;
 #endif
 		/*
 		 * An operation with batched TLB flushing is going on.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b2e4dc1449b9..2235b75efd30 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -272,4 +272,9 @@ struct prctl_mm_map {
 # define PR_SCHED_CORE_SCOPE_THREAD_GROUP	1
 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP	2
 
+/* Set/get enabled per-process numa_balancing */
+#define PR_NUMA_BALANCING		63
+# define PR_SET_NUMA_BALANCING		0
+# define PR_GET_NUMA_BALANCING		1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 2079f1ebfe63..39e9d5daf00a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1110,6 +1110,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	mm->numa_balancing = 1;
 #endif
 	mm_init_uprobes_state(mm);
 	hugetlb_count_init(mm);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87db481e8a56..1325253e3613 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2866,6 +2866,9 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
 		return;
 
+	if (!READ_ONCE(curr->mm->numa_balancing))
+		return;
+
 	/*
 	 * Using runtime rather than walltime has the dual advantage that
 	 * we (mostly) drive the selection from busy threads and that the
diff --git a/kernel/sys.c b/kernel/sys.c
index 8fdac0d90504..64aee3d63ea8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -154,6 +154,8 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
 EXPORT_SYMBOL(fs_overflowuid);
 EXPORT_SYMBOL(fs_overflowgid);
 
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 /*
  * Returns true if current's euid is same as p's uid or euid,
  * or has CAP_SYS_NICE to p's user_ns.
@@ -2081,6 +2083,28 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static int prctl_pid_numa_balancing_write(int numa_balancing)
+{
+	if (!static_branch_unlikely(&sched_numa_balancing))
+		return -EPERM;
+
+	if (numa_balancing != 0 && numa_balancing != 1)
+		return -EINVAL;
+
+	WRITE_ONCE(current->mm->numa_balancing, numa_balancing);
+	return 0;
+}
+
+static int prctl_pid_numa_balancing_read(void)
+{
+	if (!static_branch_unlikely(&sched_numa_balancing))
+		return 0;
+
+	return READ_ONCE(current->mm->numa_balancing);
+}
+#endif
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
@@ -2525,6 +2549,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = set_syscall_user_dispatch(arg2, arg3, arg4,
 						  (char __user *) arg5);
 		break;
+#ifdef CONFIG_NUMA_BALANCING
+	case PR_NUMA_BALANCING:
+		switch (arg2) {
+		case PR_SET_NUMA_BALANCING:
+			error = prctl_pid_numa_balancing_write((int)arg3);
+			break;
+		case PR_GET_NUMA_BALANCING:
+			error = put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		break;
+#endif
 #ifdef CONFIG_SCHED_CORE
 	case PR_SCHED_CORE:
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
-- 
2.20.1