In current AutoNUMA implementation, the page tables of the processes are scanned periodically to trigger the NUMA hint page faults. The scanning runs in the context of the processes, so will delay the running of the processes. In a test with 64 threads pmbench memory accessing benchmark on a 2-socket server machine with 104 logical CPUs and 256 GB memory, there are more than 20000 latency outliers that are > 1 ms in 3600s run time. These latency outliers are almost all caused by the AutoNUMA page table scanning. Because they almost all disappear after applying this patch to scan the page tables asynchronously. Because there are idle CPUs in system, the asynchronous running page table scanning code can run on these idle CPUs instead of the CPUs the workload is running on. So on system with enough idle CPU time, it's better to scan the page tables asynchronously to take full advantages of these idle CPU time. Another scenario which can benefit from this is to scan the page tables on some service CPUs of the socket, so that the real workload can run on the isolated CPUs without the latency outliers caused by the page table scanning. But it's not perfect to scan page tables asynchronously too. For example, on system without enough idle CPU time, the CPU time isn't scheduled fairly because the page table scanning is charged to the workqueue thread instead of the process/thread it works for. And although the page tables are scanned for the target process, it may run on a CPU that is not in the cpuset of the target process. One possible solution is to let the system administrator to choose the better behavior for the system via a sysctl knob (implemented in the patch). But it's not perfect too. Because every user space knob adds maintenance overhead. A better solution may be to back-charge the CPU time to scan the page tables to the process/thread, and find a way to run the work on the proper cpuset. After some googling, I found there's some discussion about this as in the following thread, https://lkml.org/lkml/2019/6/13/1321 So this patch may be not ready to be merged by upstream yet. It quantizes the latency outliers caused by the page table scanning in AutoNUMA. And it provides a possible way to resolve the issue for users who cares about it. And it is a potential customer of the work related to the cgroup-aware workqueue or other asynchronous execution mechanisms. Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxxx> Cc: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> Cc: Tejun Heo <tj@xxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxx> Cc: Tim Chen <tim.c.chen@xxxxxxxxx> Cc: Aubrey Li <aubrey.li@xxxxxxxxx> --- include/linux/sched.h | 1 + include/linux/sched/sysctl.h | 1 + kernel/exit.c | 1 + kernel/sched/fair.c | 55 +++++++++++++++++++++++++++--------- kernel/sysctl.c | 9 ++++++ 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 04278493bf15..433f77436ed4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1087,6 +1087,7 @@ struct task_struct { u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; + struct work_struct numa_async_work; /* * This pointer is only modified for current in syscall and diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..ea595ae9e573 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -37,6 +37,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_scan_async; #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; diff --git a/kernel/exit.c b/kernel/exit.c index 0b81b26a872a..f5e84ff4b788 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -798,6 +798,7 @@ void __noreturn do_exit(long code) if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); + cancel_work_sync(&tsk->numa_async_work); exit_task_work(tsk); exit_thread(tsk); exit_umh(tsk); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c1217bfe5e81..7d82c6826708 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1064,6 +1064,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* Scan asynchronously via work queue */ +unsigned int sysctl_numa_balancing_scan_async; + struct numa_group { refcount_t refcount; @@ -2481,24 +2484,17 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -/* - * The expensive part of numa migration is done from task_work context. - * Triggered from task_tick_numa(). - */ -static void task_numa_work(struct callback_head *work) +static void numa_scan(struct task_struct *p) { unsigned long migrate, next_scan, now = jiffies; - struct task_struct *p = current; + struct task_struct *pc = current; struct mm_struct *mm = p->mm; - u64 runtime = p->se.sum_exec_runtime; + u64 runtime = pc->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - - work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2621,12 +2617,36 @@ static void task_numa_work(struct callback_head *work) * Usually update_task_scan_period slows down scanning enough; on an * overloaded system we need to limit overhead on a per task basis. */ - if (unlikely(p->se.sum_exec_runtime != runtime)) { - u64 diff = p->se.sum_exec_runtime - runtime; + if (unlikely(pc->se.sum_exec_runtime != runtime)) { + u64 diff = pc->se.sum_exec_runtime - runtime; p->node_stamp += 32 * diff; } } +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +static void task_numa_work(struct callback_head *work) +{ + struct task_struct *p = current; + + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; + + numa_scan(p); +} + +static void numa_async_work(struct work_struct *work) +{ + struct task_struct *p; + + p = container_of(work, struct task_struct, numa_async_work); + + numa_scan(p); +} + void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { int mm_users = 0; @@ -2650,6 +2670,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->last_sum_exec_runtime = 0; init_task_work(&p->numa_work, task_numa_work); + INIT_WORK(&p->numa_async_work, numa_async_work); /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { @@ -2699,8 +2720,14 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + if (sysctl_numa_balancing_scan_async) + queue_work_node(numa_node_id(), + system_unbound_wq, + &curr->numa_async_work); + else + task_work_add(curr, work, true); + } } } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad5b88a53c5a..31a987230c58 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -418,6 +418,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "numa_balancing_scan_async", + .data = &sysctl_numa_balancing_scan_async, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "numa_balancing", .data = NULL, /* filled in by handler */ -- 2.25.1