On Thu, 24 Feb 2022 17:28:19 -0800 Suren Baghdasaryan wrote:
> Sending as an RFC to confirm if this is the right direction and to
> clarify if other tasks currently executed on mm_percpu_wq should be
> also moved to kthreads. The patch seems stable in testing but I want
> to collect more performance data before submitting a non-RFC version.
>
>
> Currently drain_all_pages uses mm_percpu_wq to drain pages from pcp
> list during direct reclaim. The tasks on a workqueue can be delayed
> by other tasks in the workqueues using the same per-cpu worker pool.

The pending works may each be freeing a couple of slabs/pages. Who knows?

> This results in sizable delays in drain_all_pages when cpus are highly
> contended.
> Memory management operations designed to relieve memory pressure should
> not be blocked by other tasks, especially if the task in direct
> reclaim has higher priority than the blocking tasks.

I wonder why priority is the right cure for tight memory - if it is not,
then a direct reclaimer of higher priority being delayed was never a
problem to begin with.

Off topic question - why does it make sense in the first place for a task
of lower priority to peel pages off from another of higher priority, if
priority is considered in direct reclaim?

> Replace the usage of mm_percpu_wq with per-cpu low priority FIFO
> kthreads to execute draining tasks.
>
> Suggested-by: Petr Mladek <pmladek@xxxxxxxx>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> ---
>  mm/page_alloc.c | 84 ++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 70 insertions(+), 14 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589febc6d31..c9ab2cf4b05b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -153,7 +153,8 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
>  /* work_structs for global per-cpu drains */
>  struct pcpu_drain {
>  	struct zone *zone;
> -	struct work_struct work;
> +	struct kthread_work work;
> +	struct kthread_worker *worker;
>  };
>  static DEFINE_MUTEX(pcpu_drain_mutex);
>  static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
> @@ -2209,6 +2210,58 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
>
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> +static void drain_local_pages_func(struct kthread_work *work);
> +
> +static int alloc_drain_worker(unsigned int cpu)
> +{
> +	struct pcpu_drain *drain;
> +
> +	mutex_lock(&pcpu_drain_mutex);

Nit, see below.

> +	drain = per_cpu_ptr(&pcpu_drain, cpu);
> +	drain->worker = kthread_create_worker_on_cpu(cpu, 0, "pg_drain/%u", cpu);
> +	if (IS_ERR(drain->worker)) {
> +		drain->worker = NULL;
> +		pr_err("Failed to create pg_drain/%u\n", cpu);
> +		goto out;
> +	}
> +	/* Ensure the thread is not blocked by normal priority tasks */
> +	sched_set_fifo_low(drain->worker->task);
> +	kthread_init_work(&drain->work, drain_local_pages_func);
> +out:
> +	mutex_unlock(&pcpu_drain_mutex);
> +
> +	return 0;
> +}

alloc_drain_worker(unsigned int cpu)
  mutex_lock(&pcpu_drain_mutex);
  drain->worker = kthread_create_worker_on_cpu(cpu, 0, "pg_drain/%u", cpu);
    __kthread_create_worker(cpu, flags, namefmt, args);
      kzalloc(sizeof(*worker), GFP_KERNEL);
        kmalloc
          slab_alloc
            new_slab
              alloc_pages
                __alloc_pages_slowpath
                  __alloc_pages_direct_reclaim
                    drain_all_pages(NULL);
                      __drain_all_pages(zone, false);
                        if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
                          if (!zone)
                            return;
                          mutex_lock(&pcpu_drain_mutex);
                        }

If nothing is missed, the GFP_KERNEL allocation made while holding
pcpu_drain_mutex can recurse into direct reclaim, and wrt pcpu_drain_mutex
that ends in either a deadlock (the zone != NULL path takes mutex_lock()
on a mutex the current task already holds) or no page drained (the
zone == NULL path bails out after the failed trylock).
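If the mutex is only needed to serialize publication of drain->worker
against free_drain_worker(), one way out might be to create the worker
before grabbing pcpu_drain_mutex and only publish it under the lock.
An untested sketch, rearranging the patch's own calls:

	static int alloc_drain_worker(unsigned int cpu)
	{
		struct pcpu_drain *drain;
		struct kthread_worker *worker;

		/*
		 * The GFP_KERNEL allocation now happens outside
		 * pcpu_drain_mutex, so a recursion into
		 * __drain_all_pages() via direct reclaim no longer
		 * meets a mutex held by the current task.
		 */
		worker = kthread_create_worker_on_cpu(cpu, 0, "pg_drain/%u", cpu);
		if (IS_ERR(worker)) {
			pr_err("Failed to create pg_drain/%u\n", cpu);
			return 0;
		}
		/* Ensure the thread is not blocked by normal priority tasks */
		sched_set_fifo_low(worker->task);

		mutex_lock(&pcpu_drain_mutex);
		drain = per_cpu_ptr(&pcpu_drain, cpu);
		kthread_init_work(&drain->work, drain_local_pages_func);
		drain->worker = worker;	/* publish only when fully set up */
		mutex_unlock(&pcpu_drain_mutex);

		return 0;
	}

During the allocation drain->worker is still NULL for this cpu, so a
recursive __drain_all_pages() would take the drain_local_pages_func()
fallback the patch already provides.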
> +
> +static int free_drain_worker(unsigned int cpu)
> +{
> +	struct pcpu_drain *drain;
> +
> +	mutex_lock(&pcpu_drain_mutex);
> +	drain = per_cpu_ptr(&pcpu_drain, cpu);
> +	kthread_cancel_work_sync(&drain->work);
> +	kthread_destroy_worker(drain->worker);
> +	drain->worker = NULL;
> +	mutex_unlock(&pcpu_drain_mutex);
> +
> +	return 0;
> +}
> +
> +static void __init init_drain_workers(void)
> +{
> +	unsigned int cpu;
> +
> +	for_each_online_cpu(cpu)
> +		alloc_drain_worker(cpu);
> +
> +	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
> +				      "page_alloc/drain:online",
> +				      alloc_drain_worker,
> +				      free_drain_worker)) {
> +		pr_err("page_alloc_drain: Failed to allocate a hotplug state\n");
> +	}
> +}
> +
>  void __init page_alloc_init_late(void)
>  {
>  	struct zone *zone;
> @@ -2245,6 +2298,8 @@ void __init page_alloc_init_late(void)
>
>  	for_each_populated_zone(zone)
>  		set_zone_contiguous(zone);
> +
> +	init_drain_workers();
>  }
>
>  #ifdef CONFIG_CMA
> @@ -3144,7 +3199,7 @@ void drain_local_pages(struct zone *zone)
>  		drain_pages(cpu);
>  }
>
> -static void drain_local_pages_wq(struct work_struct *work)
> +static void drain_local_pages_func(struct kthread_work *work)
>  {
>  	struct pcpu_drain *drain;
>
> @@ -3175,6 +3230,7 @@ static void drain_local_pages_wq(struct work_struct *work)
>  static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
>  {
>  	int cpu;
> +	struct pcpu_drain *drain;
>
>  	/*
>  	 * Allocate in the BSS so we won't require allocation in
> @@ -3182,13 +3238,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
>  	 */
>  	static cpumask_t cpus_with_pcps;
>
> -	/*
> -	 * Make sure nobody triggers this path before mm_percpu_wq is fully
> -	 * initialized.
> -	 */
> -	if (WARN_ON_ONCE(!mm_percpu_wq))
> -		return;
> -
>  	/*
>  	 * Do not drain if one is already in progress unless it's specific to
>  	 * a zone. Such callers are primarily CMA and memory hotplug and need
> @@ -3238,14 +3287,21 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
>  	}
>
>  	for_each_cpu(cpu, &cpus_with_pcps) {
> -		struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
> +		drain = per_cpu_ptr(&pcpu_drain, cpu);
>
>  		drain->zone = zone;
> -		INIT_WORK(&drain->work, drain_local_pages_wq);
> -		queue_work_on(cpu, mm_percpu_wq, &drain->work);
> +		if (likely(drain->worker))
> +			kthread_queue_work(drain->worker, &drain->work);
> +	}
> +	/* Wait for kthreads to finish or drain itself */
> +	for_each_cpu(cpu, &cpus_with_pcps) {
> +		drain = per_cpu_ptr(&pcpu_drain, cpu);
> +
> +		if (likely(drain->worker))
> +			kthread_flush_work(&drain->work);
> +		else
> +			drain_local_pages_func(&drain->work);
>  	}
> -	for_each_cpu(cpu, &cpus_with_pcps)
> -		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
>
>  	mutex_unlock(&pcpu_drain_mutex);
>  }
> --
> 2.35.1.574.g5d30c73bfb-goog
>
>
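For readers less familiar with the kthread_worker machinery the patch
switches to, the lifecycle is roughly the following - a standalone
sketch, not part of the patch, with error handling trimmed and the
names example/work_fn made up:

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static struct kthread_worker *worker;
	static struct kthread_work work;

	/* Runs in the dedicated kthread, bound to the chosen cpu. */
	static void work_fn(struct kthread_work *w)
	{
	}

	static int example(unsigned int cpu)
	{
		worker = kthread_create_worker_on_cpu(cpu, 0, "example/%u", cpu);
		if (IS_ERR(worker))
			return PTR_ERR(worker);
		/* Low RT priority, same call the patch uses. */
		sched_set_fifo_low(worker->task);
		kthread_init_work(&work, work_fn);

		kthread_queue_work(worker, &work);	/* hand off one item */
		kthread_flush_work(&work);		/* wait for it to complete */

		kthread_destroy_worker(worker);		/* flush, stop and free */
		return 0;
	}

Unlike a work item on the shared per-cpu workqueue pool, nothing can
delay this work except earlier items queued on the same dedicated
worker and whatever the scheduler runs above its priority.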