Make use of the new qpw_{un,}lock*() and queue_percpu_work_on() interface to improve performance & latency on PREEMTP_RT kernels. For functions that may be scheduled in a different cpu, replace local_{un,}lock*() by qpw_{un,}lock*(), and replace schedule_work_on() by queue_percpu_work_on(). The same happens for flush_work() and flush_percpu_work(). The change requires allocation of qpw_structs instead of a work_structs, and changing parameters of a few functions to include the cpu parameter. This should bring no relevant performance impact on non-RT kernels: For functions that may be scheduled in a different cpu, the local_*lock's this_cpu_ptr() becomes a per_cpu_ptr(smp_processor_id()). Signed-off-by: Leonardo Bras <leobras@xxxxxxxxxx> --- mm/swap.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 67786cb77130..c1a61b7cd71a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -28,21 +28,21 @@ #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> -#include <linux/local_lock.h> +#include <linux/qpw.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; @@ -758,45 +758,45 @@ void lru_add_drain(void) local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ -static void lru_add_and_bh_lrus_drain(void) +static void lru_add_and_bh_lrus_drain(int cpu) { - local_lock(&cpu_fbatches.lock); - lru_add_drain_cpu(smp_processor_id()); - local_unlock(&cpu_fbatches.lock); + qpw_lock(&cpu_fbatches.lock, cpu); + lru_add_drain_cpu(cpu); + qpw_unlock(&cpu_fbatches.lock, cpu); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); +static DEFINE_PER_CPU(struct qpw_struct, lru_add_drain_qpw); -static void lru_add_drain_per_cpu(struct work_struct *dummy) +static void lru_add_drain_per_cpu(struct work_struct *w) { - lru_add_and_bh_lrus_drain(); + lru_add_and_bh_lrus_drain(qpw_get_cpu(w)); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || folio_batch_count(&fbatches->lru_deactivate_file) || @@ -882,31 +882,31 @@ static inline void __lru_add_drain_all(bool force_all_cpus) * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + struct qpw_struct *qpw = &per_cpu(lru_add_drain_qpw, cpu); if (cpu_needs_drain(cpu)) { - INIT_WORK(work, lru_add_drain_per_cpu); - queue_work_on(cpu, mm_percpu_wq, work); + INIT_QPW(qpw, lru_add_drain_per_cpu, cpu); + queue_percpu_work_on(cpu, mm_percpu_wq, qpw); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) - flush_work(&per_cpu(lru_add_drain_work, cpu)); + flush_percpu_work(&per_cpu(lru_add_drain_qpw, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else @@ -939,21 +939,21 @@ void lru_cache_disable(void) * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else - lru_add_and_bh_lrus_drain(); + lru_add_and_bh_lrus_drain(smp_processor_id()); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need -- 2.45.2