On 05 Jan 2023 09:52:21 -0300 Marcelo Tosatti <mtosatti@xxxxxxxxxx> wrote: > For nohz_full CPUs, we'd like the per-CPU vm statistics to be > synchronized when userspace is executing. Otherwise, > the vmstat_shepherd might queue a work item to synchronize them, > which is undesired interference for isolated CPUs. > > This means that it's necessary to check for, and possibly sync, > the statistics when returning to userspace. This means that > there are now two execution contexts, on different CPUs, > which require awareness about each other: context switch > and vmstat shepherd kernel thread. > > To avoid the shared variables between these two contexts (which > would require atomic accesses), delegate the responsibility > of statistics synchronization from vmstat_shepherd to the local CPU > context, for nohz_full CPUs. > > Do that by queueing a delayed work when marking per-CPU vmstat dirty. > > When returning to userspace, fold the stats and cancel the delayed work. > > When entering idle, only fold the stats. 
> > Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx> > --- > include/linux/vmstat.h | 4 ++-- > kernel/time/tick-sched.c | 2 +- > mm/vmstat.c | 41 ++++++++++++++++++++++++++++++++--------- > 3 files changed, 35 insertions(+), 12 deletions(-) > > Index: linux-2.6/mm/vmstat.c > =================================================================== > --- linux-2.6.orig/mm/vmstat.c > +++ linux-2.6/mm/vmstat.c > @@ -28,6 +28,7 @@ > #include <linux/mm_inline.h> > #include <linux/page_ext.h> > #include <linux/page_owner.h> > +#include <linux/tick.h> > > #include "internal.h" > > @@ -194,21 +195,57 @@ void fold_vm_numa_events(void) > #endif > > #ifdef CONFIG_SMP > -static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty); > + > +struct vmstat_dirty { > + bool dirty; > +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER > + bool cpu_offline; > +#endif > +}; > + > +static DEFINE_PER_CPU_ALIGNED(struct vmstat_dirty, vmstat_dirty_pcpu); > +static DEFINE_PER_CPU(struct delayed_work, vmstat_work); > +int sysctl_stat_interval __read_mostly = HZ; > + > +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER > +static inline void vmstat_queue_local_work(void) > +{ > + bool vmstat_dirty = this_cpu_read(vmstat_dirty_pcpu.dirty); > + bool cpu_offline = this_cpu_read(vmstat_dirty_pcpu.cpu_offline); > + int cpu = smp_processor_id(); > + > + if (tick_nohz_full_cpu(cpu) && !vmstat_dirty) { > + struct delayed_work *dw; > + > + dw = this_cpu_ptr(&vmstat_work); > + if (!delayed_work_pending(dw) && !cpu_offline) { > + unsigned long delay; > + > + delay = round_jiffies_relative(sysctl_stat_interval); > + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay); Regression wrt V12 if timer is added on the CPU that is not doing HK_TYPE_TIMER? > + } > + } > +} > +#else