For nohz full CPUs, manage per-CPU stat syncing from CPU context:
start delayed work when marking per-CPU vmstat dirty.

When returning to userspace, fold the stats and cancel the delayed work.

When entering idle, only fold the stats.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

---
 include/linux/vmstat.h   |    4 ++--
 kernel/time/tick-sched.c |    2 +-
 mm/vmstat.c              |   41 ++++++++++++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 12 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -28,6 +28,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/tick.h>
 
 #include "internal.h"
 
@@ -195,9 +196,24 @@ void fold_vm_numa_events(void)
 
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
 
 static inline void vmstat_mark_dirty(void)
 {
+	int cpu = smp_processor_id();
+
+	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
+		struct delayed_work *dw;
+
+		dw = &per_cpu(vmstat_work, cpu);
+		if (!delayed_work_pending(dw)) {
+			unsigned long delay;
+
+			delay = round_jiffies_relative(sysctl_stat_interval);
+			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+		}
+	}
 	this_cpu_write(vmstat_dirty, true);
 }
 
@@ -1886,9 +1902,6 @@ static const struct seq_operations vmsta
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
 #ifdef CONFIG_PROC_FS
 static void refresh_vm_stats(struct work_struct *work)
 {
@@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
 {
+	struct delayed_work *dw;
+
 	if (system_state != SYSTEM_RUNNING)
 		return;
 
 	if (!is_vmstat_dirty())
 		return;
 
+	refresh_cpu_vm_stats(false);
+
+	if (!user)
+		return;
 	/*
-	 * Just refresh counters and do not care about the pending delayed
-	 * vmstat_update. It doesn't fire that often to matter and canceling
-	 * it would be too expensive from this path.
-	 * vmstat_shepherd will take care about that for us.
+	 * If the tick is stopped, cancel any delayed work to avoid
+	 * interruptions to this CPU in the future.
	 */
-	refresh_cpu_vm_stats(false);
+	dw = &per_cpu(vmstat_work, smp_processor_id());
+	if (delayed_work_pending(dw))
+		cancel_delayed_work(dw);
 }
 
 /*
@@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
 	for_each_online_cpu(cpu) {
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
+		/* NOHZ full CPUs manage their own vmstat flushing */
+		if (tick_nohz_full_cpu(cpu))
+			continue;
+
 		if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
 
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -290,7 +290,7 @@ extern void dec_zone_state(struct zone *
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
 
-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
@@ -403,7 +403,7 @@ static inline void __dec_node_page_state
 static inline void refresh_zone_stat_thresholds(void) { }
 static inline void cpu_vm_stats_fold(int cpu) { }
 
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }
 
 static inline void drain_zonestat(struct zone *zone,
 		struct per_cpu_zonestat *pzstats) { }
 
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -911,7 +911,7 @@ static void tick_nohz_stop_tick(struct t
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
-		quiet_vmstat();
+		quiet_vmstat(false);
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
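
The userspace return path mentioned in the changelog ("fold the stats and
cancel the delayed work") is not part of this diff. As a minimal sketch
only, assuming a nohz_full exit-to-user hook exists at that point
(vmstat_user_enter_sketch is an illustrative name, not an existing kernel
symbol), the caller side could look like:

#include <linux/smp.h>
#include <linux/tick.h>
#include <linux/vmstat.h>

/*
 * Illustrative sketch, not part of this patch: on a nohz_full CPU about
 * to return to userspace, quiet_vmstat(true) folds the per-CPU diffs and
 * cancels the pending vmstat_work, so no deferred work interrupts the
 * CPU later. Idle entry (tick_nohz_stop_tick) instead passes false,
 * which only folds the counters and leaves the delayed work alone.
 */
static void vmstat_user_enter_sketch(void)
{
	if (tick_nohz_full_cpu(smp_processor_id()))
		quiet_vmstat(true);	/* fold stats + cancel vmstat_work */
}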