[PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



For nohz full CPUs, manage per-CPU stat syncing from CPU context:
start delayed work when marking per-CPU vmstat dirty.

When returning to userspace, fold the stats and cancel the delayed work.

When entering idle, only fold the stats.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
---
 include/linux/vmstat.h   |    4 ++--
 kernel/time/tick-sched.c |    2 +-
 mm/vmstat.c              |   41 ++++++++++++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 12 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -28,6 +28,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/tick.h>
 
 #include "internal.h"
 
@@ -195,9 +196,24 @@ void fold_vm_numa_events(void)
 
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
 
 static inline void vmstat_mark_dirty(void)
 {
+	int cpu = smp_processor_id();
+
+	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
+		struct delayed_work *dw;
+
+		dw = &per_cpu(vmstat_work, cpu);
+		if (!delayed_work_pending(dw)) {
+			unsigned long delay;
+
+			delay = round_jiffies_relative(sysctl_stat_interval);
+			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+		}
+	}
 	this_cpu_write(vmstat_dirty, true);
 }
 
@@ -1886,9 +1902,6 @@ static const struct seq_operations vmsta
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
 #ifdef CONFIG_PROC_FS
 static void refresh_vm_stats(struct work_struct *work)
 {
@@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
 {
+	struct delayed_work *dw;
+
 	if (system_state != SYSTEM_RUNNING)
 		return;
 
 	if (!is_vmstat_dirty())
 		return;
 
+	refresh_cpu_vm_stats(false);
+
+	if (!user)
+		return;
 	/*
-	 * Just refresh counters and do not care about the pending delayed
-	 * vmstat_update. It doesn't fire that often to matter and canceling
-	 * it would be too expensive from this path.
-	 * vmstat_shepherd will take care about that for us.
+	 * If the tick is stopped, cancel any delayed work to avoid
+	 * interruptions to this CPU in the future.
 	 */
-	refresh_cpu_vm_stats(false);
+	dw = &per_cpu(vmstat_work, smp_processor_id());
+	if (delayed_work_pending(dw))
+		cancel_delayed_work(dw);
 }
 
 /*
@@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
 	for_each_online_cpu(cpu) {
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
+		/* NOHZ full CPUs manage their own vmstat flushing */
+		if (tick_nohz_full_cpu(smp_processor_id()))
+			continue;
+
 		if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
 
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -290,7 +290,7 @@ extern void dec_zone_state(struct zone *
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
 
-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
@@ -403,7 +403,7 @@ static inline void __dec_node_page_state
 
 static inline void refresh_zone_stat_thresholds(void) { }
 static inline void cpu_vm_stats_fold(int cpu) { }
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }
 
 static inline void drain_zonestat(struct zone *zone,
 			struct per_cpu_zonestat *pzstats) { }
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -911,7 +911,7 @@ static void tick_nohz_stop_tick(struct t
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
-		quiet_vmstat();
+		quiet_vmstat(false);
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;






[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux