Edmund Nadolski reported the same problem that Kosaki reported against commit
[88f5acf8: mm: page allocator: adjust the per-cpu counter threshold when
memory is low], whereby kswapd ended up in an inconsistent locking state due
to calling get_online_cpus(); see https://lkml.org/lkml/2011/3/2/398 for
details.

This is already fixed upstream by commit [b44129b3: mm: vmstat: use a single
setter function and callback for adjusting percpu thresholds]. Unless there
is an objection, can this be picked up for 2.6.37-stable please? Ideally it
would apply against 2.6.36.x as well, but that release is no longer
maintained.
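
For anyone reviewing the backport who wants the shape of the change without
reading the diff: the patch collapses two nearly identical per-zone loops
into a single setter that takes the threshold calculation as a function
pointer. Below is only a rough userspace analogy of that pattern; the struct,
function names and numbers are invented for illustration and are not the
kernel code.

#include <stdio.h>

/* Toy stand-in for the kernel's per-zone vmstat threshold state */
struct zone {
	int managed_pages;
	int stat_threshold;
};

/* Stand-ins for the pressure/normal threshold calculations */
static int pressure_threshold(struct zone *z)
{
	(void)z;
	return 2;			/* small threshold: little drift allowed */
}

static int normal_threshold(struct zone *z)
{
	return z->managed_pages / 1024;	/* larger threshold: cheaper updates */
}

/* Single setter: the callback decides each zone's threshold */
static void set_thresholds(struct zone *zones, int nr,
			   int (*calc)(struct zone *))
{
	for (int i = 0; i < nr; i++)
		zones[i].stat_threshold = calc(&zones[i]);
}

int main(void)
{
	struct zone zones[2] = { { 262144, 0 }, { 524288, 0 } };

	set_thresholds(zones, 2, pressure_threshold);	/* kswapd awake */
	printf("pressure: %d %d\n",
	       zones[0].stat_threshold, zones[1].stat_threshold);

	set_thresholds(zones, 2, normal_threshold);	/* kswapd asleep */
	printf("normal:   %d %d\n",
	       zones[0].stat_threshold, zones[1].stat_threshold);
	return 0;
}

The reason the thresholds are switched at all is the drift bound mentioned in
the comment the patch adds to kswapd(): a counter such as NR_FREE_PAGES can be
off by up to nr_online_cpus * threshold. As an illustrative calculation only,
16 online CPUs with a per-cpu threshold of 125 allow up to 2000 pages (about
8MB with 4KB pages) of error, which is enough to falsely report that a low
watermark is still satisfied.
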
==== CUT HERE ====

reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to
adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors
due to counter drift. The functions duplicate some code, so this patch
replaces them with a single set_pgdat_percpu_threshold() that takes a
callback function to calculate the desired threshold as a parameter.

[akpm@xxxxxxxxxxxxxxxxxxxx: readability tweak]
[kosaki.motohiro@xxxxxxxxxxxxxx: set_pgdat_percpu_threshold(): don't use for_each_online_cpu]
Signed-off-by: Mel Gorman <mel@xxxxxxxxx>
Reviewed-by: Christoph Lameter <cl@xxxxxxxxx>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index e4cc21c..833e676 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
+
+int calculate_pressure_threshold(struct zone *zone);
+int calculate_normal_threshold(struct zone *zone);
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+				int (*calculate_pressure)(struct zone *));
 #else /* CONFIG_SMP */
 
 /*
@@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
-static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
-static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+#define set_pgdat_percpu_threshold(pgdat, callback) { }
 
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5da4295..86f8c34 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2448,9 +2448,24 @@ static int kswapd(void *p)
 				 */
 				if (!sleeping_prematurely(pgdat, order, remaining)) {
 					trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
-					restore_pgdat_percpu_threshold(pgdat);
+
+					/*
+					 * vmstat counters are not perfectly
+					 * accurate and the estimated value
+					 * for counters such as NR_FREE_PAGES
+					 * can deviate from the true value by
+					 * nr_online_cpus * threshold. To
+					 * avoid the zone watermarks being
+					 * breached while under pressure, we
+					 * reduce the per-cpu vmstat threshold
+					 * while kswapd is awake and restore
+					 * them before going back to sleep.
+					 */
+					set_pgdat_percpu_threshold(pgdat,
+						calculate_normal_threshold);
 					schedule();
-					reduce_pgdat_percpu_threshold(pgdat);
+					set_pgdat_percpu_threshold(pgdat,
+						calculate_pressure_threshold);
 				} else {
 					if (remaining)
 						count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index bc0f095..751a65e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,7 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-static int calculate_pressure_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
 {
 	int threshold;
 	int watermark_distance;
@@ -107,7 +107,7 @@ static int calculate_pressure_threshold(struct zone *zone)
 	return threshold;
 }
 
-static int calculate_threshold(struct zone *zone)
+int calculate_normal_threshold(struct zone *zone)
 {
 	int threshold;
 	int mem;	/* memory in 128 MB units */
@@ -166,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
 	for_each_populated_zone(zone) {
 		unsigned long max_drift, tolerate_drift;
 
-		threshold = calculate_threshold(zone);
+		threshold = calculate_normal_threshold(zone);
 
 		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -185,46 +185,24 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+				int (*calculate_pressure)(struct zone *))
 {
 	struct zone *zone;
 	int cpu;
 	int threshold;
 	int i;
 
-	get_online_cpus();
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		zone = &pgdat->node_zones[i];
-		if (!zone->percpu_drift_mark)
-			continue;
-
-		threshold = calculate_pressure_threshold(zone);
-		for_each_online_cpu(cpu)
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
-	}
-	put_online_cpus();
-}
-
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
-{
-	struct zone *zone;
-	int cpu;
-	int threshold;
-	int i;
-
-	get_online_cpus();
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		zone = &pgdat->node_zones[i];
 		if (!zone->percpu_drift_mark)
 			continue;
 
-		threshold = calculate_threshold(zone);
-		for_each_online_cpu(cpu)
+		threshold = (*calculate_pressure)(zone);
+		for_each_possible_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
 	}
-	put_online_cpus();
 }
 
 /*