Add two fields to task_struct. 1) account dirtied pages in the individual tasks, for accuracy 2) per-task balance_dirty_pages() call intervals, for flexibility The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will scale near-sqrt to the safety gap between dirty pages and threshold. XXX: The main problem of per-task nr_dirtied is, if 10k tasks start dirtying pages at exactly the same time, each task will be assigned a large initial nr_dirtied_pause, so that the dirty threshold will be exceeded long before each task reached its nr_dirtied_pause and hence call balance_dirty_pages(). Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx> --- include/linux/sched.h | 7 ++ mm/memory_hotplug.c | 3 - mm/page-writeback.c | 106 +++++++++------------------------------- 3 files changed, 32 insertions(+), 84 deletions(-) --- linux-next.orig/include/linux/sched.h 2011-08-05 15:36:23.000000000 +0800 +++ linux-next/include/linux/sched.h 2011-08-05 15:39:52.000000000 +0800 @@ -1525,6 +1525,13 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + /* + * when (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for some dirty throttling pause + */ + int nr_dirtied; + int nr_dirtied_pause; + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; --- linux-next.orig/mm/page-writeback.c 2011-08-05 15:39:48.000000000 +0800 +++ linux-next/mm/page-writeback.c 2011-08-05 15:39:52.000000000 +0800 @@ -48,26 +48,6 @@ #define BANDWIDTH_CALC_SHIFT 10 -/* - * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited - * will look to see if it needs to force writeback or throttling. - */ -static long ratelimit_pages = 32; - -/* - * When balance_dirty_pages decides that the caller needs to perform some - * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than dirtied pages to ensure that reasonably - * large amounts of I/O are submitted. - */ -static inline long sync_writeback_pages(unsigned long dirtied) -{ - if (dirtied < ratelimit_pages) - dirtied = ratelimit_pages; - - return dirtied + dirtied / 2; -} - /* The following parameters are exported via /proc/sys/vm */ /* @@ -868,6 +848,23 @@ static void bdi_update_bandwidth(struct } /* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * will look to see if it needs to start dirty throttling. + * + * If ratelimit_pages is too low then big NUMA machines will call the expensive + * global_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). + */ +static unsigned long ratelimit_pages(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +/* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. @@ -1008,6 +1005,9 @@ static void balance_dirty_pages(struct a if (clear_dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; + current->nr_dirtied = 0; + current->nr_dirtied_pause = ratelimit_pages(nr_dirty, dirty_thresh); + if (writeback_in_progress(bdi)) return; @@ -1034,8 +1034,6 @@ void set_page_dirty_balance(struct page } } -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; - /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -1055,30 +1053,17 @@ void balance_dirty_pages_ratelimited_nr( { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long ratelimit; - unsigned long *p; if (!bdi_cap_account_dirty(bdi)) return; - ratelimit = ratelimit_pages; - if (mapping->backing_dev_info->dirty_exceeded) + ratelimit = current->nr_dirtied_pause; + if (bdi->dirty_exceeded) ratelimit = 8; - /* - * Check the rate limiting. Also, we do not want to throttle real-time - * tasks in balance_dirty_pages(). Period. - */ - preempt_disable(); - p = &__get_cpu_var(bdp_ratelimits); - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit)) { - ratelimit = sync_writeback_pages(*p); - *p = 0; - preempt_enable(); - balance_dirty_pages(mapping, ratelimit); - return; - } - preempt_enable(); + current->nr_dirtied += nr_pages_dirtied; + if (unlikely(current->nr_dirtied >= ratelimit)) + balance_dirty_pages(mapping, current->nr_dirtied); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); @@ -1166,44 +1151,6 @@ void laptop_sync_completion(void) #endif /* - * If ratelimit_pages is too high then we can get into dirty-data overload - * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. - * - * Here we set ratelimit_pages to a level which ensures that when all CPUs are - * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory - * thresholds before writeback cuts in. - * - * But the limit should not be set too high. Because it also controls the - * amount of memory which the balance_dirty_pages() caller has to write back. - * If this is too large then the caller will block on the IO queue all the - * time. So limit it to four megabytes - the balance_dirty_pages() caller - * will write six megabyte chunks, max. - */ - -void writeback_set_ratelimit(void) -{ - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); - if (ratelimit_pages < 16) - ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; -} - -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) -{ - writeback_set_ratelimit(); - return NOTIFY_DONE; -} - -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - -/* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory @@ -1225,9 +1172,6 @@ void __init page_writeback_init(void) { int shift; - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); prop_descriptor_init(&vm_dirties, shift); --- linux-next.orig/mm/memory_hotplug.c 2011-08-05 15:36:23.000000000 +0800 +++ linux-next/mm/memory_hotplug.c 2011-08-05 15:39:52.000000000 +0800 @@ -527,8 +527,6 @@ int __ref online_pages(unsigned long pfn vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); - if (onlined_pages) memory_notify(MEM_ONLINE, &arg); unlock_memory_hotplug(); @@ -970,7 +968,6 @@ repeat: } vm_total_pages = nr_free_pagecache_pages(); - writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); unlock_memory_hotplug(); -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html