On Tue, Aug 16, 2011 at 10:20:10AM +0800, Wu Fengguang wrote: > Add two fields to task_struct. > > 1) account dirtied pages in the individual tasks, for accuracy > 2) per-task balance_dirty_pages() call intervals, for flexibility > > The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will > scale near-sqrt to the safety gap between dirty pages and threshold. > > The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying > pages at exactly the same time, each task will be assigned a large > initial nr_dirtied_pause, so that the dirty threshold will be exceeded > long before each task reached its nr_dirtied_pause and hence call > balance_dirty_pages(). > > The solution is to watch for the number of pages dirtied on each CPU in > between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages > (3% dirty threshold), force call balance_dirty_pages() for a chance to > set bdi->dirty_exceeded. In normal situations, this safeguarding > condition is not expected to trigger at all. 
> > peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case > > CC: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> > Reviewed-by: Andrea Righi <andrea@xxxxxxxxxxxxxxx> > Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx> > --- > include/linux/sched.h | 7 +++ > kernel/fork.c | 3 + > mm/page-writeback.c | 90 ++++++++++++++++++++++------------------ > 3 files changed, 61 insertions(+), 39 deletions(-) > > --- linux-next.orig/include/linux/sched.h 2011-08-14 18:03:44.000000000 +0800 > +++ linux-next/include/linux/sched.h 2011-08-15 10:26:05.000000000 +0800 > @@ -1525,6 +1525,13 @@ struct task_struct { > int make_it_fail; > #endif > struct prop_local_single dirties; > + /* > + * when (nr_dirtied >= nr_dirtied_pause), it's time to call > + * balance_dirty_pages() for some dirty throttling pause > + */ > + int nr_dirtied; > + int nr_dirtied_pause; > + > #ifdef CONFIG_LATENCYTOP > int latency_record_count; > struct latency_record latency_record[LT_SAVECOUNT]; > --- linux-next.orig/mm/page-writeback.c 2011-08-15 10:26:04.000000000 +0800 > +++ linux-next/mm/page-writeback.c 2011-08-15 13:51:16.000000000 +0800 > @@ -54,20 +54,6 @@ > */ > static long ratelimit_pages = 32; > > -/* > - * When balance_dirty_pages decides that the caller needs to perform some > - * non-background writeback, this is how many pages it will attempt to write. > - * It should be somewhat larger than dirtied pages to ensure that reasonably > - * large amounts of I/O are submitted. 
> - */ > -static inline long sync_writeback_pages(unsigned long dirtied) > -{ > - if (dirtied < ratelimit_pages) > - dirtied = ratelimit_pages; > - > - return dirtied + dirtied / 2; > -} > - > /* The following parameters are exported via /proc/sys/vm */ > > /* > @@ -169,6 +155,8 @@ static void update_completion_period(voi > int shift = calc_period_shift(); > prop_change_shift(&vm_completions, shift); > prop_change_shift(&vm_dirties, shift); > + > + writeback_set_ratelimit(); > } > > int dirty_background_ratio_handler(struct ctl_table *table, int write, > @@ -930,6 +918,23 @@ static void bdi_update_bandwidth(struct > } > > /* > + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() > + * will look to see if it needs to start dirty throttling. > + * > + * If dirty_poll_interval is too low, big NUMA machines will call the expensive > + * global_page_state() too often. So scale it near-sqrt to the safety margin > + * (the number of pages we may dirty without exceeding the dirty limits). > + */ > +static unsigned long dirty_poll_interval(unsigned long dirty, > + unsigned long thresh) > +{ > + if (thresh > dirty) > + return 1UL << (ilog2(thresh - dirty) >> 1); > + > + return 1; > +} > + > +/* > * balance_dirty_pages() must be called by processes which are generating dirty > * data. It looks at the number of dirty pages in the machine and will force > * the caller to perform writeback if the system is over `vm_dirty_ratio'. 
> @@ -1072,6 +1077,9 @@ static void balance_dirty_pages(struct a > if (clear_dirty_exceeded && bdi->dirty_exceeded) > bdi->dirty_exceeded = 0; > > + current->nr_dirtied = 0; > + current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); > + > if (writeback_in_progress(bdi)) > return; > > @@ -1098,7 +1106,7 @@ void set_page_dirty_balance(struct page > } > } > > -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; > +static DEFINE_PER_CPU(int, bdp_ratelimits); > > /** > * balance_dirty_pages_ratelimited_nr - balance dirty memory state > @@ -1118,31 +1126,40 @@ void balance_dirty_pages_ratelimited_nr( > unsigned long nr_pages_dirtied) > { > struct backing_dev_info *bdi = mapping->backing_dev_info; > - unsigned long ratelimit; > - unsigned long *p; > + int ratelimit; > + int *p; > > if (!bdi_cap_account_dirty(bdi)) > return; > > - ratelimit = ratelimit_pages; > - if (mapping->backing_dev_info->dirty_exceeded) > - ratelimit = 8; > + if (!bdi->dirty_exceeded) > + ratelimit = current->nr_dirtied_pause; > + else > + ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); Isn't ratelimit used here before it is initialized? In the else branch it is passed to min() without ever having been assigned. Maybe assign it unconditionally first and only clamp it when dirty_exceeded is set: ratelimit = current->nr_dirtied_pause; if (bdi->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); Thanks, -Andrea > + > + current->nr_dirtied += nr_pages_dirtied; > > + preempt_disable(); > /* > - * Check the rate limiting. Also, we do not want to throttle real-time > - * tasks in balance_dirty_pages(). Period. > + * This prevents one CPU to accumulate too many dirtied pages without > + * calling into balance_dirty_pages(), which can happen when there are > + * 1000+ tasks, all of them start dirtying pages at exactly the same > + * time, hence all honoured too large initial task->nr_dirtied_pause. 
> */ > - preempt_disable(); > p = &__get_cpu_var(bdp_ratelimits); > - *p += nr_pages_dirtied; > - if (unlikely(*p >= ratelimit)) { > - ratelimit = sync_writeback_pages(*p); > + if (unlikely(current->nr_dirtied >= ratelimit)) > *p = 0; > - preempt_enable(); > - balance_dirty_pages(mapping, ratelimit); > - return; > + else { > + *p += nr_pages_dirtied; > + if (unlikely(*p >= ratelimit_pages)) { > + *p = 0; > + ratelimit = 0; > + } > } > preempt_enable(); > + > + if (unlikely(current->nr_dirtied >= ratelimit)) > + balance_dirty_pages(mapping, current->nr_dirtied); > } > EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); > > @@ -1237,22 +1254,17 @@ void laptop_sync_completion(void) > * > * Here we set ratelimit_pages to a level which ensures that when all CPUs are > * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory > - * thresholds before writeback cuts in. > - * > - * But the limit should not be set too high. Because it also controls the > - * amount of memory which the balance_dirty_pages() caller has to write back. > - * If this is too large then the caller will block on the IO queue all the > - * time. So limit it to four megabytes - the balance_dirty_pages() caller > - * will write six megabyte chunks, max. > + * thresholds. 
> */ > > void writeback_set_ratelimit(void) > { > - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); > + unsigned long background_thresh; > + unsigned long dirty_thresh; > + global_dirty_limits(&background_thresh, &dirty_thresh); > + ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); > if (ratelimit_pages < 16) > ratelimit_pages = 16; > - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) > - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; > } > > static int __cpuinit > --- linux-next.orig/kernel/fork.c 2011-08-14 18:03:44.000000000 +0800 > +++ linux-next/kernel/fork.c 2011-08-15 10:26:05.000000000 +0800 > @@ -1301,6 +1301,9 @@ static struct task_struct *copy_process( > p->pdeath_signal = 0; > p->exit_state = 0; > > + p->nr_dirtied = 0; > + p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); > + > /* > * Ok, make it visible to the rest of the system. > * We dont wake it up yet. > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html