Re: [PATCH 4/5] writeback: per task dirty rate limit

Andrea Righi <andrea@xxxxxxxxxxxxxxx> · Tue, 16 Aug 2011 09:17:09 +0200

On Tue, Aug 16, 2011 at 10:20:10AM +0800, Wu Fengguang wrote:
> Add two fields to task_struct.
> 
> 1) account dirtied pages in the individual tasks, for accuracy
> 2) per-task balance_dirty_pages() call intervals, for flexibility
> 
> The balance_dirty_pages() call interval (i.e. nr_dirtied_pause) will
> scale roughly with the square root of the safety gap between dirty pages
> and the threshold.
> 
> The main problem with per-task nr_dirtied is that, if 1k+ tasks start
> dirtying pages at exactly the same time, each task will be assigned a
> large initial nr_dirtied_pause, so the dirty threshold will be exceeded
> long before each task reaches its nr_dirtied_pause and hence calls
> balance_dirty_pages().
> 
> The solution is to watch for the number of pages dirtied on each CPU in
> between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages
> (3% dirty threshold), force call balance_dirty_pages() for a chance to
> set bdi->dirty_exceeded. In normal situations, this safeguarding
> condition is not expected to trigger at all.
> 
> peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case
> 
> CC: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> Reviewed-by: Andrea Righi <andrea@xxxxxxxxxxxxxxx>
> Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
> ---
>  include/linux/sched.h |    7 +++
>  kernel/fork.c         |    3 +
>  mm/page-writeback.c   |   90 ++++++++++++++++++++++------------------
>  3 files changed, 61 insertions(+), 39 deletions(-)
> 
> --- linux-next.orig/include/linux/sched.h	2011-08-14 18:03:44.000000000 +0800
> +++ linux-next/include/linux/sched.h	2011-08-15 10:26:05.000000000 +0800
> @@ -1525,6 +1525,13 @@ struct task_struct {
>  	int make_it_fail;
>  #endif
>  	struct prop_local_single dirties;
> +	/*
> +	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
> +	 * balance_dirty_pages() for some dirty throttling pause
> +	 */
> +	int nr_dirtied;
> +	int nr_dirtied_pause;
> +
>  #ifdef CONFIG_LATENCYTOP
>  	int latency_record_count;
>  	struct latency_record latency_record[LT_SAVECOUNT];
> --- linux-next.orig/mm/page-writeback.c	2011-08-15 10:26:04.000000000 +0800
> +++ linux-next/mm/page-writeback.c	2011-08-15 13:51:16.000000000 +0800
> @@ -54,20 +54,6 @@
>   */
>  static long ratelimit_pages = 32;
>  
> -/*
> - * When balance_dirty_pages decides that the caller needs to perform some
> - * non-background writeback, this is how many pages it will attempt to write.
> - * It should be somewhat larger than dirtied pages to ensure that reasonably
> - * large amounts of I/O are submitted.
> - */
> -static inline long sync_writeback_pages(unsigned long dirtied)
> -{
> -	if (dirtied < ratelimit_pages)
> -		dirtied = ratelimit_pages;
> -
> -	return dirtied + dirtied / 2;
> -}
> -
>  /* The following parameters are exported via /proc/sys/vm */
>  
>  /*
> @@ -169,6 +155,8 @@ static void update_completion_period(voi
>  	int shift = calc_period_shift();
>  	prop_change_shift(&vm_completions, shift);
>  	prop_change_shift(&vm_dirties, shift);
> +
> +	writeback_set_ratelimit();
>  }
>  
>  int dirty_background_ratio_handler(struct ctl_table *table, int write,
> @@ -930,6 +918,23 @@ static void bdi_update_bandwidth(struct 
>  }
>  
>  /*
> + * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
> + * will look to see if it needs to start dirty throttling.
> + *
> + * If dirty_poll_interval is too low, big NUMA machines will call the expensive
> + * global_page_state() too often. So scale it near-sqrt to the safety margin
> + * (the number of pages we may dirty without exceeding the dirty limits).
> + */
> +static unsigned long dirty_poll_interval(unsigned long dirty,
> +					 unsigned long thresh)
> +{
> +	if (thresh > dirty)
> +		return 1UL << (ilog2(thresh - dirty) >> 1);
> +
> +	return 1;
> +}
> +
> +/*
>   * balance_dirty_pages() must be called by processes which are generating dirty
>   * data.  It looks at the number of dirty pages in the machine and will force
>   * the caller to perform writeback if the system is over `vm_dirty_ratio'.
> @@ -1072,6 +1077,9 @@ static void balance_dirty_pages(struct a
>  	if (clear_dirty_exceeded && bdi->dirty_exceeded)
>  		bdi->dirty_exceeded = 0;
>  
> +	current->nr_dirtied = 0;
> +	current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh);
> +
>  	if (writeback_in_progress(bdi))
>  		return;
>  
> @@ -1098,7 +1106,7 @@ void set_page_dirty_balance(struct page 
>  	}
>  }
>  
> -static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
> +static DEFINE_PER_CPU(int, bdp_ratelimits);
>  
>  /**
>   * balance_dirty_pages_ratelimited_nr - balance dirty memory state
> @@ -1118,31 +1126,40 @@ void balance_dirty_pages_ratelimited_nr(
>  					unsigned long nr_pages_dirtied)
>  {
>  	struct backing_dev_info *bdi = mapping->backing_dev_info;
> -	unsigned long ratelimit;
> -	unsigned long *p;
> +	int ratelimit;
> +	int *p;
>  
>  	if (!bdi_cap_account_dirty(bdi))
>  		return;
>  
> -	ratelimit = ratelimit_pages;
> -	if (mapping->backing_dev_info->dirty_exceeded)
> -		ratelimit = 8;
> +	if (!bdi->dirty_exceeded)
> +		ratelimit = current->nr_dirtied_pause;
> +	else
> +		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

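ratelimit is used before it is initialized here: when bdi->dirty_exceeded
is set, the else branch passes ratelimit to min() before anything has
assigned it.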

Maybe:

	ratelimit = current->nr_dirtied_pause;
	if (bdi->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

Thanks,
-Andrea

> +
> +	current->nr_dirtied += nr_pages_dirtied;
>  
> +	preempt_disable();
>  	/*
> -	 * Check the rate limiting. Also, we do not want to throttle real-time
> -	 * tasks in balance_dirty_pages(). Period.
> +	 * This prevents one CPU from accumulating too many dirtied pages without
> +	 * calling into balance_dirty_pages(), which can happen when there are
> +	 * 1000+ tasks that all start dirtying pages at exactly the same time and
> +	 * hence all honour a too-large initial task->nr_dirtied_pause.
>  	 */
> -	preempt_disable();
>  	p =  &__get_cpu_var(bdp_ratelimits);
> -	*p += nr_pages_dirtied;
> -	if (unlikely(*p >= ratelimit)) {
> -		ratelimit = sync_writeback_pages(*p);
> +	if (unlikely(current->nr_dirtied >= ratelimit))
>  		*p = 0;
> -		preempt_enable();
> -		balance_dirty_pages(mapping, ratelimit);
> -		return;
> +	else {
> +		*p += nr_pages_dirtied;
> +		if (unlikely(*p >= ratelimit_pages)) {
> +			*p = 0;
> +			ratelimit = 0;
> +		}
>  	}
>  	preempt_enable();
> +
> +	if (unlikely(current->nr_dirtied >= ratelimit))
> +		balance_dirty_pages(mapping, current->nr_dirtied);
>  }
>  EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
>  
> @@ -1237,22 +1254,17 @@ void laptop_sync_completion(void)
>   *
>   * Here we set ratelimit_pages to a level which ensures that when all CPUs are
>   * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
> - * thresholds before writeback cuts in.
> - *
> - * But the limit should not be set too high.  Because it also controls the
> - * amount of memory which the balance_dirty_pages() caller has to write back.
> - * If this is too large then the caller will block on the IO queue all the
> - * time.  So limit it to four megabytes - the balance_dirty_pages() caller
> - * will write six megabyte chunks, max.
> + * thresholds.
>   */
>  
>  void writeback_set_ratelimit(void)
>  {
> -	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
> +	unsigned long background_thresh;
> +	unsigned long dirty_thresh;
> +	global_dirty_limits(&background_thresh, &dirty_thresh);
> +	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
>  	if (ratelimit_pages < 16)
>  		ratelimit_pages = 16;
> -	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
> -		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
>  }
>  
>  static int __cpuinit
> --- linux-next.orig/kernel/fork.c	2011-08-14 18:03:44.000000000 +0800
> +++ linux-next/kernel/fork.c	2011-08-15 10:26:05.000000000 +0800
> @@ -1301,6 +1301,9 @@ static struct task_struct *copy_process(
>  	p->pdeath_signal = 0;
>  	p->exit_state = 0;
>  
> +	p->nr_dirtied = 0;
> +	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
> +
>  	/*
>  	 * Ok, make it visible to the rest of the system.
>  	 * We dont wake it up yet.
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html