[RFC PATCH v2 12/16] mm: factor out _balance_dirty_pages() from balance_dirty_pages()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This factors out the for loop in balance_dirty_pages() into a new
function called _balance_dirty_pages(). By factoring out this function
the async write code can determine if it has to wait to throttle writes
or not. The function _balance_dirty_pages() returns the sleep time.
If the sleep time is greater 0, then the async write code needs to throttle.

To maintain the context for consecutive calls of _balance_dirty_pages()
and maintain the current behavior a new data structure called bdp_ctx
has been introduced.

Signed-off-by: Stefan Roesch <shr@xxxxxx>
---
 mm/page-writeback.c | 452 +++++++++++++++++++++++---------------------
 1 file changed, 239 insertions(+), 213 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7e2da284e427..cbb74c0666c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -140,6 +140,29 @@ struct dirty_throttle_control {
 	unsigned long		pos_ratio;
 };
 
+/* context for consecutive calls to _balance_dirty_pages() */
+struct bdp_ctx {
+	long			pause;
+	unsigned long		now;
+	unsigned long		start_time;
+	unsigned long		task_ratelimit;
+	unsigned long		dirty_ratelimit;
+	unsigned long		nr_reclaimable;
+	int			nr_dirtied_pause;
+	bool			dirty_exceeded;
+
+	struct dirty_throttle_control gdtc_stor;
+	struct dirty_throttle_control mdtc_stor;
+	struct dirty_throttle_control *sdtc;
+} bdp_ctx;
+
+/* initialize _balance_dirty_pages() context */
+#define BDP_CTX_INIT(ctx, wb)				\
+	.gdtc_stor = { GDTC_INIT(wb) },			\
+	.mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) },	\
+	.start_time = jiffies,				\
+	.dirty_exceeded = false
+
 /*
  * Length of period for aging writeout fractions of bdis. This is an
  * arbitrarily chosen number. The longer the period, the slower fractions will
@@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
 	}
 }
 
-/*
- * balance_dirty_pages() must be called by processes which are generating dirty
- * data.  It looks at the number of dirty pages in the machine and will force
- * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
- * If we're over `background_thresh' then the writeback threads are woken to
- * perform some writeout.
- */
-static void balance_dirty_pages(struct bdi_writeback *wb,
-				unsigned long pages_dirtied)
+static inline int _balance_dirty_pages(struct bdi_writeback *wb,
+					unsigned long pages_dirtied, struct bdp_ctx *ctx)
 {
-	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
-	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
-	struct dirty_throttle_control * const gdtc = &gdtc_stor;
-	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
-						     &mdtc_stor : NULL;
-	struct dirty_throttle_control *sdtc;
-	unsigned long nr_reclaimable;	/* = file_dirty */
+	struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor;
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ?
+						     &ctx->mdtc_stor : NULL;
 	long period;
-	long pause;
 	long max_pause;
 	long min_pause;
-	int nr_dirtied_pause;
-	bool dirty_exceeded = false;
-	unsigned long task_ratelimit;
-	unsigned long dirty_ratelimit;
 	struct backing_dev_info *bdi = wb->bdi;
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
-	unsigned long start_time = jiffies;
 
-	for (;;) {
-		unsigned long now = jiffies;
-		unsigned long dirty, thresh, bg_thresh;
-		unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
-		unsigned long m_thresh = 0;
-		unsigned long m_bg_thresh = 0;
-
-		nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
-		gdtc->avail = global_dirtyable_memory();
-		gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+	unsigned long dirty, thresh, bg_thresh;
+	unsigned long m_dirty = 0;	/* stop bogus uninit warnings */
+	unsigned long m_thresh = 0;
+	unsigned long m_bg_thresh = 0;
 
-		domain_dirty_limits(gdtc);
+	ctx->now = jiffies;
+	ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
+	gdtc->avail = global_dirtyable_memory();
+	gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK);
 
-		if (unlikely(strictlimit)) {
-			wb_dirty_limits(gdtc);
+	domain_dirty_limits(gdtc);
 
-			dirty = gdtc->wb_dirty;
-			thresh = gdtc->wb_thresh;
-			bg_thresh = gdtc->wb_bg_thresh;
-		} else {
-			dirty = gdtc->dirty;
-			thresh = gdtc->thresh;
-			bg_thresh = gdtc->bg_thresh;
-		}
+	if (unlikely(strictlimit)) {
+		wb_dirty_limits(gdtc);
 
-		if (mdtc) {
-			unsigned long filepages, headroom, writeback;
+		dirty = gdtc->wb_dirty;
+		thresh = gdtc->wb_thresh;
+		bg_thresh = gdtc->wb_bg_thresh;
+	} else {
+		dirty = gdtc->dirty;
+		thresh = gdtc->thresh;
+		bg_thresh = gdtc->bg_thresh;
+	}
 
-			/*
-			 * If @wb belongs to !root memcg, repeat the same
-			 * basic calculations for the memcg domain.
-			 */
-			mem_cgroup_wb_stats(wb, &filepages, &headroom,
-					    &mdtc->dirty, &writeback);
-			mdtc->dirty += writeback;
-			mdtc_calc_avail(mdtc, filepages, headroom);
-
-			domain_dirty_limits(mdtc);
-
-			if (unlikely(strictlimit)) {
-				wb_dirty_limits(mdtc);
-				m_dirty = mdtc->wb_dirty;
-				m_thresh = mdtc->wb_thresh;
-				m_bg_thresh = mdtc->wb_bg_thresh;
-			} else {
-				m_dirty = mdtc->dirty;
-				m_thresh = mdtc->thresh;
-				m_bg_thresh = mdtc->bg_thresh;
-			}
-		}
+	if (mdtc) {
+		unsigned long filepages, headroom, writeback;
 
 		/*
-		 * Throttle it only when the background writeback cannot
-		 * catch-up. This avoids (excessively) small writeouts
-		 * when the wb limits are ramping up in case of !strictlimit.
-		 *
-		 * In strictlimit case make decision based on the wb counters
-		 * and limits. Small writeouts when the wb limits are ramping
-		 * up are the price we consciously pay for strictlimit-ing.
-		 *
-		 * If memcg domain is in effect, @dirty should be under
-		 * both global and memcg freerun ceilings.
+		 * If @wb belongs to !root memcg, repeat the same
+		 * basic calculations for the memcg domain.
 		 */
-		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
-		    (!mdtc ||
-		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
-			unsigned long intv;
-			unsigned long m_intv;
+		mem_cgroup_wb_stats(wb, &filepages, &headroom,
+				    &mdtc->dirty, &writeback);
+		mdtc->dirty += writeback;
+		mdtc_calc_avail(mdtc, filepages, headroom);
 
-free_running:
-			intv = dirty_poll_interval(dirty, thresh);
-			m_intv = ULONG_MAX;
+		domain_dirty_limits(mdtc);
 
-			current->dirty_paused_when = now;
-			current->nr_dirtied = 0;
-			if (mdtc)
-				m_intv = dirty_poll_interval(m_dirty, m_thresh);
-			current->nr_dirtied_pause = min(intv, m_intv);
-			break;
+		if (unlikely(strictlimit)) {
+			wb_dirty_limits(mdtc);
+			m_dirty = mdtc->wb_dirty;
+			m_thresh = mdtc->wb_thresh;
+			m_bg_thresh = mdtc->wb_bg_thresh;
+		} else {
+			m_dirty = mdtc->dirty;
+			m_thresh = mdtc->thresh;
+			m_bg_thresh = mdtc->bg_thresh;
 		}
+	}
 
-		if (unlikely(!writeback_in_progress(wb)))
-			wb_start_background_writeback(wb);
+	/*
+	 * Throttle it only when the background writeback cannot
+	 * catch-up. This avoids (excessively) small writeouts
+	 * when the wb limits are ramping up in case of !strictlimit.
+	 *
+	 * In strictlimit case make decision based on the wb counters
+	 * and limits. Small writeouts when the wb limits are ramping
+	 * up are the price we consciously pay for strictlimit-ing.
+	 *
+	 * If memcg domain is in effect, @dirty should be under
+	 * both global and memcg freerun ceilings.
+	 */
+	if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+	    (!mdtc ||
+	     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+		unsigned long intv;
+		unsigned long m_intv;
 
-		mem_cgroup_flush_foreign(wb);
+free_running:
+		intv = dirty_poll_interval(dirty, thresh);
+		m_intv = ULONG_MAX;
+
+		current->dirty_paused_when = ctx->now;
+		current->nr_dirtied = 0;
+		if (mdtc)
+			m_intv = dirty_poll_interval(m_dirty, m_thresh);
+		current->nr_dirtied_pause = min(intv, m_intv);
+		return 0;
+	}
+
+	if (unlikely(!writeback_in_progress(wb)))
+		wb_start_background_writeback(wb);
 
+	mem_cgroup_flush_foreign(wb);
+
+	/*
+	 * Calculate global domain's pos_ratio and select the
+	 * global dtc by default.
+	 */
+	if (!strictlimit) {
+		wb_dirty_limits(gdtc);
+
+		if ((current->flags & PF_LOCAL_THROTTLE) &&
+		    gdtc->wb_dirty <
+		    dirty_freerun_ceiling(gdtc->wb_thresh,
+					  gdtc->wb_bg_thresh))
+			/*
+			 * LOCAL_THROTTLE tasks must not be throttled
+			 * when below the per-wb freerun ceiling.
+			 */
+			goto free_running;
+	}
+
+	ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+		((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+	wb_position_ratio(gdtc);
+	ctx->sdtc = gdtc;
+
+	if (mdtc) {
 		/*
-		 * Calculate global domain's pos_ratio and select the
-		 * global dtc by default.
+		 * If memcg domain is in effect, calculate its
+		 * pos_ratio.  @wb should satisfy constraints from
+		 * both global and memcg domains.  Choose the one
+		 * w/ lower pos_ratio.
 		 */
 		if (!strictlimit) {
-			wb_dirty_limits(gdtc);
+			wb_dirty_limits(mdtc);
 
 			if ((current->flags & PF_LOCAL_THROTTLE) &&
-			    gdtc->wb_dirty <
-			    dirty_freerun_ceiling(gdtc->wb_thresh,
-						  gdtc->wb_bg_thresh))
+			    mdtc->wb_dirty <
+			    dirty_freerun_ceiling(mdtc->wb_thresh,
+						  mdtc->wb_bg_thresh))
 				/*
-				 * LOCAL_THROTTLE tasks must not be throttled
-				 * when below the per-wb freerun ceiling.
+				 * LOCAL_THROTTLE tasks must not be
+				 * throttled when below the per-wb
+				 * freerun ceiling.
 				 */
 				goto free_running;
 		}
+		ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+			((mdtc->dirty > mdtc->thresh) || strictlimit);
 
-		dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
-			((gdtc->dirty > gdtc->thresh) || strictlimit);
+		wb_position_ratio(mdtc);
+		if (mdtc->pos_ratio < gdtc->pos_ratio)
+			ctx->sdtc = mdtc;
+	}
 
-		wb_position_ratio(gdtc);
-		sdtc = gdtc;
+	if (ctx->dirty_exceeded && !wb->dirty_exceeded)
+		wb->dirty_exceeded = 1;
 
-		if (mdtc) {
-			/*
-			 * If memcg domain is in effect, calculate its
-			 * pos_ratio.  @wb should satisfy constraints from
-			 * both global and memcg domains.  Choose the one
-			 * w/ lower pos_ratio.
-			 */
-			if (!strictlimit) {
-				wb_dirty_limits(mdtc);
-
-				if ((current->flags & PF_LOCAL_THROTTLE) &&
-				    mdtc->wb_dirty <
-				    dirty_freerun_ceiling(mdtc->wb_thresh,
-							  mdtc->wb_bg_thresh))
-					/*
-					 * LOCAL_THROTTLE tasks must not be
-					 * throttled when below the per-wb
-					 * freerun ceiling.
-					 */
-					goto free_running;
-			}
-			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
-				((mdtc->dirty > mdtc->thresh) || strictlimit);
+	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+				   BANDWIDTH_INTERVAL))
+		__wb_update_bandwidth(gdtc, mdtc, true);
+
+	/* throttle according to the chosen dtc */
+	ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
+	ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >>
+						RATELIMIT_CALC_SHIFT;
+	max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty);
+	min_pause = wb_min_pause(wb, max_pause,
+				 ctx->task_ratelimit, ctx->dirty_ratelimit,
+				 &ctx->nr_dirtied_pause);
+
+	if (unlikely(ctx->task_ratelimit == 0)) {
+		period = max_pause;
+		ctx->pause = max_pause;
+		goto pause;
+	}
+	period = HZ * pages_dirtied / ctx->task_ratelimit;
+	ctx->pause = period;
+	if (current->dirty_paused_when)
+		ctx->pause -= ctx->now - current->dirty_paused_when;
+	/*
+	 * For less than 1s think time (ext3/4 may block the dirtier
+	 * for up to 800ms from time to time on 1-HDD; so does xfs,
+	 * however at much less frequency), try to compensate it in
+	 * future periods by updating the virtual time; otherwise just
+	 * do a reset, as it may be a light dirtier.
+	 */
+	if (ctx->pause < min_pause) {
+		trace_balance_dirty_pages(wb,
+					  ctx->sdtc->thresh,
+					  ctx->sdtc->bg_thresh,
+					  ctx->sdtc->dirty,
+					  ctx->sdtc->wb_thresh,
+					  ctx->sdtc->wb_dirty,
+					  ctx->dirty_ratelimit,
+					  ctx->task_ratelimit,
+					  pages_dirtied,
+					  period,
+					  min(ctx->pause, 0L),
+					  ctx->start_time);
+		if (ctx->pause < -HZ) {
+			current->dirty_paused_when = ctx->now;
+			current->nr_dirtied = 0;
+		} else if (period) {
+			current->dirty_paused_when += period;
+			current->nr_dirtied = 0;
+		} else if (current->nr_dirtied_pause <= pages_dirtied)
+			current->nr_dirtied_pause += pages_dirtied;
+		return 0;
+	}
+	if (unlikely(ctx->pause > max_pause)) {
+		/* for occasional dropped task_ratelimit */
+		ctx->now += min(ctx->pause - max_pause, max_pause);
+		ctx->pause = max_pause;
+	}
 
-			wb_position_ratio(mdtc);
-			if (mdtc->pos_ratio < gdtc->pos_ratio)
-				sdtc = mdtc;
-		}
+pause:
+	trace_balance_dirty_pages(wb,
+				  ctx->sdtc->thresh,
+				  ctx->sdtc->bg_thresh,
+				  ctx->sdtc->dirty,
+				  ctx->sdtc->wb_thresh,
+				  ctx->sdtc->wb_dirty,
+				  ctx->dirty_ratelimit,
+				  ctx->task_ratelimit,
+				  pages_dirtied,
+				  period,
+				  ctx->pause,
+				  ctx->start_time);
+
+	return ctx->pause;
+}
 
-		if (dirty_exceeded && !wb->dirty_exceeded)
-			wb->dirty_exceeded = 1;
-
-		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
-					   BANDWIDTH_INTERVAL))
-			__wb_update_bandwidth(gdtc, mdtc, true);
-
-		/* throttle according to the chosen dtc */
-		dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
-		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
-							RATELIMIT_CALC_SHIFT;
-		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
-		min_pause = wb_min_pause(wb, max_pause,
-					 task_ratelimit, dirty_ratelimit,
-					 &nr_dirtied_pause);
-
-		if (unlikely(task_ratelimit == 0)) {
-			period = max_pause;
-			pause = max_pause;
-			goto pause;
-		}
-		period = HZ * pages_dirtied / task_ratelimit;
-		pause = period;
-		if (current->dirty_paused_when)
-			pause -= now - current->dirty_paused_when;
-		/*
-		 * For less than 1s think time (ext3/4 may block the dirtier
-		 * for up to 800ms from time to time on 1-HDD; so does xfs,
-		 * however at much less frequency), try to compensate it in
-		 * future periods by updating the virtual time; otherwise just
-		 * do a reset, as it may be a light dirtier.
-		 */
-		if (pause < min_pause) {
-			trace_balance_dirty_pages(wb,
-						  sdtc->thresh,
-						  sdtc->bg_thresh,
-						  sdtc->dirty,
-						  sdtc->wb_thresh,
-						  sdtc->wb_dirty,
-						  dirty_ratelimit,
-						  task_ratelimit,
-						  pages_dirtied,
-						  period,
-						  min(pause, 0L),
-						  start_time);
-			if (pause < -HZ) {
-				current->dirty_paused_when = now;
-				current->nr_dirtied = 0;
-			} else if (period) {
-				current->dirty_paused_when += period;
-				current->nr_dirtied = 0;
-			} else if (current->nr_dirtied_pause <= pages_dirtied)
-				current->nr_dirtied_pause += pages_dirtied;
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data.  It looks at the number of dirty pages in the machine and will force
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
+ */
+static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied)
+{
+	int ret;
+	struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) };
+
+	for (;;) {
+		ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx);
+		if (!ret)
 			break;
-		}
-		if (unlikely(pause > max_pause)) {
-			/* for occasional dropped task_ratelimit */
-			now += min(pause - max_pause, max_pause);
-			pause = max_pause;
-		}
 
-pause:
-		trace_balance_dirty_pages(wb,
-					  sdtc->thresh,
-					  sdtc->bg_thresh,
-					  sdtc->dirty,
-					  sdtc->wb_thresh,
-					  sdtc->wb_dirty,
-					  dirty_ratelimit,
-					  task_ratelimit,
-					  pages_dirtied,
-					  period,
-					  pause,
-					  start_time);
 		__set_current_state(TASK_KILLABLE);
-		wb->dirty_sleep = now;
-		io_schedule_timeout(pause);
+		wb->dirty_sleep = ctx.now;
+		io_schedule_timeout(ctx.pause);
 
-		current->dirty_paused_when = now + pause;
+		current->dirty_paused_when = ctx.now + ctx.pause;
 		current->nr_dirtied = 0;
-		current->nr_dirtied_pause = nr_dirtied_pause;
+		current->nr_dirtied_pause = ctx.nr_dirtied_pause;
 
 		/*
 		 * This is typically equal to (dirty < thresh) and can also
 		 * keep "1000+ dd on a slow USB stick" under control.
 		 */
-		if (task_ratelimit)
+		if (ctx.task_ratelimit)
 			break;
 
 		/*
@@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 		 * more page. However wb_dirty has accounting errors.  So use
 		 * the larger and more IO friendly wb_stat_error.
 		 */
-		if (sdtc->wb_dirty <= wb_stat_error())
+		if (ctx.sdtc->wb_dirty <= wb_stat_error())
 			break;
 
 		if (fatal_signal_pending(current))
 			break;
 	}
 
-	if (!dirty_exceeded && wb->dirty_exceeded)
+	if (!ctx.dirty_exceeded && wb->dirty_exceeded)
 		wb->dirty_exceeded = 0;
 
 	if (writeback_in_progress(wb))
@@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 	if (laptop_mode)
 		return;
 
-	if (nr_reclaimable > gdtc->bg_thresh)
+	if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh)
 		wb_start_background_writeback(wb);
 }
 
-- 
2.30.2





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux