This factors out the for loop in balance_dirty_pages() into a new function called _balance_dirty_pages(). By factoring out this function the async write code can determine if it has to wait to throttle writes or not. The function _balance_dirty_pages() returns the sleep time. If the sleep time is greater 0, then the async write code needs to throttle. To maintain the context for consecutive calls of _balance_dirty_pages() and maintain the current behavior a new data structure called bdp_ctx has been introduced. Signed-off-by: Stefan Roesch <shr@xxxxxx> --- mm/page-writeback.c | 452 +++++++++++++++++++++++--------------------- 1 file changed, 239 insertions(+), 213 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e2da284e427..cbb74c0666c6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -140,6 +140,29 @@ struct dirty_throttle_control { unsigned long pos_ratio; }; +/* context for consecutive calls to _balance_dirty_pages() */ +struct bdp_ctx { + long pause; + unsigned long now; + unsigned long start_time; + unsigned long task_ratelimit; + unsigned long dirty_ratelimit; + unsigned long nr_reclaimable; + int nr_dirtied_pause; + bool dirty_exceeded; + + struct dirty_throttle_control gdtc_stor; + struct dirty_throttle_control mdtc_stor; + struct dirty_throttle_control *sdtc; +} bdp_ctx; + +/* initialize _balance_dirty_pages() context */ +#define BDP_CTX_INIT(ctx, wb) \ + .gdtc_stor = { GDTC_INIT(wb) }, \ + .mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) }, \ + .start_time = jiffies, \ + .dirty_exceeded = false + /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will @@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) } } -/* - * balance_dirty_pages() must be called by processes which are generating dirty - * data. It looks at the number of dirty pages in the machine and will force - * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. - * If we're over `background_thresh' then the writeback threads are woken to - * perform some writeout. - */ -static void balance_dirty_pages(struct bdi_writeback *wb, - unsigned long pages_dirtied) +static inline int _balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, struct bdp_ctx *ctx) { - struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; - struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; - struct dirty_throttle_control * const gdtc = &gdtc_stor; - struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? - &mdtc_stor : NULL; - struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty */ + struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ? + &ctx->mdtc_stor : NULL; long period; - long pause; long max_pause; long min_pause; - int nr_dirtied_pause; - bool dirty_exceeded = false; - unsigned long task_ratelimit; - unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; - unsigned long start_time = jiffies; - for (;;) { - unsigned long now = jiffies; - unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty = 0; /* stop bogus uninit warnings */ - unsigned long m_thresh = 0; - unsigned long m_bg_thresh = 0; - - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); + unsigned long dirty, thresh, bg_thresh; + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ + unsigned long m_thresh = 0; + unsigned long m_bg_thresh = 0; - domain_dirty_limits(gdtc); + ctx->now = jiffies; + ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK); - if (unlikely(strictlimit)) { - wb_dirty_limits(gdtc); + domain_dirty_limits(gdtc); - dirty = gdtc->wb_dirty; - thresh = gdtc->wb_thresh; - bg_thresh = gdtc->wb_bg_thresh; - } else { - dirty = gdtc->dirty; - thresh = gdtc->thresh; - bg_thresh = gdtc->bg_thresh; - } + if (unlikely(strictlimit)) { + wb_dirty_limits(gdtc); - if (mdtc) { - unsigned long filepages, headroom, writeback; + dirty = gdtc->wb_dirty; + thresh = gdtc->wb_thresh; + bg_thresh = gdtc->wb_bg_thresh; + } else { + dirty = gdtc->dirty; + thresh = gdtc->thresh; + bg_thresh = gdtc->bg_thresh; + } - /* - * If @wb belongs to !root memcg, repeat the same - * basic calculations for the memcg domain. - */ - mem_cgroup_wb_stats(wb, &filepages, &headroom, - &mdtc->dirty, &writeback); - mdtc->dirty += writeback; - mdtc_calc_avail(mdtc, filepages, headroom); - - domain_dirty_limits(mdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(mdtc); - m_dirty = mdtc->wb_dirty; - m_thresh = mdtc->wb_thresh; - m_bg_thresh = mdtc->wb_bg_thresh; - } else { - m_dirty = mdtc->dirty; - m_thresh = mdtc->thresh; - m_bg_thresh = mdtc->bg_thresh; - } - } + if (mdtc) { + unsigned long filepages, headroom, writeback; /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the wb limits are ramping up in case of !strictlimit. - * - * In strictlimit case make decision based on the wb counters - * and limits. Small writeouts when the wb limits are ramping - * up are the price we consciously pay for strictlimit-ing. - * - * If memcg domain is in effect, @dirty should be under - * both global and memcg freerun ceilings. + * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. */ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && - (!mdtc || - m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv; - unsigned long m_intv; + mem_cgroup_wb_stats(wb, &filepages, &headroom, + &mdtc->dirty, &writeback); + mdtc->dirty += writeback; + mdtc_calc_avail(mdtc, filepages, headroom); -free_running: - intv = dirty_poll_interval(dirty, thresh); - m_intv = ULONG_MAX; + domain_dirty_limits(mdtc); - current->dirty_paused_when = now; - current->nr_dirtied = 0; - if (mdtc) - m_intv = dirty_poll_interval(m_dirty, m_thresh); - current->nr_dirtied_pause = min(intv, m_intv); - break; + if (unlikely(strictlimit)) { + wb_dirty_limits(mdtc); + m_dirty = mdtc->wb_dirty; + m_thresh = mdtc->wb_thresh; + m_bg_thresh = mdtc->wb_bg_thresh; + } else { + m_dirty = mdtc->dirty; + m_thresh = mdtc->thresh; + m_bg_thresh = mdtc->bg_thresh; } + } - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the wb limits are ramping up in case of !strictlimit. + * + * In strictlimit case make decision based on the wb counters + * and limits. Small writeouts when the wb limits are ramping + * up are the price we consciously pay for strictlimit-ing. + * + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. + */ + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && + (!mdtc || + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + unsigned long intv; + unsigned long m_intv; - mem_cgroup_flush_foreign(wb); +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; + + current->dirty_paused_when = ctx->now; + current->nr_dirtied = 0; + if (mdtc) + m_intv = dirty_poll_interval(m_dirty, m_thresh); + current->nr_dirtied_pause = min(intv, m_intv); + return 0; + } + + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); + + /* + * Calculate global domain's pos_ratio and select the + * global dtc by default. + */ + if (!strictlimit) { + wb_dirty_limits(gdtc); + + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + + ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && + ((gdtc->dirty > gdtc->thresh) || strictlimit); + + wb_position_ratio(gdtc); + ctx->sdtc = gdtc; + + if (mdtc) { /* - * Calculate global domain's pos_ratio and select the - * global dtc by default. + * If memcg domain is in effect, calculate its + * pos_ratio. @wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. */ if (!strictlimit) { - wb_dirty_limits(gdtc); + wb_dirty_limits(mdtc); if ((current->flags & PF_LOCAL_THROTTLE) && - gdtc->wb_dirty < - dirty_freerun_ceiling(gdtc->wb_thresh, - gdtc->wb_bg_thresh)) + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) /* - * LOCAL_THROTTLE tasks must not be throttled - * when below the per-wb freerun ceiling. + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. */ goto free_running; } + ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && + ((mdtc->dirty > mdtc->thresh) || strictlimit); - dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && - ((gdtc->dirty > gdtc->thresh) || strictlimit); + wb_position_ratio(mdtc); + if (mdtc->pos_ratio < gdtc->pos_ratio) + ctx->sdtc = mdtc; + } - wb_position_ratio(gdtc); - sdtc = gdtc; + if (ctx->dirty_exceeded && !wb->dirty_exceeded) + wb->dirty_exceeded = 1; - if (mdtc) { - /* - * If memcg domain is in effect, calculate its - * pos_ratio. @wb should satisfy constraints from - * both global and memcg domains. Choose the one - * w/ lower pos_ratio. - */ - if (!strictlimit) { - wb_dirty_limits(mdtc); - - if ((current->flags & PF_LOCAL_THROTTLE) && - mdtc->wb_dirty < - dirty_freerun_ceiling(mdtc->wb_thresh, - mdtc->wb_bg_thresh)) - /* - * LOCAL_THROTTLE tasks must not be - * throttled when below the per-wb - * freerun ceiling. - */ - goto free_running; - } - dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && - ((mdtc->dirty > mdtc->thresh) || strictlimit); + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + __wb_update_bandwidth(gdtc, mdtc, true); + + /* throttle according to the chosen dtc */ + ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); + ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >> + RATELIMIT_CALC_SHIFT; + max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + ctx->task_ratelimit, ctx->dirty_ratelimit, + &ctx->nr_dirtied_pause); + + if (unlikely(ctx->task_ratelimit == 0)) { + period = max_pause; + ctx->pause = max_pause; + goto pause; + } + period = HZ * pages_dirtied / ctx->task_ratelimit; + ctx->pause = period; + if (current->dirty_paused_when) + ctx->pause -= ctx->now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. + */ + if (ctx->pause < min_pause) { + trace_balance_dirty_pages(wb, + ctx->sdtc->thresh, + ctx->sdtc->bg_thresh, + ctx->sdtc->dirty, + ctx->sdtc->wb_thresh, + ctx->sdtc->wb_dirty, + ctx->dirty_ratelimit, + ctx->task_ratelimit, + pages_dirtied, + period, + min(ctx->pause, 0L), + ctx->start_time); + if (ctx->pause < -HZ) { + current->dirty_paused_when = ctx->now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } else if (current->nr_dirtied_pause <= pages_dirtied) + current->nr_dirtied_pause += pages_dirtied; + return 0; + } + if (unlikely(ctx->pause > max_pause)) { + /* for occasional dropped task_ratelimit */ + ctx->now += min(ctx->pause - max_pause, max_pause); + ctx->pause = max_pause; + } - wb_position_ratio(mdtc); - if (mdtc->pos_ratio < gdtc->pos_ratio) - sdtc = mdtc; - } +pause: + trace_balance_dirty_pages(wb, + ctx->sdtc->thresh, + ctx->sdtc->bg_thresh, + ctx->sdtc->dirty, + ctx->sdtc->wb_thresh, + ctx->sdtc->wb_dirty, + ctx->dirty_ratelimit, + ctx->task_ratelimit, + pages_dirtied, + period, + ctx->pause, + ctx->start_time); + + return ctx->pause; +} - if (dirty_exceeded && !wb->dirty_exceeded) - wb->dirty_exceeded = 1; - - if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + - BANDWIDTH_INTERVAL)) - __wb_update_bandwidth(gdtc, mdtc, true); - - /* throttle according to the chosen dtc */ - dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); - task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> - RATELIMIT_CALC_SHIFT; - max_pause = wb_max_pause(wb, sdtc->wb_dirty); - min_pause = wb_min_pause(wb, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); - - if (unlikely(task_ratelimit == 0)) { - period = max_pause; - pause = max_pause; - goto pause; - } - period = HZ * pages_dirtied / task_ratelimit; - pause = period; - if (current->dirty_paused_when) - pause -= now - current->dirty_paused_when; - /* - * For less than 1s think time (ext3/4 may block the dirtier - * for up to 800ms from time to time on 1-HDD; so does xfs, - * however at much less frequency), try to compensate it in - * future periods by updating the virtual time; otherwise just - * do a reset, as it may be a light dirtier. - */ - if (pause < min_pause) { - trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, - dirty_ratelimit, - task_ratelimit, - pages_dirtied, - period, - min(pause, 0L), - start_time); - if (pause < -HZ) { - current->dirty_paused_when = now; - current->nr_dirtied = 0; - } else if (period) { - current->dirty_paused_when += period; - current->nr_dirtied = 0; - } else if (current->nr_dirtied_pause <= pages_dirtied) - current->nr_dirtied_pause += pages_dirtied; +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. + */ +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) +{ + int ret; + struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) }; + + for (;;) { + ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx); + if (!ret) break; - } - if (unlikely(pause > max_pause)) { - /* for occasional dropped task_ratelimit */ - now += min(pause - max_pause, max_pause); - pause = max_pause; - } -pause: - trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, - dirty_ratelimit, - task_ratelimit, - pages_dirtied, - period, - pause, - start_time); __set_current_state(TASK_KILLABLE); - wb->dirty_sleep = now; - io_schedule_timeout(pause); + wb->dirty_sleep = ctx.now; + io_schedule_timeout(ctx.pause); - current->dirty_paused_when = now + pause; + current->dirty_paused_when = ctx.now + ctx.pause; current->nr_dirtied = 0; - current->nr_dirtied_pause = nr_dirtied_pause; + current->nr_dirtied_pause = ctx.nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ - if (task_ratelimit) + if (ctx.task_ratelimit) break; /* @@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (sdtc->wb_dirty <= wb_stat_error()) + if (ctx.sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } - if (!dirty_exceeded && wb->dirty_exceeded) + if (!ctx.dirty_exceeded && wb->dirty_exceeded) wb->dirty_exceeded = 0; if (writeback_in_progress(wb)) @@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (laptop_mode) return; - if (nr_reclaimable > gdtc->bg_thresh) + if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh) wb_start_background_writeback(wb); } -- 2.30.2