On 5/18/22 4:07 AM, Jan Kara wrote: > On Mon 16-05-22 09:47:14, Stefan Roesch wrote: >> This factors out the for loop in balance_dirty_pages() into a new >> function called _balance_dirty_pages(). By factoring out this function >> the async write code can determine if it has to wait to throttle writes >> or not. The function _balance_dirty_pages() returns the sleep time. >> If the sleep time is greater 0, then the async write code needs to throttle. >> >> To maintain the context for consecutive calls of _balance_dirty_pages() >> and maintain the current behavior a new data structure called bdp_ctx >> has been introduced. >> >> Signed-off-by: Stefan Roesch <shr@xxxxxx> > > ... > >> --- >> mm/page-writeback.c | 452 +++++++++++++++++++++++--------------------- >> 1 file changed, 239 insertions(+), 213 deletions(-) >> >> diff --git a/mm/page-writeback.c b/mm/page-writeback.c >> index 7e2da284e427..cbb74c0666c6 100644 >> --- a/mm/page-writeback.c >> +++ b/mm/page-writeback.c >> @@ -140,6 +140,29 @@ struct dirty_throttle_control { >> unsigned long pos_ratio; >> }; >> >> +/* context for consecutive calls to _balance_dirty_pages() */ >> +struct bdp_ctx { >> + long pause; >> + unsigned long now; >> + unsigned long start_time; >> + unsigned long task_ratelimit; >> + unsigned long dirty_ratelimit; >> + unsigned long nr_reclaimable; >> + int nr_dirtied_pause; >> + bool dirty_exceeded; >> + >> + struct dirty_throttle_control gdtc_stor; >> + struct dirty_throttle_control mdtc_stor; >> + struct dirty_throttle_control *sdtc; >> +} bdp_ctx; > > Looking at how much context you propagate into _balance_dirty_pages() I > don't think this suggestion was as great as I've hoped. I'm sorry for that. > We could actually significantly reduce the amount of context passed in/out > but some things would be difficult to get rid of and some interactions of > code in _balance_dirty_pages() and the caller are actually pretty subtle. > > I think something like attached three patches should make things NOWAIT > support in balance_dirty_pages() reasonably readable. > Thanks a lot Jan for the three patches. I integrated them in the patch series and changed the callers accordingly. > Honza > >> + >> +/* initialize _balance_dirty_pages() context */ >> +#define BDP_CTX_INIT(ctx, wb) \ >> + .gdtc_stor = { GDTC_INIT(wb) }, \ >> + .mdtc_stor = { MDTC_INIT(wb, &ctx.gdtc_stor) }, \ >> + .start_time = jiffies, \ >> + .dirty_exceeded = false >> + >> /* >> * Length of period for aging writeout fractions of bdis. This is an >> * arbitrarily chosen number. The longer the period, the slower fractions will >> @@ -1538,261 +1561,264 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) >> } >> } >> >> -/* >> - * balance_dirty_pages() must be called by processes which are generating dirty >> - * data. It looks at the number of dirty pages in the machine and will force >> - * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. >> - * If we're over `background_thresh' then the writeback threads are woken to >> - * perform some writeout. >> - */ >> -static void balance_dirty_pages(struct bdi_writeback *wb, >> - unsigned long pages_dirtied) >> +static inline int _balance_dirty_pages(struct bdi_writeback *wb, >> + unsigned long pages_dirtied, struct bdp_ctx *ctx) >> { >> - struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; >> - struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; >> - struct dirty_throttle_control * const gdtc = &gdtc_stor; >> - struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? >> - &mdtc_stor : NULL; >> - struct dirty_throttle_control *sdtc; >> - unsigned long nr_reclaimable; /* = file_dirty */ >> + struct dirty_throttle_control * const gdtc = &ctx->gdtc_stor; >> + struct dirty_throttle_control * const mdtc = mdtc_valid(&ctx->mdtc_stor) ? >> + &ctx->mdtc_stor : NULL; >> long period; >> - long pause; >> long max_pause; >> long min_pause; >> - int nr_dirtied_pause; >> - bool dirty_exceeded = false; >> - unsigned long task_ratelimit; >> - unsigned long dirty_ratelimit; >> struct backing_dev_info *bdi = wb->bdi; >> bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; >> - unsigned long start_time = jiffies; >> >> - for (;;) { >> - unsigned long now = jiffies; >> - unsigned long dirty, thresh, bg_thresh; >> - unsigned long m_dirty = 0; /* stop bogus uninit warnings */ >> - unsigned long m_thresh = 0; >> - unsigned long m_bg_thresh = 0; >> - >> - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); >> - gdtc->avail = global_dirtyable_memory(); >> - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); >> + unsigned long dirty, thresh, bg_thresh; >> + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ >> + unsigned long m_thresh = 0; >> + unsigned long m_bg_thresh = 0; >> >> - domain_dirty_limits(gdtc); >> + ctx->now = jiffies; >> + ctx->nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); >> + gdtc->avail = global_dirtyable_memory(); >> + gdtc->dirty = ctx->nr_reclaimable + global_node_page_state(NR_WRITEBACK); >> >> - if (unlikely(strictlimit)) { >> - wb_dirty_limits(gdtc); >> + domain_dirty_limits(gdtc); >> >> - dirty = gdtc->wb_dirty; >> - thresh = gdtc->wb_thresh; >> - bg_thresh = gdtc->wb_bg_thresh; >> - } else { >> - dirty = gdtc->dirty; >> - thresh = gdtc->thresh; >> - bg_thresh = gdtc->bg_thresh; >> - } >> + if (unlikely(strictlimit)) { >> + wb_dirty_limits(gdtc); >> >> - if (mdtc) { >> - unsigned long filepages, headroom, writeback; >> + dirty = gdtc->wb_dirty; >> + thresh = gdtc->wb_thresh; >> + bg_thresh = gdtc->wb_bg_thresh; >> + } else { >> + dirty = gdtc->dirty; >> + thresh = gdtc->thresh; >> + bg_thresh = gdtc->bg_thresh; >> + } >> >> - /* >> - * If @wb belongs to !root memcg, repeat the same >> - * basic calculations for the memcg domain. >> - */ >> - mem_cgroup_wb_stats(wb, &filepages, &headroom, >> - &mdtc->dirty, &writeback); >> - mdtc->dirty += writeback; >> - mdtc_calc_avail(mdtc, filepages, headroom); >> - >> - domain_dirty_limits(mdtc); >> - >> - if (unlikely(strictlimit)) { >> - wb_dirty_limits(mdtc); >> - m_dirty = mdtc->wb_dirty; >> - m_thresh = mdtc->wb_thresh; >> - m_bg_thresh = mdtc->wb_bg_thresh; >> - } else { >> - m_dirty = mdtc->dirty; >> - m_thresh = mdtc->thresh; >> - m_bg_thresh = mdtc->bg_thresh; >> - } >> - } >> + if (mdtc) { >> + unsigned long filepages, headroom, writeback; >> >> /* >> - * Throttle it only when the background writeback cannot >> - * catch-up. This avoids (excessively) small writeouts >> - * when the wb limits are ramping up in case of !strictlimit. >> - * >> - * In strictlimit case make decision based on the wb counters >> - * and limits. Small writeouts when the wb limits are ramping >> - * up are the price we consciously pay for strictlimit-ing. >> - * >> - * If memcg domain is in effect, @dirty should be under >> - * both global and memcg freerun ceilings. >> + * If @wb belongs to !root memcg, repeat the same >> + * basic calculations for the memcg domain. >> */ >> - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && >> - (!mdtc || >> - m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { >> - unsigned long intv; >> - unsigned long m_intv; >> + mem_cgroup_wb_stats(wb, &filepages, &headroom, >> + &mdtc->dirty, &writeback); >> + mdtc->dirty += writeback; >> + mdtc_calc_avail(mdtc, filepages, headroom); >> >> -free_running: >> - intv = dirty_poll_interval(dirty, thresh); >> - m_intv = ULONG_MAX; >> + domain_dirty_limits(mdtc); >> >> - current->dirty_paused_when = now; >> - current->nr_dirtied = 0; >> - if (mdtc) >> - m_intv = dirty_poll_interval(m_dirty, m_thresh); >> - current->nr_dirtied_pause = min(intv, m_intv); >> - break; >> + if (unlikely(strictlimit)) { >> + wb_dirty_limits(mdtc); >> + m_dirty = mdtc->wb_dirty; >> + m_thresh = mdtc->wb_thresh; >> + m_bg_thresh = mdtc->wb_bg_thresh; >> + } else { >> + m_dirty = mdtc->dirty; >> + m_thresh = mdtc->thresh; >> + m_bg_thresh = mdtc->bg_thresh; >> } >> + } >> >> - if (unlikely(!writeback_in_progress(wb))) >> - wb_start_background_writeback(wb); >> + /* >> + * Throttle it only when the background writeback cannot >> + * catch-up. This avoids (excessively) small writeouts >> + * when the wb limits are ramping up in case of !strictlimit. >> + * >> + * In strictlimit case make decision based on the wb counters >> + * and limits. Small writeouts when the wb limits are ramping >> + * up are the price we consciously pay for strictlimit-ing. >> + * >> + * If memcg domain is in effect, @dirty should be under >> + * both global and memcg freerun ceilings. >> + */ >> + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && >> + (!mdtc || >> + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { >> + unsigned long intv; >> + unsigned long m_intv; >> >> - mem_cgroup_flush_foreign(wb); >> +free_running: >> + intv = dirty_poll_interval(dirty, thresh); >> + m_intv = ULONG_MAX; >> + >> + current->dirty_paused_when = ctx->now; >> + current->nr_dirtied = 0; >> + if (mdtc) >> + m_intv = dirty_poll_interval(m_dirty, m_thresh); >> + current->nr_dirtied_pause = min(intv, m_intv); >> + return 0; >> + } >> + >> + if (unlikely(!writeback_in_progress(wb))) >> + wb_start_background_writeback(wb); >> >> + mem_cgroup_flush_foreign(wb); >> + >> + /* >> + * Calculate global domain's pos_ratio and select the >> + * global dtc by default. >> + */ >> + if (!strictlimit) { >> + wb_dirty_limits(gdtc); >> + >> + if ((current->flags & PF_LOCAL_THROTTLE) && >> + gdtc->wb_dirty < >> + dirty_freerun_ceiling(gdtc->wb_thresh, >> + gdtc->wb_bg_thresh)) >> + /* >> + * LOCAL_THROTTLE tasks must not be throttled >> + * when below the per-wb freerun ceiling. >> + */ >> + goto free_running; >> + } >> + >> + ctx->dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && >> + ((gdtc->dirty > gdtc->thresh) || strictlimit); >> + >> + wb_position_ratio(gdtc); >> + ctx->sdtc = gdtc; >> + >> + if (mdtc) { >> /* >> - * Calculate global domain's pos_ratio and select the >> - * global dtc by default. >> + * If memcg domain is in effect, calculate its >> + * pos_ratio. @wb should satisfy constraints from >> + * both global and memcg domains. Choose the one >> + * w/ lower pos_ratio. >> */ >> if (!strictlimit) { >> - wb_dirty_limits(gdtc); >> + wb_dirty_limits(mdtc); >> >> if ((current->flags & PF_LOCAL_THROTTLE) && >> - gdtc->wb_dirty < >> - dirty_freerun_ceiling(gdtc->wb_thresh, >> - gdtc->wb_bg_thresh)) >> + mdtc->wb_dirty < >> + dirty_freerun_ceiling(mdtc->wb_thresh, >> + mdtc->wb_bg_thresh)) >> /* >> - * LOCAL_THROTTLE tasks must not be throttled >> - * when below the per-wb freerun ceiling. >> + * LOCAL_THROTTLE tasks must not be >> + * throttled when below the per-wb >> + * freerun ceiling. >> */ >> goto free_running; >> } >> + ctx->dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && >> + ((mdtc->dirty > mdtc->thresh) || strictlimit); >> >> - dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && >> - ((gdtc->dirty > gdtc->thresh) || strictlimit); >> + wb_position_ratio(mdtc); >> + if (mdtc->pos_ratio < gdtc->pos_ratio) >> + ctx->sdtc = mdtc; >> + } >> >> - wb_position_ratio(gdtc); >> - sdtc = gdtc; >> + if (ctx->dirty_exceeded && !wb->dirty_exceeded) >> + wb->dirty_exceeded = 1; >> >> - if (mdtc) { >> - /* >> - * If memcg domain is in effect, calculate its >> - * pos_ratio. @wb should satisfy constraints from >> - * both global and memcg domains. Choose the one >> - * w/ lower pos_ratio. >> - */ >> - if (!strictlimit) { >> - wb_dirty_limits(mdtc); >> - >> - if ((current->flags & PF_LOCAL_THROTTLE) && >> - mdtc->wb_dirty < >> - dirty_freerun_ceiling(mdtc->wb_thresh, >> - mdtc->wb_bg_thresh)) >> - /* >> - * LOCAL_THROTTLE tasks must not be >> - * throttled when below the per-wb >> - * freerun ceiling. >> - */ >> - goto free_running; >> - } >> - dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && >> - ((mdtc->dirty > mdtc->thresh) || strictlimit); >> + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + >> + BANDWIDTH_INTERVAL)) >> + __wb_update_bandwidth(gdtc, mdtc, true); >> + >> + /* throttle according to the chosen dtc */ >> + ctx->dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); >> + ctx->task_ratelimit = ((u64)ctx->dirty_ratelimit * ctx->sdtc->pos_ratio) >> >> + RATELIMIT_CALC_SHIFT; >> + max_pause = wb_max_pause(wb, ctx->sdtc->wb_dirty); >> + min_pause = wb_min_pause(wb, max_pause, >> + ctx->task_ratelimit, ctx->dirty_ratelimit, >> + &ctx->nr_dirtied_pause); >> + >> + if (unlikely(ctx->task_ratelimit == 0)) { >> + period = max_pause; >> + ctx->pause = max_pause; >> + goto pause; >> + } >> + period = HZ * pages_dirtied / ctx->task_ratelimit; >> + ctx->pause = period; >> + if (current->dirty_paused_when) >> + ctx->pause -= ctx->now - current->dirty_paused_when; >> + /* >> + * For less than 1s think time (ext3/4 may block the dirtier >> + * for up to 800ms from time to time on 1-HDD; so does xfs, >> + * however at much less frequency), try to compensate it in >> + * future periods by updating the virtual time; otherwise just >> + * do a reset, as it may be a light dirtier. >> + */ >> + if (ctx->pause < min_pause) { >> + trace_balance_dirty_pages(wb, >> + ctx->sdtc->thresh, >> + ctx->sdtc->bg_thresh, >> + ctx->sdtc->dirty, >> + ctx->sdtc->wb_thresh, >> + ctx->sdtc->wb_dirty, >> + ctx->dirty_ratelimit, >> + ctx->task_ratelimit, >> + pages_dirtied, >> + period, >> + min(ctx->pause, 0L), >> + ctx->start_time); >> + if (ctx->pause < -HZ) { >> + current->dirty_paused_when = ctx->now; >> + current->nr_dirtied = 0; >> + } else if (period) { >> + current->dirty_paused_when += period; >> + current->nr_dirtied = 0; >> + } else if (current->nr_dirtied_pause <= pages_dirtied) >> + current->nr_dirtied_pause += pages_dirtied; >> + return 0; >> + } >> + if (unlikely(ctx->pause > max_pause)) { >> + /* for occasional dropped task_ratelimit */ >> + ctx->now += min(ctx->pause - max_pause, max_pause); >> + ctx->pause = max_pause; >> + } >> >> - wb_position_ratio(mdtc); >> - if (mdtc->pos_ratio < gdtc->pos_ratio) >> - sdtc = mdtc; >> - } >> +pause: >> + trace_balance_dirty_pages(wb, >> + ctx->sdtc->thresh, >> + ctx->sdtc->bg_thresh, >> + ctx->sdtc->dirty, >> + ctx->sdtc->wb_thresh, >> + ctx->sdtc->wb_dirty, >> + ctx->dirty_ratelimit, >> + ctx->task_ratelimit, >> + pages_dirtied, >> + period, >> + ctx->pause, >> + ctx->start_time); >> + >> + return ctx->pause; >> +} >> >> - if (dirty_exceeded && !wb->dirty_exceeded) >> - wb->dirty_exceeded = 1; >> - >> - if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + >> - BANDWIDTH_INTERVAL)) >> - __wb_update_bandwidth(gdtc, mdtc, true); >> - >> - /* throttle according to the chosen dtc */ >> - dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); >> - task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> >> - RATELIMIT_CALC_SHIFT; >> - max_pause = wb_max_pause(wb, sdtc->wb_dirty); >> - min_pause = wb_min_pause(wb, max_pause, >> - task_ratelimit, dirty_ratelimit, >> - &nr_dirtied_pause); >> - >> - if (unlikely(task_ratelimit == 0)) { >> - period = max_pause; >> - pause = max_pause; >> - goto pause; >> - } >> - period = HZ * pages_dirtied / task_ratelimit; >> - pause = period; >> - if (current->dirty_paused_when) >> - pause -= now - current->dirty_paused_when; >> - /* >> - * For less than 1s think time (ext3/4 may block the dirtier >> - * for up to 800ms from time to time on 1-HDD; so does xfs, >> - * however at much less frequency), try to compensate it in >> - * future periods by updating the virtual time; otherwise just >> - * do a reset, as it may be a light dirtier. >> - */ >> - if (pause < min_pause) { >> - trace_balance_dirty_pages(wb, >> - sdtc->thresh, >> - sdtc->bg_thresh, >> - sdtc->dirty, >> - sdtc->wb_thresh, >> - sdtc->wb_dirty, >> - dirty_ratelimit, >> - task_ratelimit, >> - pages_dirtied, >> - period, >> - min(pause, 0L), >> - start_time); >> - if (pause < -HZ) { >> - current->dirty_paused_when = now; >> - current->nr_dirtied = 0; >> - } else if (period) { >> - current->dirty_paused_when += period; >> - current->nr_dirtied = 0; >> - } else if (current->nr_dirtied_pause <= pages_dirtied) >> - current->nr_dirtied_pause += pages_dirtied; >> +/* >> + * balance_dirty_pages() must be called by processes which are generating dirty >> + * data. It looks at the number of dirty pages in the machine and will force >> + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. >> + * If we're over `background_thresh' then the writeback threads are woken to >> + * perform some writeout. >> + */ >> +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) >> +{ >> + int ret; >> + struct bdp_ctx ctx = { BDP_CTX_INIT(ctx, wb) }; >> + >> + for (;;) { >> + ret = _balance_dirty_pages(wb, current->nr_dirtied, &ctx); >> + if (!ret) >> break; >> - } >> - if (unlikely(pause > max_pause)) { >> - /* for occasional dropped task_ratelimit */ >> - now += min(pause - max_pause, max_pause); >> - pause = max_pause; >> - } >> >> -pause: >> - trace_balance_dirty_pages(wb, >> - sdtc->thresh, >> - sdtc->bg_thresh, >> - sdtc->dirty, >> - sdtc->wb_thresh, >> - sdtc->wb_dirty, >> - dirty_ratelimit, >> - task_ratelimit, >> - pages_dirtied, >> - period, >> - pause, >> - start_time); >> __set_current_state(TASK_KILLABLE); >> - wb->dirty_sleep = now; >> - io_schedule_timeout(pause); >> + wb->dirty_sleep = ctx.now; >> + io_schedule_timeout(ctx.pause); >> >> - current->dirty_paused_when = now + pause; >> + current->dirty_paused_when = ctx.now + ctx.pause; >> current->nr_dirtied = 0; >> - current->nr_dirtied_pause = nr_dirtied_pause; >> + current->nr_dirtied_pause = ctx.nr_dirtied_pause; >> >> /* >> * This is typically equal to (dirty < thresh) and can also >> * keep "1000+ dd on a slow USB stick" under control. >> */ >> - if (task_ratelimit) >> + if (ctx.task_ratelimit) >> break; >> >> /* >> @@ -1805,14 +1831,14 @@ static void balance_dirty_pages(struct bdi_writeback *wb, >> * more page. However wb_dirty has accounting errors. So use >> * the larger and more IO friendly wb_stat_error. >> */ >> - if (sdtc->wb_dirty <= wb_stat_error()) >> + if (ctx.sdtc->wb_dirty <= wb_stat_error()) >> break; >> >> if (fatal_signal_pending(current)) >> break; >> } >> >> - if (!dirty_exceeded && wb->dirty_exceeded) >> + if (!ctx.dirty_exceeded && wb->dirty_exceeded) >> wb->dirty_exceeded = 0; >> >> if (writeback_in_progress(wb)) >> @@ -1829,7 +1855,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, >> if (laptop_mode) >> return; >> >> - if (nr_reclaimable > gdtc->bg_thresh) >> + if (ctx.nr_reclaimable > ctx.gdtc_stor.bg_thresh) >> wb_start_background_writeback(wb); >> } >> >> -- >> 2.30.2 >>