On Mon, 5 Jul 2021 18:23:17 +0200 Jan Kara wrote:
>
>Michael Stapelberg has reported that for a workload with short big spikes
>of writes (the GCC linker seems to trigger this frequently) the write
>throughput is heavily underestimated and tends to steadily sink until it
>reaches zero. This has a rather bad impact on writeback throttling
>(causing stalls). The problem is that the writeback throughput estimate
>gets updated at most once per 200 ms. One update happens early after we
>submit pages for writeback (at that point writeout of only a small
>fraction of the pages has completed and thus the observed throughput is
>tiny). The next update happens only during the next write spike (updates
>happen only from inode writeback and dirty throttling code) and if that
>is more than 1s after the previous spike, we decide the system was idle
>and just ignore whatever was written until this moment.
>
>Fix the problem by making sure the writeback throughput estimate is also
>updated shortly after writeback completes, to get a reasonable estimate
>of throughput for spiky workloads.
>
>Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@xxxxxxxxxx
>Reported-by: Michael Stapelberg <stapelberg+linux@xxxxxxxxxx>
>Signed-off-by: Jan Kara <jack@xxxxxxx>
>---
> include/linux/backing-dev-defs.h |  1 +
> include/linux/writeback.h        |  1 +
> mm/backing-dev.c                 | 10 ++++++++++
> mm/page-writeback.c              | 32 ++++++++++++++++----------------
> 4 files changed, 28 insertions(+), 16 deletions(-)
>
>diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
>index 148d889f2f7f..57395f7bb192 100644
>--- a/include/linux/backing-dev-defs.h
>+++ b/include/linux/backing-dev-defs.h
>@@ -143,6 +143,7 @@ struct bdi_writeback {
> 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
> 	struct list_head work_list;
> 	struct delayed_work dwork;	/* work item used for writeback */
>+	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
>
> 	unsigned long dirty_sleep;	/* last wait */
>
>diff --git a/include/linux/writeback.h b/include/linux/writeback.h
>index 47cd732e012e..a45e09ed0711 100644
>--- a/include/linux/writeback.h
>+++ b/include/linux/writeback.h
>@@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
> unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
>
>+void wb_update_bandwidth(struct bdi_writeback *wb);
> void balance_dirty_pages_ratelimited(struct address_space *mapping);
> bool wb_over_bg_thresh(struct bdi_writeback *wb);
>
>diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>index 342394ef1e02..9baa59d68110 100644
>--- a/mm/backing-dev.c
>+++ b/mm/backing-dev.c
>@@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
> 	spin_unlock_bh(&wb->work_lock);
> }
>
>+static void wb_update_bandwidth_workfn(struct work_struct *work)
>+{
>+	struct bdi_writeback *wb = container_of(to_delayed_work(work),
>+						struct bdi_writeback, bw_dwork);
>+
>+	wb_update_bandwidth(wb);
>+}
>+
> /*
>  * Initial write bandwidth: 100 MB/s
>  */
>@@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
> 	spin_lock_init(&wb->work_lock);
> 	INIT_LIST_HEAD(&wb->work_list);
> 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
>+	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
> 	wb->dirty_sleep = jiffies;
>
> 	err = fprop_local_init_percpu(&wb->completions, gfp);
>@@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
> 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
> 	flush_delayed_work(&wb->dwork);
> 	WARN_ON(!list_empty(&wb->work_list));
>+	flush_delayed_work(&wb->bw_dwork);
> }
>
> static void wb_exit(struct bdi_writeback *wb)
>diff --git a/mm/page-writeback.c b/mm/page-writeback.c
>index 1fecf8ebadb0..6a99ddca95c0 100644
>--- a/mm/page-writeback.c
>+++ b/mm/page-writeback.c
>@@ -1346,14 +1346,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
> 	unsigned long dirtied;
> 	unsigned long written;
>
>-	lockdep_assert_held(&wb->list_lock);
>-
>-	/*
>-	 * rate-limit, only update once every 200ms.
>-	 */
>-	if (elapsed < BANDWIDTH_INTERVAL)
>-		return;

Please leave this check as it is if you are not dropping the 200ms rule.
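
If the rule is staying, the check can be kept while still pulling the
locking into __wb_update_bandwidth(), something like the sketch below
(untested; the unlocked read of wb->bw_time_stamp is only an optimistic
check, the same pattern balance_dirty_pages() already uses):

static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  bool update_ratelimit)
{
	struct bdi_writeback *wb = gdtc->wb;
	unsigned long now = jiffies;
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	/*
	 * Keep the rate limit: update at most once every 200ms. Reading
	 * bw_time_stamp without the lock is only an optimistic check;
	 * a racing updater at worst makes us update slightly early.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	spin_lock(&wb->list_lock);
	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

	/* ... bandwidth and ratelimit updates as in the patch ... */

	wb->dirtied_stamp = dirtied;
	wb->written_stamp = written;
	wb->bw_time_stamp = now;
	spin_unlock(&wb->list_lock);
}
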
>-
>+	spin_lock(&wb->list_lock);
> 	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
> 	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
>
>@@ -1375,15 +1368,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
> 	wb->dirtied_stamp = dirtied;
> 	wb->written_stamp = written;
> 	wb->bw_time_stamp = now;
>+	spin_unlock(&wb->list_lock);
> }
>
>-static void wb_update_bandwidth(struct bdi_writeback *wb)
>+void wb_update_bandwidth(struct bdi_writeback *wb)
> {
> 	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
>
>-	spin_lock(&wb->list_lock);
> 	__wb_update_bandwidth(&gdtc, NULL, false);
>-	spin_unlock(&wb->list_lock);
> }
>
> /* Interval after which we consider wb idle and don't estimate bandwidth */
>@@ -1728,11 +1720,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
> 			wb->dirty_exceeded = 1;
>
> 		if (time_is_before_jiffies(wb->bw_time_stamp +
>-					   BANDWIDTH_INTERVAL)) {
>-			spin_lock(&wb->list_lock);
>+					   BANDWIDTH_INTERVAL))
> 			__wb_update_bandwidth(gdtc, mdtc, true);
>-			spin_unlock(&wb->list_lock);
>-		}
>
> 		/* throttle according to the chosen dtc */
> 		dirty_ratelimit = wb->dirty_ratelimit;
>@@ -2371,7 +2360,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
> 		cond_resched();
> 		congestion_wait(BLK_RW_ASYNC, HZ/50);
> 	}
>-	wb_update_bandwidth(wb);
>+	/*
>+	 * Usually few pages are written by now from those we've just submitted
>+	 * but if there's constant writeback being submitted, this makes sure
>+	 * writeback bandwidth is updated once in a while.
>+	 */
>+	if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
>+		wb_update_bandwidth(wb);
> 	return ret;
> }
>
>@@ -2742,6 +2737,11 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb)
> static void wb_inode_writeback_end(struct bdi_writeback *wb)
> {
> 	atomic_dec(&wb->writeback_inodes);
>+	/*
>+	 * Make sure estimate of writeback throughput gets
>+	 * updated after writeback completed.
>+	 */
>+	queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
> }

This is a bogus estimate: it does not break the 200ms rule, but it
sidesteps the rule without specifying why 300ms is not good.
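
If the intent is simply that the completion-time update obey the existing
rate limit, that could be spelled out by scheduling the work for the end
of the current 200ms window instead of a flat BANDWIDTH_INTERVAL from now.
A rough sketch (untested, and the delay computation is only illustrative):

static void wb_inode_writeback_end(struct bdi_writeback *wb)
{
	long delay;

	atomic_dec(&wb->writeback_inodes);
	/*
	 * Schedule the estimate update for the moment the current
	 * 200ms window expires, rather than a fixed BANDWIDTH_INTERVAL
	 * from now, so the completion path follows the same rate limit
	 * as the other update sites. Signed arithmetic copes with
	 * jiffies wraparound the same way time_before() does.
	 */
	delay = wb->bw_time_stamp + BANDWIDTH_INTERVAL - jiffies;
	if (delay < 0)
		delay = 0;
	queue_delayed_work(bdi_wq, &wb->bw_dwork, delay);
}

That would also document in the code why the extra delay was chosen.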