On 2018/7/23 2:38 PM, Stefan Priebe - Profihost AG wrote:
> Hi Coly,
>
> thanks for this patch. I was right on the way to debug why i have lots
> of stalled I/O on the latest SLES12-SP3 kernel. I'll add this patch and
> try whether it solves my issues.
>
> Each cache set has two backing devices.

Hi Stefan,

Thanks for the testing.

Coly Li

>
> Am 22.07.2018 um 18:13 schrieb Coly Li:
>> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> allows the writeback rate to be faster if there is no I/O request on a
>> bcache device. It works well if there is only one bcache device attached
>> to the cache set. If there are many bcache devices attached to a cache
>> set, it may introduce performance regression because multiple faster
>> writeback threads of the idle bcache devices will compete for the btree
>> level locks with the bcache devices which have I/O requests coming.
>>
>> This patch fixes the above issue by only permitting fast writeback when
>> all bcache devices attached to the cache set are idle. If one of the
>> bcache devices has a new I/O request coming, the writeback throughput of
>> all attached devices is minimized immediately, and the PI controller
>> __update_writeback_rate() decides the upcoming writeback rate for each
>> bcache device.
>>
>> Also, when all bcache devices are idle, limiting the writeback rate to a
>> small number wastes throughput, especially when the backing devices are
>> slower non-rotational devices (e.g. SATA SSDs). This patch sets a max
>> writeback rate for each backing device if the whole cache set is idle. A
>> faster writeback rate in idle time means new I/Os may have more available
>> space for dirty data, and people may then observe better write
>> performance.
>>
>> Please note that bcache may change its cache mode at run time, and this
>> patch still works if the cache mode is switched away from writeback mode
>> while there is still dirty data on the cache.
>>
>> Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> Cc: stable@xxxxxxxxxxxxxxx #4.16+
>> Signed-off-by: Coly Li <colyli@xxxxxxx>
>> Cc: Michael Lyle <mlyle@xxxxxxxx>
>> ---
>>  drivers/md/bcache/bcache.h    |  9 +---
>>  drivers/md/bcache/request.c   | 42 ++++++++++++++-
>>  drivers/md/bcache/sysfs.c     | 14 +++--
>>  drivers/md/bcache/util.c      |  2 +-
>>  drivers/md/bcache/util.h      |  2 +-
>>  drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++----------
>>  6 files changed, 126 insertions(+), 41 deletions(-)
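Before the diff itself, the core mechanism is easiest to see in isolation.
The following is a minimal userspace sketch of the idle heuristic, not the
kernel code; the sketch_* names are made up here, and the 10-rounds-per-dirty-
device threshold mirrors set_at_max_writeback_rate() in the patch below:

#include <stdatomic.h>
#include <stdbool.h>

struct sketch_cache_set {
	atomic_int idle_counter;          /* zeroed by every front-end I/O */
	atomic_int at_max_writeback_rate; /* 1 while in full-speed writeback */
};

/* Called from each periodic writeback-rate update; returns true once the
 * whole cache set has been quiet long enough to go full speed.
 */
bool sketch_cache_set_idle(struct sketch_cache_set *c, int dirty_dc_nr)
{
	/* equivalent to: atomic_inc_return(&c->idle_counter) >= dirty_dc_nr * 10 */
	return atomic_fetch_add(&c->idle_counter, 1) + 1 >= dirty_dc_nr * 10;
}

/* Called from the I/O submission hot path: any new request cancels
 * full-speed mode for every attached device.
 */
void sketch_io_arrived(struct sketch_cache_set *c)
{
	atomic_store(&c->idle_counter, 0);
	if (atomic_load(&c->at_max_writeback_rate))
		atomic_store(&c->at_max_writeback_rate, 0);
}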
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index d6bf294f3907..f7451e8be03c 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -328,13 +328,6 @@ struct cached_dev {
>>  	 */
>>  	atomic_t		has_dirty;
>>
>> -	/*
>> -	 * Set to zero by things that touch the backing volume -- except
>> -	 * writeback. Incremented by writeback. Used to determine when to
>> -	 * accelerate idle writeback.
>> -	 */
>> -	atomic_t		backing_idle;
>> -
>>  	struct bch_ratelimit	writeback_rate;
>>  	struct delayed_work	writeback_rate_update;
>>
>> @@ -514,6 +507,8 @@ struct cache_set {
>>  	struct cache_accounting accounting;
>>
>>  	unsigned long		flags;
>> +	atomic_t		idle_counter;
>> +	atomic_t		at_max_writeback_rate;
>>
>>  	struct cache_sb		sb;
>>
>> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
>> index ae67f5fa8047..fe45f561a054 100644
>> --- a/drivers/md/bcache/request.c
>> +++ b/drivers/md/bcache/request.c
>> @@ -1104,6 +1104,34 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
>>
>>  /* Cached devices - read & write stuff */
>>
>> +static void quit_max_writeback_rate(struct cache_set *c)
>> +{
>> +	int i;
>> +	struct bcache_device *d;
>> +	struct cached_dev *dc;
>> +
>> +	mutex_lock(&bch_register_lock);
>> +
>> +	for (i = 0; i < c->devices_max_used; i++) {
>> +		if (!c->devices[i])
>> +			continue;
>> +
>> +		if (UUID_FLASH_ONLY(&c->uuids[i]))
>> +			continue;
>> +
>> +		d = c->devices[i];
>> +		dc = container_of(d, struct cached_dev, disk);
>> +		/*
>> +		 * Set the writeback rate to the default minimum value,
>> +		 * then let update_writeback_rate() decide the upcoming
>> +		 * rate.
>> +		 */
>> +		atomic64_set(&dc->writeback_rate.rate, 1);
>> +	}
>> +
>> +	mutex_unlock(&bch_register_lock);
>> +}
>> +
>>  static blk_qc_t cached_dev_make_request(struct request_queue *q,
>>  					struct bio *bio)
>>  {
>> @@ -1119,7 +1147,19 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
>>  		return BLK_QC_T_NONE;
>>  	}
>>
>> -	atomic_set(&dc->backing_idle, 0);
>> +	if (d->c) {
>> +		atomic_set(&d->c->idle_counter, 0);
>> +		/*
>> +		 * If at_max_writeback_rate of the cache set is true and new
>> +		 * I/O comes, quit the max writeback rate of all cached
>> +		 * devices attached to this cache set, and set
>> +		 * at_max_writeback_rate to false.
>> +		 */
>> +		if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
>> +			atomic_set(&d->c->at_max_writeback_rate, 0);
>> +			quit_max_writeback_rate(d->c);
>> +		}
>> +	}
>>  	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
>>
>>  	bio_set_dev(bio, dc->bdev);
>> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
>> index 225b15aa0340..d719021bff81 100644
>> --- a/drivers/md/bcache/sysfs.c
>> +++ b/drivers/md/bcache/sysfs.c
>> @@ -170,7 +170,8 @@ SHOW(__bch_cached_dev)
>>  	var_printf(writeback_running,	"%i");
>>  	var_print(writeback_delay);
>>  	var_print(writeback_percent);
>> -	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
>> +	sysfs_hprint(writeback_rate,
>> +		     atomic64_read(&dc->writeback_rate.rate) << 9);
>>  	sysfs_hprint(io_errors,		atomic_read(&dc->io_errors));
>>  	sysfs_printf(io_error_limit,	"%i", dc->error_limit);
>>  	sysfs_printf(io_disable,	"%i", dc->io_disable);
>> @@ -188,7 +189,8 @@ SHOW(__bch_cached_dev)
>>  		char change[20];
>>  		s64 next_io;
>>
>> -		bch_hprint(rate,	dc->writeback_rate.rate << 9);
>> +		bch_hprint(rate,
>> +			   atomic64_read(&dc->writeback_rate.rate) << 9);
>>  		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
>>  		bch_hprint(target,	dc->writeback_rate_target << 9);
>>  		bch_hprint(proportional, dc->writeback_rate_proportional << 9);
>> @@ -255,8 +257,12 @@ STORE(__cached_dev)
>>
>>  	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
>>
>> -	sysfs_strtoul_clamp(writeback_rate,
>> -			    dc->writeback_rate.rate, 1, INT_MAX);
>> +	if (attr == &sysfs_writeback_rate) {
>> +		int v;
>> +
>> +		sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
>> +		atomic64_set(&dc->writeback_rate.rate, v);
>> +	}
>>
>>  	sysfs_strtoul_clamp(writeback_rate_update_seconds,
>>  			    dc->writeback_rate_update_seconds,
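A note on the units in the sysfs hunk above: dc->writeback_rate.rate is kept
in 512-byte sectors per second, so the "<< 9" converts it to bytes per second
for display. A tiny standalone illustration (example values only, this is not
bcache code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* default rate from bch_cached_dev_writeback_init(): 1024 sectors/s */
	uint64_t rate_sectors = 1024;
	uint64_t rate_bytes   = rate_sectors << 9; /* x512 = 524288 bytes/s */

	printf("%llu sectors/s = %llu bytes/s (512 KiB/s)\n",
	       (unsigned long long)rate_sectors,
	       (unsigned long long)rate_bytes);
	return 0;
}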
>> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
>> index fc479b026d6d..84f90c3d996d 100644
>> --- a/drivers/md/bcache/util.c
>> +++ b/drivers/md/bcache/util.c
>> @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
>>  {
>>  	uint64_t now = local_clock();
>>
>> -	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
>> +	d->next += div_u64(done * NSEC_PER_SEC, atomic64_read(&d->rate));
>>
>>  	/* Bound the time.  Don't let us fall further than 2 seconds behind
>>  	 * (this prevents unnecessary backlog that would make it impossible
>> diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
>> index cced87f8eb27..7e17f32ab563 100644
>> --- a/drivers/md/bcache/util.h
>> +++ b/drivers/md/bcache/util.h
>> @@ -442,7 +442,7 @@ struct bch_ratelimit {
>>  	 * Rate at which we want to do work, in units per second
>>  	 * The units here correspond to the units passed to bch_next_delay()
>>  	 */
>> -	uint32_t		rate;
>> +	atomic64_t		rate;
>>  };
>>
>>  static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
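For reference, the arithmetic in bch_next_delay() above works like a token
bucket: each completed unit of work pushes the next allowed deadline forward
by done * NSEC_PER_SEC / rate nanoseconds. A simplified worked example (the
real function also clamps how far d->next may drift from the current clock):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t rate = 1024; /* sectors per second, the default rate */
	uint64_t done = 8;    /* sectors just written back (one 4 KiB block) */

	/* 8 * 1e9 / 1024 = 7812500 ns: ~7.8 ms per 4 KiB at 512 KiB/s */
	uint64_t delta_ns = done * NSEC_PER_SEC / rate;

	printf("advance next deadline by %llu ns\n",
	       (unsigned long long)delta_ns);
	return 0;
}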
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index ad45ebe1a74b..72059f910230 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -49,6 +49,63 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
>>  	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
>>  }
>>
>> +static bool set_at_max_writeback_rate(struct cache_set *c,
>> +				      struct cached_dev *dc)
>> +{
>> +	int i, dirty_dc_nr = 0;
>> +	struct bcache_device *d;
>> +	struct cached_dev *tmp_dc;
>> +
>> +	mutex_lock(&bch_register_lock);
>> +	for (i = 0; i < c->devices_max_used; i++) {
>> +		if (!c->devices[i])
>> +			continue;
>> +		if (UUID_FLASH_ONLY(&c->uuids[i]))
>> +			continue;
>> +		d = c->devices[i];
>> +		tmp_dc = container_of(d, struct cached_dev, disk);
>> +		if (atomic_read(&tmp_dc->has_dirty))
>> +			dirty_dc_nr++;
>> +	}
>> +	mutex_unlock(&bch_register_lock);
>> +
>> +	/*
>> +	 * idle_counter is increased every time update_writeback_rate()
>> +	 * is rescheduled in. If all backing devices attached to the same
>> +	 * cache set have the same dc->writeback_rate_update_seconds value,
>> +	 * then after about 10 rounds of update_writeback_rate() on each
>> +	 * backing device, the code falls through to set 1 to
>> +	 * c->at_max_writeback_rate, and a max writeback rate to each
>> +	 * dc->writeback_rate.rate. This is not very accurate but works well
>> +	 * to make sure the whole cache set has no new I/O coming before
>> +	 * the writeback rate is set to a max number.
>> +	 */
>> +	if (atomic_inc_return(&c->idle_counter) < dirty_dc_nr * 10)
>> +		return false;
>> +
>> +	if (atomic_read(&c->at_max_writeback_rate) != 1)
>> +		atomic_set(&c->at_max_writeback_rate, 1);
>> +
>> +	atomic64_set(&dc->writeback_rate.rate, INT_MAX);
>> +
>> +	/* keep writeback_rate_target as existing value */
>> +	dc->writeback_rate_proportional = 0;
>> +	dc->writeback_rate_integral_scaled = 0;
>> +	dc->writeback_rate_change = 0;
>> +
>> +	/*
>> +	 * Check c->idle_counter and c->at_max_writeback_rate again in case
>> +	 * new I/O arrives before set_at_max_writeback_rate() returns.
>> +	 * Then the writeback rate is set to 1, and its new value should be
>> +	 * decided via __update_writeback_rate().
>> +	 */
>> +	if (atomic_read(&c->idle_counter) < dirty_dc_nr * 10 ||
>> +	    !atomic_read(&c->at_max_writeback_rate))
>> +		return false;
>> +
>> +	return true;
>> +}
>> +
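The function above relies on a publish-then-recheck pattern instead of taking
a lock on the hot path. Stripped of the bcache specifics, the idea looks like
this (a simplified model with made-up names; a racing request that zeroes
idle_counter or clears at_max_rate always wins):

#include <stdatomic.h>
#include <stdbool.h>

bool try_enter_max_rate(atomic_int *idle_counter,
			atomic_int *at_max_rate,
			int idle_threshold)
{
	if (atomic_load(idle_counter) < idle_threshold)
		return false;

	/* Publish full-speed mode first ... */
	atomic_store(at_max_rate, 1);

	/* ... then re-check both flags: a front-end I/O may have reset
	 * idle_counter (and cleared at_max_rate) in between, so report
	 * failure and let the caller fall back to the PI controller.
	 */
	if (atomic_load(idle_counter) < idle_threshold ||
	    !atomic_load(at_max_rate))
		return false;

	return true;
}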
>>  static void __update_writeback_rate(struct cached_dev *dc)
>>  {
>>  	/*
>> @@ -104,8 +161,9 @@ static void __update_writeback_rate(struct cached_dev *dc)
>>
>>  	dc->writeback_rate_proportional = proportional_scaled;
>>  	dc->writeback_rate_integral_scaled = integral_scaled;
>> -	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
>> -	dc->writeback_rate.rate = new_rate;
>> +	dc->writeback_rate_change = new_rate -
>> +			atomic64_read(&dc->writeback_rate.rate);
>> +	atomic64_set(&dc->writeback_rate.rate, new_rate);
>>  	dc->writeback_rate_target = target;
>>  }
>>
>> @@ -138,9 +196,16 @@ static void update_writeback_rate(struct work_struct *work)
>>
>>  	down_read(&dc->writeback_lock);
>>
>> -	if (atomic_read(&dc->has_dirty) &&
>> -	    dc->writeback_percent)
>> -		__update_writeback_rate(dc);
>> +	if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
>> +		/*
>> +		 * If the whole cache set is idle, set_at_max_writeback_rate()
>> +		 * will set the writeback rate to a max number. Then it is
>> +		 * unnecessary to update the writeback rate for an idle cache
>> +		 * set which is already at the maximum writeback rate.
>> +		 */
>> +		if (!set_at_max_writeback_rate(c, dc))
>> +			__update_writeback_rate(dc);
>> +	}
>>
>>  	up_read(&dc->writeback_lock);
>>
>> @@ -422,27 +487,6 @@ static void read_dirty(struct cached_dev *dc)
>>
>>  		delay = writeback_delay(dc, size);
>>
>> -		/* If the control system would wait for at least half a
>> -		 * second, and there's been no reqs hitting the backing disk
>> -		 * for awhile: use an alternate mode where we have at most
>> -		 * one contiguous set of writebacks in flight at a time. If
>> -		 * someone wants to do IO it will be quick, as it will only
>> -		 * have to contend with one operation in flight, and we'll
>> -		 * be round-tripping data to the backing disk as quickly as
>> -		 * it can accept it.
>> -		 */
>> -		if (delay >= HZ / 2) {
>> -			/* 3 means at least 1.5 seconds, up to 7.5 if we
>> -			 * have slowed way down.
>> -			 */
>> -			if (atomic_inc_return(&dc->backing_idle) >= 3) {
>> -				/* Wait for current I/Os to finish */
>> -				closure_sync(&cl);
>> -				/* And immediately launch a new set. */
>> -				delay = 0;
>> -			}
>> -		}
>> -
>>  		while (!kthread_should_stop() &&
>>  		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
>>  		       delay) {
>> @@ -715,7 +759,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>>  	dc->writeback_running	= true;
>>  	dc->writeback_percent	= 10;
>>  	dc->writeback_delay	= 30;
>> -	dc->writeback_rate.rate = 1024;
>> +	atomic64_set(&dc->writeback_rate.rate, 1024);
>>  	dc->writeback_rate_minimum	= 8;
>>
>>  	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
>> --
--
To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html