Am 26.07.2018 um 12:42 schrieb Coly Li: > Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle") > allows the writeback rate to be faster if there is no I/O request on a > bcache device. It works well if there is only one bcache device attached > to the cache set. If there are many bcache devices attached to a cache > set, it may introduce performance regression because multiple faster > writeback threads of the idle bcache devices will compete the btree level > locks with the bcache device who have I/O requests coming. > > This patch fixes the above issue by only permitting fast writebac when > all bcache devices attached on the cache set are idle. And if one of the > bcache devices has new I/O request coming, minimized all writeback > throughput immediately and let PI controller __update_writeback_rate() > to decide the upcoming writeback rate for each bcache device. > > Also when all bcache devices are idle, limited wrieback rate to a small > number is wast of thoughput, especially when backing devices are slower > non-rotation devices (e.g. SATA SSD). This patch sets a max writeback > rate for each backing device if the whole cache set is idle. A faster > writeback rate in idle time means new I/Os may have more available space > for dirty data, and people may observe a better write performance then. > > Please note bcache may change its cache mode in run time, and this patch > still works if the cache mode is switched from writeback mode and there > is still dirty data on cache. Tested-by: Stefan Priebe <s.priebe@xxxxxxxxxxxx> Working fine now. > Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle") > Cc: stable@xxxxxxxxxxxxxxx #4.16+ > Signed-off-by: Coly Li <colyli@xxxxxxx> > Tested-by: Kai Krakow <kai@xxxxxxxxxxx> > Cc: Michael Lyle <mlyle@xxxxxxxx> > Cc: Stefan Priebe <s.priebe@xxxxxxxxxxxx> > --- > Channgelog: > v3, Do not acquire bch_register_lock in set_at_max_writeback_rate(). > v2, Fix a deadlock reported by Stefan Priebe. > v1, Initial version. > > drivers/md/bcache/bcache.h | 10 ++-- > drivers/md/bcache/request.c | 54 ++++++++++++++++++++- > drivers/md/bcache/super.c | 4 ++ > drivers/md/bcache/sysfs.c | 14 ++++-- > drivers/md/bcache/util.c | 2 +- > drivers/md/bcache/util.h | 2 +- > drivers/md/bcache/writeback.c | 91 +++++++++++++++++++++++------------ > 7 files changed, 133 insertions(+), 44 deletions(-) > > diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h > index 872ef4d67711..13f908be42ba 100644 > --- a/drivers/md/bcache/bcache.h > +++ b/drivers/md/bcache/bcache.h > @@ -328,13 +328,6 @@ struct cached_dev { > */ > atomic_t has_dirty; > > - /* > - * Set to zero by things that touch the backing volume-- except > - * writeback. Incremented by writeback. Used to determine when to > - * accelerate idle writeback. > - */ > - atomic_t backing_idle; > - > struct bch_ratelimit writeback_rate; > struct delayed_work writeback_rate_update; > > @@ -515,6 +508,8 @@ struct cache_set { > struct cache_accounting accounting; > > unsigned long flags; > + atomic_t idle_counter; > + atomic_t at_max_writeback_rate; > > struct cache_sb sb; > > @@ -524,6 +519,7 @@ struct cache_set { > > struct bcache_device **devices; > unsigned devices_max_used; > + atomic_t attached_dev_nr; > struct list_head cached_devs; > uint64_t cached_dev_sectors; > atomic_long_t flash_dev_dirty_sectors; > diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c > index 8eece9ef9f46..26f97acde403 100644 > --- a/drivers/md/bcache/request.c > +++ b/drivers/md/bcache/request.c > @@ -1105,6 +1105,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) > generic_make_request(bio); > } > > +static void quit_max_writeback_rate(struct cache_set *c, > + struct cached_dev *this_dc) > +{ > + int i; > + struct bcache_device *d; > + struct cached_dev *dc; > + > + /* > + * mutex bch_register_lock may compete with other parallel requesters, > + * or attach/detach operations on other backing device. Waiting to > + * the mutex lock may increase I/O request latency for seconds or more. > + * To avoid such situation, if mutext_trylock() failed, only writeback > + * rate of current cached device is set to 1, and __update_write_back() > + * will decide writeback rate of other cached devices (remember now > + * c->idle_counter is 0 already). > + */ > + if (mutex_trylock(&bch_register_lock)) { > + for (i = 0; i < c->devices_max_used; i++) { > + if (!c->devices[i]) > + continue; > + > + if (UUID_FLASH_ONLY(&c->uuids[i])) > + continue; > + > + d = c->devices[i]; > + dc = container_of(d, struct cached_dev, disk); > + /* > + * set writeback rate to default minimum value, > + * then let update_writeback_rate() to decide the > + * upcoming rate. > + */ > + atomic_long_set(&dc->writeback_rate.rate, 1); > + } > + mutex_unlock(&bch_register_lock); > + } else > + atomic_long_set(&this_dc->writeback_rate.rate, 1); > +} > + > /* Cached devices - read & write stuff */ > > static blk_qc_t cached_dev_make_request(struct request_queue *q, > @@ -1122,7 +1160,21 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, > return BLK_QC_T_NONE; > } > > - atomic_set(&dc->backing_idle, 0); > + if (likely(d->c)) { > + if (atomic_read(&d->c->idle_counter)) > + atomic_set(&d->c->idle_counter, 0); > + /* > + * If at_max_writeback_rate of cache set is true and new I/O > + * comes, quit max writeback rate of all cached devices > + * attached to this cache set, and set at_max_writeback_rate > + * to false. > + */ > + if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) { > + atomic_set(&d->c->at_max_writeback_rate, 0); > + quit_max_writeback_rate(d->c, dc); > + } > + } > + > generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); > > bio_set_dev(bio, dc->bdev); > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c > index e0a92104ca23..8db6696e2bff 100644 > --- a/drivers/md/bcache/super.c > +++ b/drivers/md/bcache/super.c > @@ -696,6 +696,8 @@ static void bcache_device_detach(struct bcache_device *d) > { > lockdep_assert_held(&bch_register_lock); > > + atomic_dec(&d->c->attached_dev_nr); > + > if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { > struct uuid_entry *u = d->c->uuids + d->id; > > @@ -1144,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, > > bch_cached_dev_run(dc); > bcache_device_link(&dc->disk, c, "bdev"); > + atomic_inc(&c->attached_dev_nr); > > /* Allow the writeback thread to proceed */ > up_write(&dc->writeback_lock); > @@ -1695,6 +1698,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) > c->block_bits = ilog2(sb->block_size); > c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); > c->devices_max_used = 0; > + atomic_set(&c->attached_dev_nr, 0); > c->btree_pages = bucket_pages(c); > if (c->btree_pages > BTREE_MAX_PAGES) > c->btree_pages = max_t(int, c->btree_pages / 4, > diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c > index 225b15aa0340..a56067e80b10 100644 > --- a/drivers/md/bcache/sysfs.c > +++ b/drivers/md/bcache/sysfs.c > @@ -170,7 +170,8 @@ SHOW(__bch_cached_dev) > var_printf(writeback_running, "%i"); > var_print(writeback_delay); > var_print(writeback_percent); > - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); > + sysfs_hprint(writeback_rate, > + atomic_long_read(&dc->writeback_rate.rate) << 9); > sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); > sysfs_printf(io_error_limit, "%i", dc->error_limit); > sysfs_printf(io_disable, "%i", dc->io_disable); > @@ -188,7 +189,8 @@ SHOW(__bch_cached_dev) > char change[20]; > s64 next_io; > > - bch_hprint(rate, dc->writeback_rate.rate << 9); > + bch_hprint(rate, > + atomic_long_read(&dc->writeback_rate.rate) << 9); > bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); > bch_hprint(target, dc->writeback_rate_target << 9); > bch_hprint(proportional,dc->writeback_rate_proportional << 9); > @@ -255,8 +257,12 @@ STORE(__cached_dev) > > sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); > > - sysfs_strtoul_clamp(writeback_rate, > - dc->writeback_rate.rate, 1, INT_MAX); > + if (attr == &sysfs_writeback_rate) { > + int v; > + > + sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX); > + atomic_long_set(&dc->writeback_rate.rate, v); > + } > > sysfs_strtoul_clamp(writeback_rate_update_seconds, > dc->writeback_rate_update_seconds, > diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c > index f912c372978c..c6a99dfa1ad9 100644 > --- a/drivers/md/bcache/util.c > +++ b/drivers/md/bcache/util.c > @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) > { > uint64_t now = local_clock(); > > - d->next += div_u64(done * NSEC_PER_SEC, d->rate); > + d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate)); > > /* Bound the time. Don't let us fall further than 2 seconds behind > * (this prevents unnecessary backlog that would make it impossible > diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h > index a1579e28049f..5ff055f0a653 100644 > --- a/drivers/md/bcache/util.h > +++ b/drivers/md/bcache/util.h > @@ -443,7 +443,7 @@ struct bch_ratelimit { > * Rate at which we want to do work, in units per second > * The units here correspond to the units passed to bch_next_delay() > */ > - uint32_t rate; > + atomic_long_t rate; > }; > > static inline void bch_ratelimit_reset(struct bch_ratelimit *d) > diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c > index 912e969fedba..907fa6c0d192 100644 > --- a/drivers/md/bcache/writeback.c > +++ b/drivers/md/bcache/writeback.c > @@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc) > > dc->writeback_rate_proportional = proportional_scaled; > dc->writeback_rate_integral_scaled = integral_scaled; > - dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; > - dc->writeback_rate.rate = new_rate; > + dc->writeback_rate_change = new_rate - > + atomic_long_read(&dc->writeback_rate.rate); > + atomic_long_set(&dc->writeback_rate.rate, new_rate); > dc->writeback_rate_target = target; > } > > +static bool set_at_max_writeback_rate(struct cache_set *c, > + struct cached_dev *dc) > +{ > + /* > + * Idle_counter is increased everytime when update_writeback_rate() is > + * called. If all backing devices attached to the same cache set have > + * identical dc->writeback_rate_update_seconds values, it is about 6 > + * rounds of update_writeback_rate() on each backing device before > + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set > + * to each dc->writeback_rate.rate. > + * In order to avoid extra locking cost for counting exact dirty cached > + * devices number, c->attached_dev_nr is used to calculate the idle > + * throushold. It might be bigger if not all cached device are in write- > + * back mode, but it still works well with limited extra rounds of > + * update_writeback_rate(). > + */ > + if (atomic_inc_return(&c->idle_counter) < > + atomic_read(&c->attached_dev_nr) * 6) > + return false; > + > + if (atomic_read(&c->at_max_writeback_rate) != 1) > + atomic_set(&c->at_max_writeback_rate, 1); > + > + atomic_long_set(&dc->writeback_rate.rate, INT_MAX); > + > + /* keep writeback_rate_target as existing value */ > + dc->writeback_rate_proportional = 0; > + dc->writeback_rate_integral_scaled = 0; > + dc->writeback_rate_change = 0; > + > + /* > + * Check c->idle_counter and c->at_max_writeback_rate agagain in case > + * new I/O arrives during before set_at_max_writeback_rate() returns. > + * Then the writeback rate is set to 1, and its new value should be > + * decided via __update_writeback_rate(). > + */ > + if ((atomic_read(&c->idle_counter) < > + atomic_read(&c->attached_dev_nr) * 6) || > + !atomic_read(&c->at_max_writeback_rate)) > + return false; > + > + return true; > +} > + > static void update_writeback_rate(struct work_struct *work) > { > struct cached_dev *dc = container_of(to_delayed_work(work), > @@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work) > return; > } > > - down_read(&dc->writeback_lock); > - > - if (atomic_read(&dc->has_dirty) && > - dc->writeback_percent) > - __update_writeback_rate(dc); > + if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { > + /* > + * If the whole cache set is idle, set_at_max_writeback_rate() > + * will set writeback rate to a max number. Then it is > + * unncessary to update writeback rate for an idle cache set > + * in maximum writeback rate number(s). > + */ > + if (!set_at_max_writeback_rate(c, dc)) { > + down_read(&dc->writeback_lock); > + __update_writeback_rate(dc); > + up_read(&dc->writeback_lock); > + } > + } > > - up_read(&dc->writeback_lock); > > /* > * CACHE_SET_IO_DISABLE might be set via sysfs interface, > @@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc) > > delay = writeback_delay(dc, size); > > - /* If the control system would wait for at least half a > - * second, and there's been no reqs hitting the backing disk > - * for awhile: use an alternate mode where we have at most > - * one contiguous set of writebacks in flight at a time. If > - * someone wants to do IO it will be quick, as it will only > - * have to contend with one operation in flight, and we'll > - * be round-tripping data to the backing disk as quickly as > - * it can accept it. > - */ > - if (delay >= HZ / 2) { > - /* 3 means at least 1.5 seconds, up to 7.5 if we > - * have slowed way down. > - */ > - if (atomic_inc_return(&dc->backing_idle) >= 3) { > - /* Wait for current I/Os to finish */ > - closure_sync(&cl); > - /* And immediately launch a new set. */ > - delay = 0; > - } > - } > - > while (!kthread_should_stop() && > !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && > delay) { > @@ -741,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) > dc->writeback_running = true; > dc->writeback_percent = 10; > dc->writeback_delay = 30; > - dc->writeback_rate.rate = 1024; > + atomic_long_set(&dc->writeback_rate.rate, 1024); > dc->writeback_rate_minimum = 8; > > dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; > -- To unsubscribe from this list: send the line "unsubscribe linux-bcache" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html