The functionality of the sysfs entry writeback_percent is explained in the file Documentation/ABI/testing/sysfs-block-bcache: "For backing devices: If nonzero, writeback from cache to backing device only takes place when more than this percentage of the cache is used, allowing more write coalescing to take place and reducing total number of writes sent to the backing device. Integer between 0 and 40." However, currently in writeback mode, even when the percentage of dirty data does not exceed the cached device's writeback_percent, writeback still happens at a rate of at least one key per second. This is not the designed behavior. This patch does three things to fix the above issue: 1) Move the writeback thread related checks from bch_writeback_thread() to no_writeback_now(). 2) Add a duration that measures the time from the last received I/O request to the moment when no_writeback_now() is called. When the duration is more than BCH_IDLE_DURATION_MSECS, perform a proactive writeback regardless of whether dirty data exceeds writeback_percent or not. 3) If the duration has not timed out, and dirty data is not more than the cached device's writeback_percent, the writeback thread just schedules out and waits for another try after BCH_IDLE_DURATION_MSECS. With this patch, if a write request's dirty data does not exceed writeback_percent, writeback will not happen. In this case dirty data may stay on the cache device for a very long time if I/O on the bcache device is idle. To fix this, bch_writeback_queue() is called in update_writeback_rate() to wake up the writeback thread once every writeback_rate_update_seconds. Then, if no I/O happens on the bcache device for BCH_IDLE_DURATION_MSECS milliseconds, we at least have a chance to wake up the writeback thread and detect the duration timeout without much delay. Now writeback_percent acts as it is defined to.
Signed-off-by: Coly Li <colyli@xxxxxxx> --- drivers/md/bcache/bcache.h | 4 ++++ drivers/md/bcache/request.c | 6 +++++ drivers/md/bcache/writeback.c | 55 +++++++++++++++++++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dee542fff68e..ab7e60336edb 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -662,6 +662,8 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; + + uint64_t last_request_time; }; struct bbio { @@ -680,6 +682,8 @@ struct bbio { #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768U +#define BCH_IDLE_DURATION_MSECS 10000U + #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 019b3df9f1c6..98971486f7f1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -957,10 +957,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct search *s; struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); + struct cache_set *c = d->c; int rw = bio_data_dir(bio); generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + if (c) + c->last_request_time = local_clock(); bio->bi_bdev = dc->bdev; bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -991,6 +994,9 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, else generic_make_request(bio); } + /* update last_request_time again to make it to be more accurate */ + if (c) + c->last_request_time = local_clock(); return BLK_QC_T_NONE; } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 42c66e76f05e..13594dd7f564 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -78,8 +78,15 @@ static void 
update_writeback_rate(struct work_struct *work) down_read(&dc->writeback_lock); if (atomic_read(&dc->has_dirty) && - dc->writeback_percent) + dc->writeback_percent) { __update_writeback_rate(dc); + /* + * wake up writeback thread to check whether request + * duration is timeout in no_writeback_now(). If yes, + * existing dirty data should be handled. + */ + bch_writeback_queue(dc); + } up_read(&dc->writeback_lock); @@ -412,6 +419,48 @@ static bool refill_dirty(struct cached_dev *dc) return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } +static bool no_writeback_now(struct cached_dev *dc) +{ + uint64_t duration; + struct cache_set *c = dc->disk.c; + uint64_t cache_set_sectors; + uint64_t dirty_sectors; + uint64_t percent_sectors; + + if (!atomic_read(&dc->has_dirty)) + return true; + + if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + !dc->writeback_running) + return true; + + /* + * last_request_time is initialized to 0, it is possible that + * duration is larger than BCH_IDLE_DURATION_MSECS when the first + * time it is calculated. If there is dirty data of the cached + * device, bcache will try to perform a proactive writeback. + */ + duration = local_clock() - c->last_request_time; + if ((duration/NSEC_PER_MSEC) > BCH_IDLE_DURATION_MSECS) + return false; + + /* + * If duration is not timeout, and dirty data does not exceed + * writeback_percent, no writeback now. 
+ * This is what writeback_percent is designed for, see + * /sys/block/<disk>/bcache/writeback_percentkernel in + * Documentation/ABI/testing/sysfs-block-bcache + */ + cache_set_sectors = c->nbuckets * c->sb.bucket_size; + percent_sectors = div64_u64( + cache_set_sectors * dc->writeback_percent, 100); + dirty_sectors = bcache_dev_sectors_dirty(&dc->disk); + if (dirty_sectors <= percent_sectors) + return true; + + return false; +} + static int bch_writeback_thread(void *arg) { struct cached_dev *dc = arg; @@ -419,9 +468,7 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); - if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { + if (no_writeback_now(dc)) { up_write(&dc->writeback_lock); set_current_state(TASK_INTERRUPTIBLE); -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-bcache" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html