Hi Tang Junhui--- On 12/28/2017 06:00 PM, tang.junhui@xxxxxxxxxx wrote: > LGTM, I would like it much more if MAX_WRITEBACKS_IN_PASS(5) defines a > little bigger value. Yes-- we can always turn it up further depending on experience. 5 was chosen to provide a clear benefit without changing things too much. > Reviewed-by: Tang Junhui <tang.junhui@xxxxxxxxxx> Thank you very much. Applied to my staging tree. Mike > >> Previously, there was some logic that attempted to immediately issue >> writeback of backing-contiguous blocks when the writeback rate was >> fast. >> >> The previous logic did not have any limits on the aggregate size it >> would issue, nor the number of keys it would combine at once. It >> would also discard the chance to do a contiguous write when the >> writeback rate was low-- e.g. at "background" writeback of target >> rate = 8, it would not combine two adjacent 4k writes and would >> instead seek the disk twice. >> >> This patch imposes limits and explicitly understands the size of >> contiguous I/O during issue. It also will combine contiguous I/O >> in all circumstances, not just when writeback is requested to be >> relatively fast. >> >> It is a win on its own, but also lays the groundwork for skip writes to >> short keys to make the I/O more sequential/contiguous. It also gets >> ready to start using blk_*_plug, and to allow issuing of non-contig >> I/O in parallel if requested by the user (to make use of disk >> throughput benefits available from higher queue depths). >> >> This patch fixes a previous version where the contiguous information >> was not calculated properly, thanks to suggestions by Tang Junhui. >> >> Signed-off-by: Michael Lyle <mlyle@xxxxxxxx> >> --- >> drivers/md/bcache/bcache.h | 6 --- >> drivers/md/bcache/writeback.c | 120 +++++++++++++++++++++++++++++------------- >> drivers/md/bcache/writeback.h | 3 ++ >> 3 files changed, 85 insertions(+), 44 deletions(-) >> >> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h >> index 843877e017e1..1784e50eb857 100644 >> --- a/drivers/md/bcache/bcache.h >> +++ b/drivers/md/bcache/bcache.h >> @@ -323,12 +323,6 @@ struct cached_dev { >> struct bch_ratelimit writeback_rate; >> struct delayed_work writeback_rate_update; >> >> - /* >> - * Internal to the writeback code, so read_dirty() can keep track of >> - * where it's at. >> - */ >> - sector_t last_read; >> - >> /* Limit number of writeback bios in flight */ >> struct semaphore in_flight; >> struct task_struct *writeback_thread; >> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c >> index f3d680c907ae..6975003454d9 100644 >> --- a/drivers/md/bcache/writeback.c >> +++ b/drivers/md/bcache/writeback.c >> @@ -251,7 +251,9 @@ static void read_dirty_submit(struct closure *cl) >> static void read_dirty(struct cached_dev *dc) >> { >> unsigned delay = 0; >> - struct keybuf_key *w; >> + struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; >> + size_t size; >> + int nk, i; >> struct dirty_io *io; >> struct closure cl; >> >> @@ -262,45 +264,87 @@ static void read_dirty(struct cached_dev *dc) >> * mempools. >> */ >> >> - while (!kthread_should_stop()) { >> - >> - w = bch_keybuf_next(&dc->writeback_keys); >> - if (!w) >> - break; >> - >> - BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); >> - >> - if (KEY_START(&w->key) != dc->last_read || >> - jiffies_to_msecs(delay) > 50) >> - while (!kthread_should_stop() && delay) >> - delay = schedule_timeout_interruptible(delay); >> - >> - dc->last_read = KEY_OFFSET(&w->key); >> - >> - io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) >> - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), >> - GFP_KERNEL); >> - if (!io) >> - goto err; >> - >> - w->private = io; >> - io->dc = dc; >> - >> - dirty_init(w); >> - bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); >> - io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); >> - bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); >> - io->bio.bi_end_io = read_dirty_endio; >> - >> - if (bio_alloc_pages(&io->bio, GFP_KERNEL)) >> - goto err_free; >> - >> - trace_bcache_writeback(&w->key); >> + next = bch_keybuf_next(&dc->writeback_keys); >> + >> + while (!kthread_should_stop() && next) { >> + size = 0; >> + nk = 0; >> + >> + do { >> + BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); >> + >> + /* >> + * Don't combine too many operations, even if they >> + * are all small. >> + */ >> + if (nk >= MAX_WRITEBACKS_IN_PASS) >> + break; >> + >> + /* >> + * If the current operation is very large, don't >> + * further combine operations. >> + */ >> + if (size >= MAX_WRITESIZE_IN_PASS) >> + break; >> + >> + /* >> + * Operations are only eligible to be combined >> + * if they are contiguous. >> + * >> + * TODO: add a heuristic willing to fire a >> + * certain amount of non-contiguous IO per pass, >> + * so that we can benefit from backing device >> + * command queueing. >> + */ >> + if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, >> + &START_KEY(&next->key))) >> + break; >> + >> + size += KEY_SIZE(&next->key); >> + keys[nk++] = next; >> + } while ((next = bch_keybuf_next(&dc->writeback_keys))); >> + >> + /* Now we have gathered a set of 1..5 keys to write back. */ >> + for (i = 0; i < nk; i++) { >> + w = keys[i]; >> + >> + io = kzalloc(sizeof(struct dirty_io) + >> + sizeof(struct bio_vec) * >> + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), >> + GFP_KERNEL); >> + if (!io) >> + goto err; >> + >> + w->private = io; >> + io->dc = dc; >> + >> + dirty_init(w); >> + bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); >> + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); >> + bio_set_dev(&io->bio, >> + PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); >> + io->bio.bi_end_io = read_dirty_endio; >> + >> + if (bio_alloc_pages(&io->bio, GFP_KERNEL)) >> + goto err_free; >> + >> + trace_bcache_writeback(&w->key); >> + >> + down(&dc->in_flight); >> + >> + /* We've acquired a semaphore for the maximum >> + * simultaneous number of writebacks; from here >> + * everything happens asynchronously. >> + */ >> + closure_call(&io->cl, read_dirty_submit, NULL, &cl); >> + } >> >> - down(&dc->in_flight); >> - closure_call(&io->cl, read_dirty_submit, NULL, &cl); >> + delay = writeback_delay(dc, size); >> >> - delay = writeback_delay(dc, KEY_SIZE(&w->key)); >> + while (!kthread_should_stop() && delay) { >> + schedule_timeout_interruptible(delay); >> + delay = writeback_delay(dc, 0); >> + } >> } >> >> if (0) { >> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h >> index a9e3ffb4b03c..6d26927267f8 100644 >> --- a/drivers/md/bcache/writeback.h >> +++ b/drivers/md/bcache/writeback.h >> @@ -5,6 +5,9 @@ >> #define CUTOFF_WRITEBACK 40 >> #define CUTOFF_WRITEBACK_SYNC 70 >> >> +#define MAX_WRITEBACKS_IN_PASS 5 >> +#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ >> + >> static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) >> { >> uint64_t i, ret = 0; >> -- >> 2.14.1 > > > Thanks, > Tang > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html >