From: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx>

If the backing device is itself a log-structured block device (such as a
bcache flash-only volume), there is an opportunity to merge writes during
writeback, because all writes into a bcache flash_dev are stored in buckets
in a log-structured fashion. That means, if we have a cached_dev stacked as
below:

    ----------------------------
    |   bcache2 (cached_dev)   |
    | ------------------------ |
    | |    sdb (cache_dev)   | |
    | ------------------------ |
    | ------------------------ |
    | |  bcache1 (flash_dev) | |
    | ------------------------ |
    ----------------------------

we can merge the dirty writes during writeback, provided we submit them in
batches bracketed by blk_start_plug()/blk_finish_plug().

So this commit changes the dirty write path to insert each dirty_io into an
rb_tree and to queue a worker that submits all pending dirty_io in sequence
order. This gives the block layer an opportunity to merge these writes,
which can improve writeback bandwidth.
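In simplified form, the worker drains the tree inside a single plug. The
sketch below is illustrative only, not the literal patch code;
next_dirty_io_in_sequence() is a hypothetical stand-in for the locked
RB_FIRST()/rb_erase() steps that write_dirty() performs in this patch:

        struct blk_plug plug;
        struct dirty_io *io;

        blk_start_plug(&plug);
        /* submit every queued dirty_io in sequence order within one plug */
        while ((io = next_dirty_io_in_sequence(dc)) != NULL)
                closure_bio_submit(dc->disk.c, &io->bio, &io->cl);
        blk_finish_plug(&plug);

Keeping the whole batch inside one plug lets the block layer merge adjacent
bios before they are issued to the backing device.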
Signed-off-by: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx>
Signed-off-by: mingzhe <mingzhe.zou@xxxxxxxxxxxx>
---
 drivers/md/bcache/bcache.h    |   4 ++
 drivers/md/bcache/writeback.c | 102 ++++++++++++++++++++++------------
 2 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 74434a7730bb..a82974aefc90 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -356,6 +356,10 @@ struct cached_dev {
         struct closure_waitlist writeback_ordering_wait;
         atomic_t                writeback_sequence_next;
 
+        struct rb_root          writeback_ios;
+        spinlock_t              writeback_ios_lock;
+        struct work_struct      write_dirty_work;
+
         /* For tracking sequential IO */
 #define RECENT_IO_BITS  7
 #define RECENT_IO       (1 << RECENT_IO_BITS)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 0c5f25816e2e..315fb91a8066 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -323,6 +323,7 @@ struct dirty_io {
         struct closure          cl;
         struct cached_dev       *dc;
         uint16_t                sequence;
+        struct rb_node          node;
         struct bio              bio;
 };
 
@@ -401,53 +402,81 @@ static void dirty_endio(struct bio *bio)
         closure_put(&io->cl);
 }
 
-static void write_dirty(struct closure *cl)
+static inline int dirty_io_cmp(struct dirty_io *l, struct dirty_io *r)
+{
+        return (l->sequence < r->sequence) ? -1 : (l->sequence > r->sequence);
+}
+
+static void queue_dirty_write(struct closure *cl)
 {
         struct dirty_io *io = container_of(cl, struct dirty_io, cl);
-        struct keybuf_key *w = io->bio.bi_private;
         struct cached_dev *dc = io->dc;
-        uint16_t next_sequence;
+        spin_lock(&dc->writeback_ios_lock);
+        BUG_ON(RB_INSERT(&dc->writeback_ios, io, node, dirty_io_cmp));
+        spin_unlock(&dc->writeback_ios_lock);
 
-        if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
-                /* Not our turn to write; wait for a write to complete */
-                closure_wait(&dc->writeback_ordering_wait, cl);
+        queue_work(dc->writeback_write_wq, &dc->write_dirty_work);
+}
 
-                if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
-                        /*
-                         * Edge case-- it happened in indeterminate order
-                         * relative to when we were added to wait list..
-                         */
-                        closure_wake_up(&dc->writeback_ordering_wait);
-                }
+static void write_dirty(struct work_struct *work)
+{
+        struct cached_dev *dc = container_of(work, struct cached_dev,
+                                             write_dirty_work);
+        struct dirty_io *io;
+        struct keybuf_key *w;
+        uint16_t next_sequence;
+        struct blk_plug plug;
 
-                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+        spin_lock(&dc->writeback_ios_lock);
+        if (RB_EMPTY_ROOT(&dc->writeback_ios)) {
+                spin_unlock(&dc->writeback_ios_lock);
                 return;
         }
 
-        next_sequence = io->sequence + 1;
+        io = RB_FIRST(&dc->writeback_ios, struct dirty_io, node);
+        if (io->sequence != atomic_read(&dc->writeback_sequence_next)) {
+                spin_unlock(&dc->writeback_ios_lock);
+                return;
+        }
 
-        /*
-         * IO errors are signalled using the dirty bit on the key.
-         * If we failed to read, we should not attempt to write to the
-         * backing device. Instead, immediately go to write_dirty_finish
-         * to clean up.
-         */
-        if (KEY_DIRTY(&w->key)) {
-                dirty_init(w);
-                io->bio.bi_opf = REQ_OP_WRITE;
-                io->bio.bi_iter.bi_sector = KEY_START(&w->key);
-                bio_set_dev(&io->bio, io->dc->bdev);
-                io->bio.bi_end_io = dirty_endio;
-
-                /* I/O request sent to backing device */
-                closure_bio_submit(io->dc->disk.c, &io->bio, cl);
+        blk_start_plug(&plug);
+        next_sequence = io->sequence;
+
+        while (io) {
+                if (io->sequence != next_sequence)
+                        break;
+
+                rb_erase(&io->node, &dc->writeback_ios);
+                spin_unlock(&dc->writeback_ios_lock);
+                w = io->bio.bi_private;
+                /*
+                 * IO errors are signalled using the dirty bit on the key.
+                 * If we failed to read, we should not attempt to write to the
+                 * backing device. Instead, immediately go to write_dirty_finish
+                 * to clean up.
+                 */
+                if (KEY_DIRTY(&w->key)) {
+                        dirty_init(w);
+                        io->bio.bi_opf = REQ_OP_WRITE;
+                        io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+                        bio_set_dev(&io->bio, io->dc->bdev);
+                        io->bio.bi_end_io = dirty_endio;
+
+                        /* I/O request sent to backing device */
+                        closure_bio_submit(io->dc->disk.c, &io->bio, &io->cl);
+                }
+
+                continue_at(&io->cl, write_dirty_finish, io->dc->writeback_write_wq);
+
+                spin_lock(&dc->writeback_ios_lock);
+                io = RB_FIRST(&dc->writeback_ios, struct dirty_io, node);
+                next_sequence++;
         }
 
         atomic_set(&dc->writeback_sequence_next, next_sequence);
-        closure_wake_up(&dc->writeback_ordering_wait);
-
-        continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
+        spin_unlock(&dc->writeback_ios_lock);
+        blk_finish_plug(&plug);
 }
 
 static void read_dirty_endio(struct bio *bio)
@@ -469,7 +498,7 @@ static void read_dirty_submit(struct closure *cl)
 
         closure_bio_submit(io->dc->disk.c, &io->bio, cl);
 
-        continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+        continue_at(cl, queue_dirty_write, io->dc->writeback_write_wq);
 }
 
 static void start_wb_inflight(struct cached_dev *dc)
@@ -578,6 +607,7 @@ static void read_dirty(struct cached_dev *dc)
                 w->private      = io;
                 io->dc          = dc;
                 io->sequence    = sequence++;
+                RB_CLEAR_NODE(&io->node);
 
                 dirty_init(w);
                 io->bio.bi_opf = REQ_OP_READ;
@@ -1066,6 +1096,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
         init_rwsem(&dc->writeback_lock);
         bch_keybuf_init(&dc->writeback_keys);
 
+        spin_lock_init(&dc->writeback_ios_lock);
+        dc->writeback_ios = RB_ROOT;
+        INIT_WORK(&dc->write_dirty_work, write_dirty);
+
         dc->writeback_metadata          = true;
         dc->writeback_running           = false;
         dc->writeback_consider_fragment = true;
-- 
2.17.1