Re: [PATCH v4 02/10] block: Add copy offload support infrastructure

Hannes Reinecke <hare@xxxxxxx> · Wed, 27 Apr 2022 12:29:15 +0200

On 4/26/22 12:12, Nitesh Shetty wrote:
Introduce blkdev_issue_copy which supports source and destination bdevs,
and an array of (source, destination and copy length) tuples.
Introduce REQ_COPY copy offload operation flag. Create a read-write
bio pair with a token as payload and submitted to the device in order.
Read request populates token with source specific information which
is then passed with write request.
This design is courtesy Mikulas Patocka's token based copy

Larger copy will be divided, based on max_copy_sectors,
max_copy_range_sector limits.

Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
Signed-off-by: Arnav Dawn <arnav.dawn@xxxxxxxxxxx>
---
  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
  block/blk.h               |   2 +
  include/linux/blk_types.h |  21 ++++
  include/linux/blkdev.h    |   2 +
  include/uapi/linux/fs.h   |  14 +++
  5 files changed, 271 insertions(+)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 09b7e1200c0f..ba9da2d2f429 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
  }
  EXPORT_SYMBOL(blkdev_issue_discard);
  
+/*
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through bio_copy_end_io. IO
+ * errors are propagated through cio->io_error.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cio->lock, flags);
+	if (cio->refcount) {
+		cio->waiter = current;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irqrestore(&cio->lock, flags);
+		blk_io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		spin_lock_irqsave(&cio->lock, flags);
+		cio->waiter = NULL;
+		ret = cio->io_err;
+	}
+	spin_unlock_irqrestore(&cio->lock, flags);
+	kvfree(cio);
+
+	return ret;
+}
+
+static void bio_copy_end_io(struct bio *bio)
+{
+	struct copy_ctx *ctx = bio->bi_private;
+	struct cio *cio = ctx->cio;
+	sector_t clen;
+	int ri = ctx->range_idx;
+	unsigned long flags;
+	bool wake = false;
+
+	if (bio->bi_status) {
+		cio->io_err = bio->bi_status;
+		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
+		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
+	}
+	__free_page(bio->bi_io_vec[0].bv_page);
+	kfree(ctx);
+	bio_put(bio);
+
+	spin_lock_irqsave(&cio->lock, flags);
+	if (((--cio->refcount) <= 0) && cio->waiter)
+		wake = true;
+	spin_unlock_irqrestore(&cio->lock, flags);
+	if (wake)
+		wake_up_process(cio->waiter);
+}
+
+/*
+ * blk_copy_offload	- Use device's native copy offload feature
+ * Go through user provide payload, prepare new payload based on device's copy offload limits.
+ */
+int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
+{
+	struct request_queue *sq = bdev_get_queue(src_bdev);
+	struct request_queue *dq = bdev_get_queue(dst_bdev);
+	struct bio *read_bio, *write_bio;
+	struct copy_ctx *ctx;
+	struct cio *cio;
+	struct page *token;
+	sector_t src_blk, copy_len, dst_blk;
+	sector_t remaining, max_copy_len = LONG_MAX;
+	unsigned long flags;
+	int ri = 0, ret = 0;
+
+	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	cio->rlist = rlist;
+	spin_lock_init(&cio->lock);
+
+	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
+	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
+			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
+
+	for (ri = 0; ri < nr_srcs; ri++) {
+		cio->rlist[ri].comp_len = rlist[ri].len;
+		src_blk = rlist[ri].src;
+		dst_blk = rlist[ri].dst;
+		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
+			copy_len = min(remaining, max_copy_len);
+
+			token = alloc_page(gfp_mask);
+			if (unlikely(!token)) {
+				ret = -ENOMEM;
+				goto err_token;
+			}
+
+			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
+			if (!ctx) {
+				ret = -ENOMEM;
+				goto err_ctx;
+			}
+			ctx->cio = cio;
+			ctx->range_idx = ri;
+			ctx->start_sec = dst_blk;
+
+			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
+					gfp_mask);
+			if (!read_bio) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
+			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
+			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
+			read_bio->bi_iter.bi_size = copy_len;
+			ret = submit_bio_wait(read_bio);
+			bio_put(read_bio);
+			if (ret)
+				goto err_read_bio;
+
+			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
+					gfp_mask);
+			if (!write_bio) {
+				ret = -ENOMEM;
+				goto err_read_bio;
+			}
+			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
+			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
+			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
+			write_bio->bi_iter.bi_size = copy_len;
+			write_bio->bi_end_io = bio_copy_end_io;
+			write_bio->bi_private = ctx;
+
+			spin_lock_irqsave(&cio->lock, flags);
+			++cio->refcount;
+			spin_unlock_irqrestore(&cio->lock, flags);
+
+			submit_bio(write_bio);
+			src_blk += copy_len;
+			dst_blk += copy_len;
+		}
+	}
+

Hmm. I'm not sure if I like the copy loop.
What I definitely would do is to allocate the write bio before reading 
data; after all, if we can't allocate the write bio reading is pretty 
much pointless.

But the real issue I have with this is that it's doing synchronous 
reads, thereby limiting the performance.

Can't you submit the write bio from the end_io function of the read bio?
That would disentangle things, and we should be getting a better 
performance.

Cheers,

Hannes
--
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@xxxxxxx			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer