On Wed, Apr 27, 2022 at 12:29:15PM +0200, Hannes Reinecke wrote:
> On 4/26/22 12:12, Nitesh Shetty wrote:
> > Introduce blkdev_issue_copy, which supports source and destination bdevs
> > and an array of (source, destination, copy length) tuples.
> > Introduce the REQ_COPY copy offload operation flag. Create a read-write
> > bio pair with a token as payload and submit them to the device in order.
> > The read request populates the token with source-specific information,
> > which is then passed along with the write request.
> > This design is courtesy of Mikulas Patocka's token-based copy.
> >
> > Larger copies will be divided, based on the max_copy_sectors and
> > max_copy_range_sectors limits.
> >
> > Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
> > Signed-off-by: Arnav Dawn <arnav.dawn@xxxxxxxxxxx>
> > ---
> >  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
> >  block/blk.h               |   2 +
> >  include/linux/blk_types.h |  21 ++++
> >  include/linux/blkdev.h    |   2 +
> >  include/uapi/linux/fs.h   |  14 +++
> >  5 files changed, 271 insertions(+)
> >
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index 09b7e1200c0f..ba9da2d2f429 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  }
> >  EXPORT_SYMBOL(blkdev_issue_discard);
> > +/*
> > + * Wait on and process all in-flight BIOs. This must only be called once
> > + * all bios have been issued so that the refcount can only decrease.
> > + * This just waits for all bios to make it through bio_copy_end_io. IO
> > + * errors are propagated through cio->io_err.
> > + */
> > +static int cio_await_completion(struct cio *cio)
> > +{
> > +	int ret = 0;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (cio->refcount) {
> > +		cio->waiter = current;
> > +		__set_current_state(TASK_UNINTERRUPTIBLE);
> > +		spin_unlock_irqrestore(&cio->lock, flags);
> > +		blk_io_schedule();
> > +		/* wake up sets us TASK_RUNNING */
> > +		spin_lock_irqsave(&cio->lock, flags);
> > +		cio->waiter = NULL;
> > +		ret = cio->io_err;
> > +	}
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	kvfree(cio);
> > +
> > +	return ret;
> > +}
> > +
> > +static void bio_copy_end_io(struct bio *bio)
> > +{
> > +	struct copy_ctx *ctx = bio->bi_private;
> > +	struct cio *cio = ctx->cio;
> > +	sector_t clen;
> > +	int ri = ctx->range_idx;
> > +	unsigned long flags;
> > +	bool wake = false;
> > +
> > +	if (bio->bi_status) {
> > +		cio->io_err = bio->bi_status;
> > +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> > +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> > +	}
> > +	__free_page(bio->bi_io_vec[0].bv_page);
> > +	kfree(ctx);
> > +	bio_put(bio);
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (((--cio->refcount) <= 0) && cio->waiter)
> > +		wake = true;
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	if (wake)
> > +		wake_up_process(cio->waiter);
> > +}
> > +
> > +/*
> > + * blk_copy_offload - Use the device's native copy offload feature.
> > + * Go through the user-provided payload and prepare a new payload
> > + * based on the device's copy offload limits.
> > + */
> > +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> > +{
> > +	struct request_queue *sq = bdev_get_queue(src_bdev);
> > +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> > +	struct bio *read_bio, *write_bio;
> > +	struct copy_ctx *ctx;
> > +	struct cio *cio;
> > +	struct page *token;
> > +	sector_t src_blk, copy_len, dst_blk;
> > +	sector_t remaining, max_copy_len = LONG_MAX;
> > +	unsigned long flags;
> > +	int ri = 0, ret = 0;
> > +
> > +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> > +	if (!cio)
> > +		return -ENOMEM;
> > +	cio->rlist = rlist;
> > +	spin_lock_init(&cio->lock);
> > +
> > +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> > +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> > +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> > +
> > +	for (ri = 0; ri < nr_srcs; ri++) {
> > +		cio->rlist[ri].comp_len = rlist[ri].len;
> > +		src_blk = rlist[ri].src;
> > +		dst_blk = rlist[ri].dst;
> > +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> > +			copy_len = min(remaining, max_copy_len);
> > +
> > +			token = alloc_page(gfp_mask);
> > +			if (unlikely(!token)) {
> > +				ret = -ENOMEM;
> > +				goto err_token;
> > +			}
> > +
> > +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> > +			if (!ctx) {
> > +				ret = -ENOMEM;
> > +				goto err_ctx;
> > +			}
> > +			ctx->cio = cio;
> > +			ctx->range_idx = ri;
> > +			ctx->start_sec = dst_blk;
> > +
> > +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!read_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> > +			/* __bio_add_page increases bi_size by len, so overwrite it with copy len */
> > +			read_bio->bi_iter.bi_size = copy_len;
> > +			ret = submit_bio_wait(read_bio);
> > +			bio_put(read_bio);
> > +			if (ret)
> > +				goto err_read_bio;
> > +
> > +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!write_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> > +			/* __bio_add_page increases bi_size by len, so overwrite it with copy len */
> > +			write_bio->bi_iter.bi_size = copy_len;
> > +			write_bio->bi_end_io = bio_copy_end_io;
> > +			write_bio->bi_private = ctx;
> > +
> > +			spin_lock_irqsave(&cio->lock, flags);
> > +			++cio->refcount;
> > +			spin_unlock_irqrestore(&cio->lock, flags);
> > +
> > +			submit_bio(write_bio);
> > +			src_blk += copy_len;
> > +			dst_blk += copy_len;
> > +		}
> > +	}
> > +
> 
> Hmm. I'm not sure if I like the copy loop.
> What I definitely would do is allocate the write bio before reading
> data; after all, if we can't allocate the write bio, reading is pretty
> much pointless.
> 
> But the real issue I have with this is that it's doing synchronous
> reads, thereby limiting performance.
> 
> Can't you submit the write bio from the end_io function of the read bio?
> That would disentangle things, and we should get better performance.
> 

Agreed, that will make the code more efficient.

--
Thank you
Nitesh Shetty
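
For concreteness, the chaining suggested above might look roughly like the
sketch below. This is an untested illustration, not part of the posted
patch: it assumes a new write_bio member in struct copy_ctx and a
hypothetical read-side completion handler bio_copy_read_end_io(), and it
reuses the existing bio_copy_end_io() (via bio_io_error()) to unwind the
write side when the read fails.

/*
 * Untested sketch: allocate both bios up front, stash the write bio in
 * the (hypothetical) ctx->write_bio, and let the read completion kick
 * off the write instead of blocking in submit_bio_wait().
 */
static void bio_copy_read_end_io(struct bio *read_bio)
{
	struct copy_ctx *ctx = read_bio->bi_private;
	struct bio *write_bio = ctx->write_bio;

	if (read_bio->bi_status) {
		/*
		 * Fail the paired write without issuing it. bio_io_error()
		 * ends it with BLK_STS_IOERR, so the existing
		 * bio_copy_end_io() records the error, frees the token
		 * page and ctx, and drops the refcount.
		 */
		bio_put(read_bio);
		bio_io_error(write_bio);
		return;
	}
	bio_put(read_bio);

	/* The token now carries the source information; chain the write. */
	submit_bio(write_bio);
}

In the issue loop, both bios would then be prepared before anything is
submitted, and the submit_bio_wait() call goes away:

	ctx->write_bio = write_bio;
	read_bio->bi_end_io = bio_copy_read_end_io;
	read_bio->bi_private = ctx;

	spin_lock_irqsave(&cio->lock, flags);
	++cio->refcount;
	spin_unlock_irqrestore(&cio->lock, flags);

	submit_bio(read_bio);	/* the read completion submits the write */

cio_await_completion() should still work unchanged with this scheme: the
refcount is taken once per read/write pair and dropped exactly once in
bio_copy_end_io(), whether the write completes normally or is failed from
the read path.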