On Wed, Apr 27, 2022 at 12:29:15PM +0200, Hannes Reinecke wrote:
> On 4/26/22 12:12, Nitesh Shetty wrote:
> > Introduce blkdev_issue_copy, which supports source and destination bdevs
> > and an array of (source, destination, copy length) tuples.
> > Introduce the REQ_COPY copy offload operation flag. Create a read-write
> > bio pair with a token as payload and submit them to the device in order.
> > The read request populates the token with source-specific information,
> > which is then passed along with the write request.
> > This design is courtesy of Mikulas Patocka's token-based copy.
> >
> > Larger copies will be divided, based on the max_copy_sectors and
> > max_copy_range_sectors limits.
> >
> > Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
> > Signed-off-by: Arnav Dawn <arnav.dawn@xxxxxxxxxxx>
> > ---
> >  block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
> >  block/blk.h               |   2 +
> >  include/linux/blk_types.h |  21 ++++
> >  include/linux/blkdev.h    |   2 +
> >  include/uapi/linux/fs.h   |  14 +++
> >  5 files changed, 271 insertions(+)
> >
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index 09b7e1200c0f..ba9da2d2f429 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  }
> >  EXPORT_SYMBOL(blkdev_issue_discard);
> > +/*
> > + * Wait on and process all in-flight BIOs. This must only be called once
> > + * all bios have been issued so that the refcount can only decrease.
> > + * This just waits for all bios to make it through bio_copy_end_io. IO
> > + * errors are propagated through cio->io_err.
> > + */
> > +static int cio_await_completion(struct cio *cio)
> > +{
> > +	int ret = 0;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (cio->refcount) {
> > +		cio->waiter = current;
> > +		__set_current_state(TASK_UNINTERRUPTIBLE);
> > +		spin_unlock_irqrestore(&cio->lock, flags);
> > +		blk_io_schedule();
> > +		/* wake up sets us TASK_RUNNING */
> > +		spin_lock_irqsave(&cio->lock, flags);
> > +		cio->waiter = NULL;
> > +		ret = cio->io_err;
> > +	}
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	kvfree(cio);
> > +
> > +	return ret;
> > +}
> > +
> > +static void bio_copy_end_io(struct bio *bio)
> > +{
> > +	struct copy_ctx *ctx = bio->bi_private;
> > +	struct cio *cio = ctx->cio;
> > +	sector_t clen;
> > +	int ri = ctx->range_idx;
> > +	unsigned long flags;
> > +	bool wake = false;
> > +
> > +	if (bio->bi_status) {
> > +		cio->io_err = bio->bi_status;
> > +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> > +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> > +	}
> > +	__free_page(bio->bi_io_vec[0].bv_page);
> > +	kfree(ctx);
> > +	bio_put(bio);
> > +
> > +	spin_lock_irqsave(&cio->lock, flags);
> > +	if (((--cio->refcount) <= 0) && cio->waiter)
> > +		wake = true;
> > +	spin_unlock_irqrestore(&cio->lock, flags);
> > +	if (wake)
> > +		wake_up_process(cio->waiter);
> > +}
> > +
> > +/*
> > + * blk_copy_offload - Use the device's native copy offload feature.
> > + * Go through the user-provided payload and prepare a new payload
> > + * based on the device's copy offload limits.
> > + */
> > +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> > +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> > +{
> > +	struct request_queue *sq = bdev_get_queue(src_bdev);
> > +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> > +	struct bio *read_bio, *write_bio;
> > +	struct copy_ctx *ctx;
> > +	struct cio *cio;
> > +	struct page *token;
> > +	sector_t src_blk, copy_len, dst_blk;
> > +	sector_t remaining, max_copy_len = LONG_MAX;
> > +	unsigned long flags;
> > +	int ri = 0, ret = 0;
> > +
> > +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> > +	if (!cio)
> > +		return -ENOMEM;
> > +	cio->rlist = rlist;
> > +	spin_lock_init(&cio->lock);
> > +
> > +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> > +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> > +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> > +
> > +	for (ri = 0; ri < nr_srcs; ri++) {
> > +		cio->rlist[ri].comp_len = rlist[ri].len;
> > +		src_blk = rlist[ri].src;
> > +		dst_blk = rlist[ri].dst;
> > +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> > +			copy_len = min(remaining, max_copy_len);
> > +
> > +			token = alloc_page(gfp_mask);
> > +			if (unlikely(!token)) {
> > +				ret = -ENOMEM;
> > +				goto err_token;
> > +			}
> > +
> > +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> > +			if (!ctx) {
> > +				ret = -ENOMEM;
> > +				goto err_ctx;
> > +			}
> > +			ctx->cio = cio;
> > +			ctx->range_idx = ri;
> > +			ctx->start_sec = dst_blk;
> > +
> > +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!read_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> > +			/* __bio_add_page increases bi_size by len, so overwrite it with copy len */
> > +			read_bio->bi_iter.bi_size = copy_len;
> > +			ret = submit_bio_wait(read_bio);
> > +			bio_put(read_bio);
> > +			if (ret)
> > +				goto err_read_bio;
> > +
> > +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> > +					gfp_mask);
> > +			if (!write_bio) {
> > +				ret = -ENOMEM;
> > +				goto err_read_bio;
> > +			}
> > +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> > +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> > +			/* __bio_add_page increases bi_size by len, so overwrite it with copy len */
> > +			write_bio->bi_iter.bi_size = copy_len;
> > +			write_bio->bi_end_io = bio_copy_end_io;
> > +			write_bio->bi_private = ctx;
> > +
> > +			spin_lock_irqsave(&cio->lock, flags);
> > +			++cio->refcount;
> > +			spin_unlock_irqrestore(&cio->lock, flags);
> > +
> > +			submit_bio(write_bio);
> > +			src_blk += copy_len;
> > +			dst_blk += copy_len;
> > +		}
> > +	}
> > +
> 
> Hmm. I'm not sure if I like the copy loop.
> What I definitely would do is allocate the write bio before reading
> data; after all, if we can't allocate the write bio, reading is pretty
> much pointless.
> 
> But the real issue I have with this is that it's doing synchronous
> reads, thereby limiting performance.
> 
> Can't you submit the write bio from the end_io function of the read bio?
> That would disentangle things, and we should get better performance.
> 

Agreed, that will make the code more efficient.

--
Thank you
Nitesh Shetty
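
For concreteness, the chaining suggested above might look roughly like the
sketch below. This is an untested illustration, not part of the posted
patch: it assumes a new write_bio member in struct copy_ctx and a
hypothetical read-side completion handler bio_copy_read_end_io(), and it
reuses the existing bio_copy_end_io() (via bio_io_error()) to unwind the
write side when the read fails.

/*
 * Untested sketch: allocate both bios up front, stash the write bio in
 * the (hypothetical) ctx->write_bio, and let the read completion kick
 * off the write instead of blocking in submit_bio_wait().
 */
static void bio_copy_read_end_io(struct bio *read_bio)
{
	struct copy_ctx *ctx = read_bio->bi_private;
	struct bio *write_bio = ctx->write_bio;

	if (read_bio->bi_status) {
		/*
		 * Fail the paired write without issuing it. bio_io_error()
		 * ends it with BLK_STS_IOERR, so the existing
		 * bio_copy_end_io() records the error, frees the token
		 * page and ctx, and drops the refcount.
		 */
		bio_put(read_bio);
		bio_io_error(write_bio);
		return;
	}
	bio_put(read_bio);

	/* The token now carries the source information; chain the write. */
	submit_bio(write_bio);
}

In the issue loop, both bios would then be prepared before anything is
submitted, and the submit_bio_wait() call goes away:

	ctx->write_bio = write_bio;
	read_bio->bi_end_io = bio_copy_read_end_io;
	read_bio->bi_private = ctx;

	spin_lock_irqsave(&cio->lock, flags);
	++cio->refcount;
	spin_unlock_irqrestore(&cio->lock, flags);

	submit_bio(read_bio);	/* the read completion submits the write */

cio_await_completion() should still work unchanged with this scheme: the
refcount is taken once per read/write pair and dropped exactly once in
bio_copy_end_io(), whether the write completes normally or is failed from
the read path.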