From: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>

Introduce REQ_OP_COPY, a no-merge copy offload operation. A bio is
created with the copy control information as its payload and submitted
to the device. Larger copy operations are split if necessary, based on
the device's copy limits. REQ_OP_COPY (19) is a write op and takes the
zone write lock when submitted to a zoned device.

Native copy offload is not supported for stacked devices.

Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
Signed-off-by: SelvaKumar S <selvakuma.s1@xxxxxxxxxxx>
---
 block/blk-core.c          |  84 ++++++++++++-
 block/blk-lib.c           | 252 ++++++++++++++++++++++++++++++++++++++
 block/blk-zoned.c         |   1 +
 block/bounce.c            |   1 +
 include/linux/bio.h       |   1 +
 include/linux/blk_types.h |  20 +++
 include/linux/blkdev.h    |  13 ++
 include/uapi/linux/fs.h   |  12 ++
 8 files changed, 378 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d2722ecd4d9b..541b1561b4af 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -704,6 +704,17 @@ static noinline int should_fail_bio(struct bio *bio)
 }
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 
+static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
+                sector_t nr_sectors, sector_t max_sect)
+{
+        if (nr_sectors && max_sect &&
+            (nr_sectors > max_sect || start > max_sect - nr_sectors)) {
+                handle_bad_sector(bio, max_sect);
+                return -EIO;
+        }
+        return 0;
+}
+
 /*
  * Check whether this bio extends beyond the end of the device or partition.
  * This may well happen - the kernel calls bread() without checking the size of
@@ -723,6 +734,61 @@ static inline int bio_check_eod(struct bio *bio)
         return 0;
 }
 
+/*
+ * Check for end-of-device limits and remap the ranges if needed.
+ */
+static int blk_check_copy(struct bio *bio)
+{
+        struct blk_copy_payload *payload = bio_data(bio);
+        sector_t dst_max_sect, dst_start_sect, copy_size = 0;
+        sector_t src_max_sect, src_start_sect;
+        struct block_device *bd_part;
+        int i, ret = -EIO;
+
+        rcu_read_lock();
+
+        bd_part = bio->bi_bdev;
+        if (unlikely(!bd_part))
+                goto err;
+
+        dst_max_sect = bdev_nr_sectors(bd_part);
+        dst_start_sect = bd_part->bd_start_sect;
+
+        src_max_sect = bdev_nr_sectors(payload->src_bdev);
+        src_start_sect = payload->src_bdev->bd_start_sect;
+
+        if (unlikely(should_fail_request(bd_part, bio->bi_iter.bi_size)))
+                goto err;
+
+        if (unlikely(bio_check_ro(bio)))
+                goto err;
+
+        rcu_read_unlock();
+
+        for (i = 0; i < payload->copy_nr_ranges; i++) {
+                ret = bio_check_copy_eod(bio, payload->range[i].src,
+                                payload->range[i].len, src_max_sect);
+                if (unlikely(ret))
+                        goto out;
+
+                payload->range[i].src += src_start_sect;
+                copy_size += payload->range[i].len;
+        }
+
+        /* check if the total copy length crosses the end of the device */
+        ret = bio_check_copy_eod(bio, bio->bi_iter.bi_sector,
+                        copy_size, dst_max_sect);
+        if (unlikely(ret))
+                goto out;
+
+        bio->bi_iter.bi_sector += dst_start_sect;
+        return 0;
+err:
+        rcu_read_unlock();
+out:
+        return ret;
+}
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -799,13 +865,15 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
         if (should_fail_bio(bio))
                 goto end_io;
 
-        if (unlikely(bio_check_ro(bio)))
-                goto end_io;
-        if (!bio_flagged(bio, BIO_REMAPPED)) {
-                if (unlikely(bio_check_eod(bio)))
-                        goto end_io;
-                if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+        if (likely(!op_is_copy(bio->bi_opf))) {
+                if (unlikely(bio_check_ro(bio)))
                         goto end_io;
+                if (!bio_flagged(bio, BIO_REMAPPED)) {
+                        if (unlikely(bio_check_eod(bio)))
+                                goto end_io;
+                        if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+                                goto end_io;
+                }
         }
 
         /*
@@ -829,6 +897,10 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
                 if (!blk_queue_discard(q))
                         goto not_supported;
                 break;
+        case REQ_OP_COPY:
+                if (unlikely(blk_check_copy(bio)))
+                        goto end_io;
+                break;
         case REQ_OP_SECURE_ERASE:
                 if (!blk_queue_secure_erase(q))
                         goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f09beadcbe3..7fee0ae95c44 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -151,6 +151,258 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/*
+ * Wait on and process all in-flight BIOs. This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through cio_bio_end_io. I/O
+ * errors are propagated through cio->io_err.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+        int ret = 0;
+
+        while (atomic_read(&cio->refcount)) {
+                cio->waiter = current;
+                __set_current_state(TASK_UNINTERRUPTIBLE);
+                blk_io_schedule();
+                /* wake up sets us TASK_RUNNING */
+                cio->waiter = NULL;
+                ret = cio->io_err;
+        }
+        kvfree(cio);
+
+        return ret;
+}
+
+/*
+ * The BIO completion handler simply decrements the refcount and wakes
+ * up the waiting process if this was the last bio to complete.
+ *
+ * During I/O, bi_private points at the cio.
+ */
+static void cio_bio_end_io(struct bio *bio)
+{
+        struct cio *cio = bio->bi_private;
+
+        if (bio->bi_status)
+                cio->io_err = bio->bi_status;
+        kvfree(page_address(bio_first_bvec_all(bio)->bv_page) +
+                        bio_first_bvec_all(bio)->bv_offset);
+        bio_put(bio);
+
+        if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
+                wake_up_process(cio->waiter);
+}
+
+int blk_copy_offload_submit_bio(struct block_device *bdev,
+                struct blk_copy_payload *payload, int payload_size,
+                struct cio *cio, gfp_t gfp_mask)
+{
+        struct request_queue *q = bdev_get_queue(bdev);
+        struct bio *bio;
+
+        bio = bio_map_kern(q, payload, payload_size, gfp_mask);
+        if (IS_ERR(bio))
+                return PTR_ERR(bio);
+
+        bio_set_dev(bio, bdev);
+        bio->bi_opf = REQ_OP_COPY | REQ_NOMERGE;
+        bio->bi_iter.bi_sector = payload->dest;
+        bio->bi_end_io = cio_bio_end_io;
+        bio->bi_private = cio;
+        atomic_inc(&cio->refcount);
+        submit_bio(bio);
+
+        return 0;
+}
+
+/* Go through all the entries in the user-provided payload and determine the
+ * maximum number of entries a payload can hold, based on the device's SCC limits.
+ */
+static inline int blk_max_payload_entries(int nr_srcs, struct range_entry *rlist,
+                int max_nr_srcs, sector_t max_copy_range_sectors, sector_t max_copy_len)
+{
+        sector_t range_len, copy_len = 0, remaining = 0;
+        int ri = 0, pi = 1, max_pi = 0;
+
+        for (ri = 0; ri < nr_srcs; ri++) {
+                for (remaining = rlist[ri].len; remaining > 0; remaining -= range_len) {
+                        range_len = min3(remaining, max_copy_range_sectors,
+                                        max_copy_len - copy_len);
+                        pi++;
+                        copy_len += range_len;
+
+                        if ((pi == max_nr_srcs) || (copy_len == max_copy_len)) {
+                                max_pi = max(max_pi, pi);
+                                pi = 1;
+                                copy_len = 0;
+                        }
+                }
+        }
+
+        return max(max_pi, pi);
+}
+
+/*
+ * blk_copy_offload_scc - use the device's native copy offload feature.
+ * Go through the user-provided payload and prepare new payloads based on the device's copy offload limits.
+ */
+int blk_copy_offload_scc(struct block_device *src_bdev, int nr_srcs,
+                struct range_entry *rlist, struct block_device *dest_bdev,
+                sector_t dest, gfp_t gfp_mask)
+{
+        struct request_queue *q = bdev_get_queue(dest_bdev);
+        struct cio *cio = NULL;
+        struct blk_copy_payload *payload;
+        sector_t range_len, copy_len = 0, remaining = 0;
+        sector_t src_blk, cdest = dest;
+        sector_t max_copy_range_sectors, max_copy_len;
+        int ri = 0, pi = 0, ret = 0, payload_size, max_pi, max_nr_srcs;
+
+        cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+        if (!cio)
+                return -ENOMEM;
+        atomic_set(&cio->refcount, 0);
+
+        max_nr_srcs = q->limits.max_copy_nr_ranges;
+        max_copy_range_sectors = q->limits.max_copy_range_sectors;
+        max_copy_len = q->limits.max_copy_sectors;
+
+        max_pi = blk_max_payload_entries(nr_srcs, rlist, max_nr_srcs,
+                        max_copy_range_sectors, max_copy_len);
+        payload_size = struct_size(payload, range, max_pi);
+
+        payload = kvmalloc(payload_size, gfp_mask);
+        if (!payload) {
+                ret = -ENOMEM;
+                goto free_cio;
+        }
+        payload->src_bdev = src_bdev;
+
+        for (ri = 0; ri < nr_srcs; ri++) {
+                for (remaining = rlist[ri].len, src_blk = rlist[ri].src; remaining > 0;
+                                remaining -= range_len, src_blk += range_len) {
+
+                        range_len = min3(remaining, max_copy_range_sectors,
+                                        max_copy_len - copy_len);
+                        payload->range[pi].len = range_len;
+                        payload->range[pi].src = src_blk;
+                        pi++;
+                        copy_len += range_len;
+
+                        /* submit the current payload if it hits the device copy limits */
+                        if ((pi == max_nr_srcs) || (copy_len == max_copy_len)) {
+                                payload->dest = cdest;
+                                payload->copy_nr_ranges = pi;
+                                ret = blk_copy_offload_submit_bio(dest_bdev, payload,
+                                                payload_size, cio, gfp_mask);
+                                if (ret)
+                                        goto free_payload;
+
+                                /* reset the index and length, and allocate a new payload */
+                                pi = 0;
+                                cdest += copy_len;
+                                copy_len = 0;
+                                payload = kvmalloc(payload_size, gfp_mask);
+                                if (!payload) {
+                                        ret = -ENOMEM;
+                                        goto free_cio;
+                                }
+                                payload->src_bdev = src_bdev;
+                        }
+                }
+        }
+
+        if (pi) {
+                payload->dest = cdest;
+                payload->copy_nr_ranges = pi;
+                ret = blk_copy_offload_submit_bio(dest_bdev, payload, payload_size, cio, gfp_mask);
+                if (ret)
+                        goto free_payload;
+        }
+
+        /* wait for completion of all I/Os */
+        ret = cio_await_completion(cio);
+
+        return ret;
+
+free_payload:
+        kvfree(payload);
+free_cio:
+        cio_await_completion(cio);
+        return ret;
+}
+
+static inline sector_t blk_copy_len(struct range_entry *rlist, int nr_srcs)
+{
+        int i;
+        sector_t len = 0;
+
+        for (i = 0; i < nr_srcs; i++) {
+                if (rlist[i].len)
+                        len += rlist[i].len;
+                else
+                        return 0;
+        }
+
+        return len;
+}
+
+static inline bool blk_check_offload_scc(struct request_queue *src_q,
+                struct request_queue *dest_q)
+{
+        if (src_q == dest_q && src_q->limits.copy_offload ==
+                                BLK_COPY_OFFLOAD_SCC)
+                return true;
+
+        return false;
+}
+
+/**
+ * blkdev_issue_copy - queue a copy
+ * @src_bdev:   source block device
+ * @nr_srcs:    number of source ranges to copy
+ * @src_rlist:  array of source ranges
+ * @dest_bdev:  destination block device
+ * @dest:       destination sector
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ * @flags:      BLKDEV_COPY_* flags to control behaviour
+ *
+ * Description:
+ *      Copy the source ranges from the source block device to the destination
+ *      block device. The length of a source range cannot be zero.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+                struct range_entry *src_rlist, struct block_device *dest_bdev,
+                sector_t dest, gfp_t gfp_mask, int flags)
+{
+        struct request_queue *src_q = bdev_get_queue(src_bdev);
+        struct request_queue *dest_q = bdev_get_queue(dest_bdev);
+        sector_t copy_len;
+        int ret = -EINVAL;
+
+        if (!src_q || !dest_q)
+                return -ENXIO;
+
+        if (!nr_srcs)
+                return -EINVAL;
+
+        if (nr_srcs >= MAX_COPY_NR_RANGE)
+                return -EINVAL;
+
+        copy_len = blk_copy_len(src_rlist, nr_srcs);
+        if (!copy_len || copy_len >= MAX_COPY_TOTAL_LENGTH)
+                return -EINVAL;
+
+        if (bdev_read_only(dest_bdev))
+                return -EPERM;
+
+        if (blk_check_offload_scc(src_q, dest_q))
+                ret = blk_copy_offload_scc(src_bdev, nr_srcs, src_rlist, dest_bdev, dest, gfp_mask);
+
+        return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
 /**
  * __blkdev_issue_write_same - generate number of bios with same page
  * @bdev:       target blockdev
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 86fce751bb17..7643fc868521 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -67,6 +67,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
         case REQ_OP_WRITE_ZEROES:
         case REQ_OP_WRITE_SAME:
         case REQ_OP_WRITE:
+        case REQ_OP_COPY:
                 return blk_rq_zone_is_seq(rq);
         default:
                 return false;
diff --git a/block/bounce.c b/block/bounce.c
index 05fc7148489d..d9b05aaf6e56 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
         bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
 
         switch (bio_op(bio)) {
+        case REQ_OP_COPY:
         case REQ_OP_DISCARD:
         case REQ_OP_SECURE_ERASE:
         case REQ_OP_WRITE_ZEROES:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3d67d0fbc868..068fa2e8896a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -73,6 +73,7 @@ static inline bool bio_has_data(struct bio *bio)
 static inline bool bio_no_advance_iter(const struct bio *bio)
 {
         return bio_op(bio) == REQ_OP_DISCARD ||
+               bio_op(bio) == REQ_OP_COPY ||
                bio_op(bio) == REQ_OP_SECURE_ERASE ||
                bio_op(bio) == REQ_OP_WRITE_SAME ||
                bio_op(bio) == REQ_OP_WRITE_ZEROES;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9e392daa1d7f..1ab77176cb46 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -347,6 +347,8 @@ enum req_opf {
         REQ_OP_ZONE_RESET       = 15,
         /* reset all the zone present on the device */
         REQ_OP_ZONE_RESET_ALL   = 17,
+        /* copy ranges within device */
+        REQ_OP_COPY             = 19,
 
         /* Driver private requests */
         REQ_OP_DRV_IN           = 34,
@@ -470,6 +472,11 @@ static inline bool op_is_discard(unsigned int op)
         return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
 
+static inline bool op_is_copy(unsigned int op)
+{
+        return (op & REQ_OP_MASK) == REQ_OP_COPY;
+}
+
 /*
  * Check if a bio or request operation is a zone management operation, with
  * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
@@ -529,4 +536,17 @@ struct blk_rq_stat {
         u64 batch;
 };
 
+struct cio {
+        atomic_t refcount;
+        blk_status_t io_err;
+        struct task_struct *waiter;     /* waiting task (NULL if none) */
+};
+
+struct blk_copy_payload {
+        struct block_device     *src_bdev;
+        sector_t                dest;
+        int                     copy_nr_ranges;
+        struct range_entry      range[];
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fd4cfaadda5b..38369dff6a36 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -52,6 +52,12 @@ struct blk_keyslot_manager;
 /* Doing classic polling */
 #define BLK_MQ_POLL_CLASSIC -1
 
+/* Define copy offload options */
+enum blk_copy {
+        BLK_COPY_OFFLOAD_EMULATE = 0,
+        BLK_COPY_OFFLOAD_SCC,
+};
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -1051,6 +1057,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                 return min(q->limits.max_discard_sectors,
                            UINT_MAX >> SECTOR_SHIFT);
 
+        if (unlikely(op == REQ_OP_COPY))
+                return q->limits.max_copy_sectors;
+
         if (unlikely(op == REQ_OP_WRITE_SAME))
                 return q->limits.max_write_same_sectors;
@@ -1326,6 +1335,10 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                 sector_t nr_sects, gfp_t gfp_mask, int flags,
                 struct bio **biop);
 
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+                struct range_entry *src_rlist, struct block_device *dest_bdev,
+                sector_t dest, gfp_t gfp_mask, int flags);
+
 #define BLKDEV_ZERO_NOUNMAP     (1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK  (1 << 1)  /* don't write explicit zeroes */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index bdf7b404b3e7..7a97b588d892 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,18 @@ struct fstrim_range {
         __u64 minlen;
 };
 
+/* Maximum number of entries supported */
+#define MAX_COPY_NR_RANGE       (1 << 12)
+
+/* maximum total copy length */
+#define MAX_COPY_TOTAL_LENGTH   (1 << 21)
+
+/* Source range entry for copy */
+struct range_entry {
+        __u64 src;
+        __u64 len;
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME          0
 #define FILE_DEDUPE_RANGE_DIFFERS       1
-- 
2.25.1
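
A short usage illustration, not part of the patch. With the code above, a copy
request is split into payloads according to the destination queue's limits:
assuming, say, max_copy_nr_ranges = 2, max_copy_sectors = 32 and a large
max_copy_range_sectors, source ranges of 16, 24 and 8 sectors are issued as two
payloads, {16, 16} followed by {8, 8}, and the destination sector is advanced
by the copied length between submissions.

Below is a minimal sketch of an in-kernel caller of the new interface; the
helper name and the sector values are made up, and with this patch the call
only succeeds when source and destination share a request queue that reports
BLK_COPY_OFFLOAD_SCC (otherwise -EINVAL is returned):

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Illustrative only: copy two source ranges to sector 4096 of the same device. */
static int example_simple_copy(struct block_device *bdev)
{
        struct range_entry ranges[] = {
                { .src = 0,    .len = 8  },     /* sectors 0..7 */
                { .src = 1024, .len = 16 },     /* sectors 1024..1039 */
        };

        /* no BLKDEV_COPY_* flags; offsets and lengths are in sectors */
        return blkdev_issue_copy(bdev, ARRAY_SIZE(ranges), ranges,
                                 bdev, 4096, GFP_KERNEL, 0);
}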