On Thu, Jun 30, 2022 at 02:14:01AM -0700, Chaitanya Kulkarni wrote:
> This adds a new block layer operation to offload verifying a range of
> LBAs. This support is needed in order to provide file systems and
> fabrics, kernel components to offload LBA verification when it is
> supported by the hardware controller. In case hardware offloading is
> not supported then we provide API to emulate the same. The prominent
> example of that is SCSI and NVMe Verify command. We also provide
> an emulation of the same operation that can be used in case H/W does
> not support verify. This is still useful when block device is remotely
> attached e.g. using NVMeOF.
>
> Signed-off-by: Chaitanya Kulkarni <kch@xxxxxxxxxx>
> ---
>  Documentation/ABI/stable/sysfs-block |  12 +++
>  block/blk-core.c                     |   5 +
>  block/blk-lib.c                      | 155 +++++++++++++++++++++++++++
>  block/blk-merge.c                    |  18 ++++
>  block/blk-settings.c                 |  17 +++
>  block/blk-sysfs.c                    |   8 ++
>  block/blk.h                          |   4 +
>  block/ioctl.c                        |  35 ++++++
>  include/linux/bio.h                  |   9 +-
>  include/linux/blk_types.h            |   2 +
>  include/linux/blkdev.h               |  22 ++++
>  include/uapi/linux/fs.h              |   1 +
>  12 files changed, 285 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
> index e8797cd09aff..a71d9c41cf8b 100644
> --- a/Documentation/ABI/stable/sysfs-block
> +++ b/Documentation/ABI/stable/sysfs-block
> @@ -657,6 +657,18 @@ Description:
> 		in a single write zeroes command. If write_zeroes_max_bytes is
> 		0, write zeroes is not supported by the device.
>
> +What:		/sys/block/<disk>/queue/verify_max_bytes
> +Date:		April 2022
> +Contact:	Chaitanya Kulkarni <kch@xxxxxxxxxx>
> +Description:
> +		Devices that support verify operation in which a single
> +		request can be issued to verify the range of the contiguous
> +		blocks on the storage without any payload in the request.
> +		This can be used to optimize verifying LBAs on the device
> +		without reading by offloading functionality. verify_max_bytes
> +		indicates how many bytes can be written in a single verify
> +		command. If verify_max_bytes is 0, verify operation is not
> +		supported by the device.
> > What: /sys/block/<disk>/queue/zone_append_max_bytes > Date: May 2020 > diff --git a/block/blk-core.c b/block/blk-core.c > index 06ff5bbfe8f6..9ad52247dcdf 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -123,6 +123,7 @@ static const char *const blk_op_name[] = { > REQ_OP_NAME(ZONE_FINISH), > REQ_OP_NAME(ZONE_APPEND), > REQ_OP_NAME(WRITE_ZEROES), > + REQ_OP_NAME(VERIFY), > REQ_OP_NAME(DRV_IN), > REQ_OP_NAME(DRV_OUT), > }; > @@ -842,6 +843,10 @@ void submit_bio_noacct(struct bio *bio) > if (!q->limits.max_write_zeroes_sectors) > goto not_supported; > break; > + case REQ_OP_VERIFY: > + if (!q->limits.max_verify_sectors) > + goto not_supported; > + break; > default: > break; > } > diff --git a/block/blk-lib.c b/block/blk-lib.c > index 09b7e1200c0f..4624d68bb3cb 100644 > --- a/block/blk-lib.c > +++ b/block/blk-lib.c > @@ -340,3 +340,158 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, > return ret; > } > EXPORT_SYMBOL(blkdev_issue_secure_erase); > + > +/** > + * __blkdev_emulate_verify - emulate number of verify operations > + * asynchronously > + * @bdev: blockdev to issue > + * @sector: start sector > + * @nr_sects: number of sectors to verify > + * @gfp_mask: memory allocation flags (for bio_alloc) > + * @biop: pointer to anchor bio > + * @buf: data buffer to mapped on bio > + * > + * Description: > + * Verify a block range by emulating REQ_OP_VERIFY into REQ_OP_READ, > + * use this when H/W offloading is not supported asynchronously. > + * Caller is responsible to handle anchored bio. > + */ > +static int __blkdev_emulate_verify(struct block_device *bdev, sector_t sector, > + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, char *buf) > +{ > + struct bio *bio = *biop; > + unsigned int sz; > + int bi_size; > + > + while (nr_sects != 0) { > + bio = blk_next_bio(bio, bdev, > + __blkdev_sectors_to_bio_pages(nr_sects), > + REQ_OP_READ, gfp_mask); > + bio->bi_iter.bi_sector = sector; > + > + while (nr_sects != 0) { > + bool is_vaddr = is_vmalloc_addr(buf); > + struct page *p; > + > + p = is_vaddr ? vmalloc_to_page(buf) : virt_to_page(buf); > + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); > + > + bi_size = bio_add_page(bio, p, sz, offset_in_page(buf)); > + if (bi_size < sz) > + return -EIO; > + > + nr_sects -= bi_size >> 9; > + sector += bi_size >> 9; > + buf += bi_size; > + } > + cond_resched(); > + } > + > + *biop = bio; > + return 0; > +} > + > +/** > + * __blkdev_issue_verify - generate number of verify operations > + * @bdev: blockdev to issue > + * @sector: start sector > + * @nr_sects: number of sectors to verify > + * @gfp_mask: memory allocation flags (for bio_alloc()) > + * @biop: pointer to anchor bio > + * > + * Description: > + * Verify a block range using hardware offload. > + * > + * The function will emulate verify operation if no explicit hardware > + * offloading for verifying is provided. > + */ > +int __blkdev_issue_verify(struct block_device *bdev, sector_t sector, > + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) > +{ > + unsigned int max_verify_sectors = bdev_verify_sectors(bdev); > + sector_t min_io_sect = (BIO_MAX_VECS << PAGE_SHIFT) >> 9; > + struct bio *bio = *biop; > + sector_t curr_sects; > + char *buf; > + > + if (!max_verify_sectors) { > + int ret = 0; > + > + buf = kzalloc(min_io_sect << 9, GFP_KERNEL); k*z*alloc? I don't think you need to zero a buffer that we're reading into, right? 
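Untested sketch of what I mean (and, as a minor nit, the caller already
passes in a gfp_mask that could be used here rather than hardcoding
GFP_KERNEL):

	/*
	 * The buffer is only ever read into by the emulated REQ_OP_READ
	 * bios, so there's no need to zero it up front.
	 */
	buf = kmalloc(min_io_sect << 9, gfp_mask);
	if (!buf)
		return -ENOMEM;
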
--D > + if (!buf) > + return -ENOMEM; > + > + while (nr_sects > 0) { > + curr_sects = min_t(sector_t, nr_sects, min_io_sect); > + ret = __blkdev_emulate_verify(bdev, sector, curr_sects, > + gfp_mask, &bio, buf); > + if (ret) > + break; > + > + if (bio) { > + ret = submit_bio_wait(bio); > + bio_put(bio); > + bio = NULL; > + } > + > + nr_sects -= curr_sects; > + sector += curr_sects; > + > + } > + /* set the biop to NULL since we have alrady completed above */ > + *biop = NULL; > + kfree(buf); > + return ret; > + } > + > + while (nr_sects) { > + bio = blk_next_bio(bio, bdev, 0, REQ_OP_VERIFY, gfp_mask); > + bio->bi_iter.bi_sector = sector; > + > + if (nr_sects > max_verify_sectors) { > + bio->bi_iter.bi_size = max_verify_sectors << 9; > + nr_sects -= max_verify_sectors; > + sector += max_verify_sectors; > + } else { > + bio->bi_iter.bi_size = nr_sects << 9; > + nr_sects = 0; > + } > + cond_resched(); > + } > + *biop = bio; > + return 0; > +} > +EXPORT_SYMBOL_GPL(__blkdev_issue_verify); > + > +/** > + * blkdev_issue_verify - verify a block range > + * @bdev: blockdev to verify > + * @sector: start sector > + * @nr_sects: number of sectors to verify > + * @gfp_mask: memory allocation flags (for bio_alloc) > + * > + * Description: > + * Verify a block range using hardware offload. > + */ > +int blkdev_issue_verify(struct block_device *bdev, sector_t sector, > + sector_t nr_sects, gfp_t gfp_mask) > +{ > + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; > + struct bio *bio = NULL; > + struct blk_plug plug; > + int ret = 0; > + > + if ((sector | nr_sects) & bs_mask) > + return -EINVAL; > + > + blk_start_plug(&plug); > + ret = __blkdev_issue_verify(bdev, sector, nr_sects, gfp_mask, &bio); > + if (ret == 0 && bio) { > + ret = submit_bio_wait(bio); > + bio_put(bio); > + } > + blk_finish_plug(&plug); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(blkdev_issue_verify); > diff --git a/block/blk-merge.c b/block/blk-merge.c > index 7771dacc99cb..8ff305377b5a 100644 > --- a/block/blk-merge.c > +++ b/block/blk-merge.c > @@ -153,6 +153,20 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, > return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs); > } > > +static struct bio *blk_bio_verify_split(struct request_queue *q, > + struct bio *bio, struct bio_set *bs, unsigned *nsegs) > +{ > + *nsegs = 0; > + > + if (!q->limits.max_verify_sectors) > + return NULL; > + > + if (bio_sectors(bio) <= q->limits.max_verify_sectors) > + return NULL; > + > + return bio_split(bio, q->limits.max_verify_sectors, GFP_NOIO, bs); > +} > + > /* > * Return the maximum number of sectors from the start of a bio that may be > * submitted as a single request to a block device. 
If enough sectors remain, > @@ -336,6 +350,10 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, > split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, > nr_segs); > break; > + case REQ_OP_VERIFY: > + split = blk_bio_verify_split(q, *bio, &q->bio_split, > + nr_segs); > + break; > default: > split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); > break; > diff --git a/block/blk-settings.c b/block/blk-settings.c > index 6ccceb421ed2..c77697290bc5 100644 > --- a/block/blk-settings.c > +++ b/block/blk-settings.c > @@ -43,6 +43,7 @@ void blk_set_default_limits(struct queue_limits *lim) > lim->max_dev_sectors = 0; > lim->chunk_sectors = 0; > lim->max_write_zeroes_sectors = 0; > + lim->max_verify_sectors = 0; > lim->max_zone_append_sectors = 0; > lim->max_discard_sectors = 0; > lim->max_hw_discard_sectors = 0; > @@ -80,6 +81,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) > lim->max_sectors = UINT_MAX; > lim->max_dev_sectors = UINT_MAX; > lim->max_write_zeroes_sectors = UINT_MAX; > + lim->max_verify_sectors = UINT_MAX; > lim->max_zone_append_sectors = UINT_MAX; > } > EXPORT_SYMBOL(blk_set_stacking_limits); > @@ -202,6 +204,19 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, > } > EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); > > +/** > + * blk_queue_max_verify_sectors - set max sectors for a single verify > + * > + * @q: the request queue for the device > + * @max_verify_sectors: maximum number of sectors to verify per command > + **/ > +void blk_queue_max_verify_sectors(struct request_queue *q, > + unsigned int max_verify_sectors) > +{ > + q->limits.max_verify_sectors = max_verify_sectors; > +} > +EXPORT_SYMBOL(blk_queue_max_verify_sectors); > + > /** > * blk_queue_max_zone_append_sectors - set max sectors for a single zone append > * @q: the request queue for the device > @@ -554,6 +569,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, > t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); > t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, > b->max_write_zeroes_sectors); > + t->max_verify_sectors = min(t->max_verify_sectors, > + b->max_verify_sectors); > t->max_zone_append_sectors = min(t->max_zone_append_sectors, > b->max_zone_append_sectors); > t->bounce = max(t->bounce, b->bounce); > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > index 88bd41d4cb59..4fb6a731acad 100644 > --- a/block/blk-sysfs.c > +++ b/block/blk-sysfs.c > @@ -113,6 +113,12 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) > return ret; > } > > +static ssize_t queue_verify_max_show(struct request_queue *q, char *page) > +{ > + return sprintf(page, "%llu\n", > + (unsigned long long)q->limits.max_verify_sectors << 9); > +} > + > static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) > { > int max_sectors_kb = queue_max_sectors(q) >> 1; > @@ -588,6 +594,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); > > QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); > QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); > +QUEUE_RO_ENTRY(queue_verify_max, "verify_max_bytes"); > QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); > QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); > > @@ -644,6 +651,7 @@ static struct attribute *queue_attrs[] = { > &queue_discard_zeroes_data_entry.attr, > &queue_write_same_max_entry.attr, > &queue_write_zeroes_max_entry.attr, > + 
&queue_verify_max_entry.attr, > &queue_zone_append_max_entry.attr, > &queue_zone_write_granularity_entry.attr, > &queue_nonrot_entry.attr, > diff --git a/block/blk.h b/block/blk.h > index 434017701403..63a0e3aca7e0 100644 > --- a/block/blk.h > +++ b/block/blk.h > @@ -132,6 +132,9 @@ static inline bool rq_mergeable(struct request *rq) > if (req_op(rq) == REQ_OP_WRITE_ZEROES) > return false; > > + if (req_op(rq) == REQ_OP_VERIFY) > + return false; > + > if (req_op(rq) == REQ_OP_ZONE_APPEND) > return false; > > @@ -286,6 +289,7 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio) > case REQ_OP_DISCARD: > case REQ_OP_SECURE_ERASE: > case REQ_OP_WRITE_ZEROES: > + case REQ_OP_VERIFY: > return true; /* non-trivial splitting decisions */ > default: > break; > diff --git a/block/ioctl.c b/block/ioctl.c > index 46949f1b0dba..60a48e24b82d 100644 > --- a/block/ioctl.c > +++ b/block/ioctl.c > @@ -192,6 +192,39 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, > return err; > } > > +static int blk_ioctl_verify(struct block_device *bdev, fmode_t mode, > + unsigned long arg) > +{ > + uint64_t range[2]; > + struct address_space *mapping; > + uint64_t start, end, len; > + > + if (!(mode & FMODE_READ)) > + return -EBADF; > + > + if (copy_from_user(range, (void __user *)arg, sizeof(range))) > + return -EFAULT; > + > + start = range[0]; > + len = range[1]; > + end = start + len - 1; > + > + if (start & 511) > + return -EINVAL; > + if (len & 511) > + return -EINVAL; > + if (end >= (uint64_t)i_size_read(bdev->bd_inode)) > + return -EINVAL; > + if (end < start) > + return -EINVAL; > + > + /* Invalidate the page cache, including dirty pages */ > + mapping = bdev->bd_inode->i_mapping; > + truncate_inode_pages_range(mapping, start, end); You might want to write any dirty pagecache contents to disk before you invalidate them all... 
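Roughly like this, I think -- untested sketch, with 'ret' as a new local;
filemap_write_and_wait_range() is the obvious tool, though a sync_blockdev()
on the whole device would also do the job:

	int ret;

	mapping = bdev->bd_inode->i_mapping;

	/* Write back any dirty pagecache in the range first... */
	ret = filemap_write_and_wait_range(mapping, start, end);
	if (ret)
		return ret;

	/* ...and only then invalidate it, as the patch already does. */
	truncate_inode_pages_range(mapping, start, end);
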
> + > + return blkdev_issue_verify(bdev, start >> 9, len >> 9, GFP_KERNEL); > +} > + > static int put_ushort(unsigned short __user *argp, unsigned short val) > { > return put_user(val, argp); > @@ -483,6 +516,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, > return blk_ioctl_secure_erase(bdev, mode, argp); > case BLKZEROOUT: > return blk_ioctl_zeroout(bdev, mode, arg); > + case BLKVERIFY: > + return blk_ioctl_verify(bdev, mode, arg); > case BLKGETDISKSEQ: > return put_u64(argp, bdev->bd_disk->diskseq); > case BLKREPORTZONE: > diff --git a/include/linux/bio.h b/include/linux/bio.h > index 1cf3738ef1ea..3dfafe1da098 100644 > --- a/include/linux/bio.h > +++ b/include/linux/bio.h > @@ -55,7 +55,8 @@ static inline bool bio_has_data(struct bio *bio) > bio->bi_iter.bi_size && > bio_op(bio) != REQ_OP_DISCARD && > bio_op(bio) != REQ_OP_SECURE_ERASE && > - bio_op(bio) != REQ_OP_WRITE_ZEROES) > + bio_op(bio) != REQ_OP_WRITE_ZEROES && > + bio_op(bio) != REQ_OP_VERIFY) > return true; > > return false; > @@ -65,7 +66,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio) > { > return bio_op(bio) == REQ_OP_DISCARD || > bio_op(bio) == REQ_OP_SECURE_ERASE || > - bio_op(bio) == REQ_OP_WRITE_ZEROES; > + bio_op(bio) == REQ_OP_WRITE_ZEROES || > + bio_op(bio) == REQ_OP_VERIFY; > } > > static inline void *bio_data(struct bio *bio) > @@ -176,7 +178,7 @@ static inline unsigned bio_segments(struct bio *bio) > struct bvec_iter iter; > > /* > - * We special case discard/write same/write zeroes, because they > + * We special case discard/write same/write zeroes/verify, because they > * interpret bi_size differently: > */ > > @@ -184,6 +186,7 @@ static inline unsigned bio_segments(struct bio *bio) > case REQ_OP_DISCARD: > case REQ_OP_SECURE_ERASE: > case REQ_OP_WRITE_ZEROES: > + case REQ_OP_VERIFY: > return 0; > default: > break; > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index a24d4078fb21..0d5383fc84ed 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -363,6 +363,8 @@ enum req_opf { > REQ_OP_FLUSH = 2, > /* discard sectors */ > REQ_OP_DISCARD = 3, > + /* Verify the sectors */ > + REQ_OP_VERIFY = 6, > /* securely erase sectors */ > REQ_OP_SECURE_ERASE = 5, > /* write the zero filled sector many times */ > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 608d577734c2..78fd6c5530d7 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -266,6 +266,7 @@ struct queue_limits { > unsigned int max_hw_discard_sectors; > unsigned int max_secure_erase_sectors; > unsigned int max_write_zeroes_sectors; > + unsigned int max_verify_sectors; > unsigned int max_zone_append_sectors; > unsigned int discard_granularity; > unsigned int discard_alignment; > @@ -925,6 +926,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, > if (unlikely(op == REQ_OP_WRITE_ZEROES)) > return q->limits.max_write_zeroes_sectors; > > + if (unlikely(op == REQ_OP_VERIFY)) > + return q->limits.max_verify_sectors; > + > return q->limits.max_sectors; > } > > @@ -968,6 +972,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, > unsigned int max_discard_sectors); > extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, > unsigned int max_write_same_sectors); > +extern void blk_queue_max_verify_sectors(struct request_queue *q, > + unsigned int max_verify_sectors); > extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); > extern void 
blk_queue_max_zone_append_sectors(struct request_queue *q, > unsigned int max_zone_append_sectors); > @@ -1119,6 +1125,12 @@ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, > extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, > sector_t nr_sects, gfp_t gfp_mask, unsigned flags); > > +extern int __blkdev_issue_verify(struct block_device *bdev, > + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, > + struct bio **biop); > +extern int blkdev_issue_verify(struct block_device *bdev, sector_t sector, > + sector_t nr_sects, gfp_t gfp_mask); > + > static inline int sb_issue_discard(struct super_block *sb, sector_t block, > sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) > { > @@ -1293,6 +1305,16 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) > return 0; > } > > +static inline unsigned int bdev_verify_sectors(struct block_device *bdev) > +{ > + struct request_queue *q = bdev_get_queue(bdev); > + > + if (q) > + return q->limits.max_verify_sectors; > + > + return 0; > +} > + > static inline bool bdev_nonrot(struct block_device *bdev) > { > return blk_queue_nonrot(bdev_get_queue(bdev)); > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index bdf7b404b3e7..ad0e5cb5cac4 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -185,6 +185,7 @@ struct fsxattr { > #define BLKROTATIONAL _IO(0x12,126) > #define BLKZEROOUT _IO(0x12,127) > #define BLKGETDISKSEQ _IOR(0x12,128,__u64) > +#define BLKVERIFY _IO(0x12,129) > /* > * A jump here: 130-136 are reserved for zoned block devices > * (see uapi/linux/blkzoned.h) > -- > 2.29.0 >
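
FWIW, once something like this lands, exercising the ioctl from userspace
is straightforward, since it takes the usual { start, length } pair of byte
offsets. A throwaway test program might look like the sketch below -- purely
illustrative, not part of the patch, and it assumes a device of at least
1 MiB plus a <linux/fs.h> that already carries the BLKVERIFY definition
added above:

	/* blkverify-test.c: issue BLKVERIFY over the first 1 MiB of a device. */
	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		uint64_t range[2] = { 0, 1024 * 1024 };	/* start, length in bytes */
		int fd, ret;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <block device>\n", argv[0]);
			return 1;
		}

		fd = open(argv[1], O_RDONLY);	/* BLKVERIFY only needs FMODE_READ */
		if (fd < 0) {
			perror("open");
			return 1;
		}

		ret = ioctl(fd, BLKVERIFY, &range);
		if (ret)
			perror("BLKVERIFY");

		close(fd);
		return ret ? 1 : 0;
	}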