From: Keith Busch <kbusch@xxxxxxxxxx>

Recent storage protocol enhancements allow devices to support partial
sector read access. When used by a host, this can provide link
bandwidth savings and reduce memory utilization.

Provide a way for drivers to indicate support for this capability, and
implement the framework to submit bit-bucket read bios.

The implementation indicates the unwanted data by using a special page.
The page can be used multiple times within the same bio to designate
one or more unwanted data gaps within the requested sector(s). Drivers
that subscribe to the capability must check for this page and set up
their protocol-specific scatter-gather accordingly.

Requests with bit buckets need to be flagged specially for this since
NVMe needs to know, before walking the segments, whether it should
construct a bit-bucket SGL instead of a PRP.

Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
---
 block/bio.c               | 29 +++++++++++++++++++++++++++--
 block/blk-core.c          |  5 +++++
 block/blk-merge.c         |  3 ++-
 block/blk-mq.c            |  2 ++
 include/linux/blk-mq.h    |  2 ++
 include/linux/blk_types.h |  1 +
 include/linux/blkdev.h    | 13 +++++++++++++
 7 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 933ea3210954..b0c85778257a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1208,6 +1208,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
+	unsigned int lbas = bdev_logical_block_size(bio->bi_bdev);
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
@@ -1226,10 +1227,32 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
 	 * result to ensure the bio's total size is correct. The remainder of
 	 * the iov data will be picked up in the next bio iteration.
+	 *
+	 * Partial sector reads can break the iov length expectations by
+	 * allowing dma_alignment granularities. The code enforces only 1
+	 * segment in that case, which simplifies the following logic. We don't
+	 * need to consider individual segment lengths since the skip and
+	 * truncate bytes are guaranteed to align the total length to the block
+	 * size.
 	 */
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (size > 0)
-		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
+	if (size > 0) {
+		/*
+		 * If size doesn't reach the end with bit buckets, align the
+		 * total size down to the block size to avoid a bit-bucket
+		 * truncation overlapping with the desired read data.
+		 */
+		if (bio_flagged(bio, BIO_BIT_BUCKET)) {
+			if (size != iov_iter_count(iter)) {
+				size_t total_size = size + bio->bi_iter.bi_size;
+
+				total_size = ALIGN_DOWN(total_size, lbas);
+				size = total_size - bio->bi_iter.bi_size;
+			}
+		} else {
+			size = ALIGN_DOWN(size, lbas);
+		}
+	}
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
@@ -1602,6 +1625,8 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
 		bio_set_flag(split, BIO_TRACE_COMPLETION);
+	if (bio_flagged(bio, BIO_BIT_BUCKET))
+		bio_set_flag(split, BIO_BIT_BUCKET);
 
 	return split;
 }
diff --git a/block/blk-core.c b/block/blk-core.c
index 5ad7bd93077c..d2e9fd42b732 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -73,6 +73,9 @@ struct kmem_cache *blk_requestq_srcu_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
+struct page *blk_bb_page;
+EXPORT_SYMBOL_GPL(blk_bb_page);
+
 /**
  * blk_queue_flag_set - atomically set a queue flag
  * @flag: flag to be set
@@ -1228,5 +1231,7 @@ int __init blk_dev_init(void)
 
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
 
+	blk_bb_page = ZERO_PAGE(0);
+
 	return 0;
 }
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0f5f42ebd0bb..65b71114633f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -281,7 +281,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 		 * If the queue doesn't support SG gaps and adding this
		 * offset would create a gap, disallow it.
 		 */
-		if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+		if (!bio_flagged(bio, BIO_BIT_BUCKET) && bvprvp &&
+		    bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
 			goto split;
 
 		if (nsegs < max_segs &&
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 15c7c5c4ad22..efbe308d7ae5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2425,6 +2425,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 
 	if (bio->bi_opf & REQ_RAHEAD)
 		rq->cmd_flags |= REQ_FAILFAST_MASK;
+	if (bio_flagged(bio, BIO_BIT_BUCKET))
+		rq->rq_flags |= RQF_BIT_BUCKET;
 
 	rq->__sector = bio->bi_iter.bi_sector;
 	blk_rq_bio_prep(rq, bio, nr_segs);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 43aad0da3305..05fa0b292223 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,8 @@ typedef __u32 __bitwise req_flags_t;
 
 /* drive already may have started this one */
 #define RQF_STARTED		((__force req_flags_t)(1 << 1))
+/* request has bit bucket payload */
+#define RQF_BIT_BUCKET		((__force req_flags_t)(1 << 2))
 /* may not be passed by ioscheduler */
 #define RQF_SOFTBARRIER	((__force req_flags_t)(1 << 3))
 /* request for flush sequence */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a24d4078fb21..dc981d0232d1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -332,6 +332,7 @@ enum {
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_BIT_BUCKET,		/* contains one or more bit bucket pages */
 	BIO_FLAG_LAST
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9d676adfaaa1..4396fcf04bb8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,6 +44,7 @@ struct blk_crypto_profile;
 extern const struct device_type disk_type;
 extern struct device_type part_type;
 extern struct class block_class;
+extern struct page *blk_bb_page;
 
 /* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
@@ -580,6 +581,7 @@ struct request_queue {
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT	29	/* device supports NOWAIT */
 #define QUEUE_FLAG_SQ_SCHED	30	/* single queue style io dispatch */
+#define QUEUE_FLAG_BIT_BUCKET	31	/* device supports read bit buckets */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
				 (1 << QUEUE_FLAG_SAME_COMP) |		\
@@ -621,6 +623,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 #define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
+#define blk_queue_bb(q)	test_bit(QUEUE_FLAG_BIT_BUCKET, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
@@ -1566,4 +1569,14 @@ struct io_comp_batch {
 
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
+static inline void blk_add_bb_page(struct bio *bio, int len)
+{
+	bio_set_flag(bio, BIO_BIT_BUCKET);
+	get_page(blk_bb_page);
+	bio_add_page(bio, blk_bb_page, len, 0);
+}
+static inline bool blk_is_bit_bucket(struct page *page)
+{
+	return page == blk_bb_page;
+}
 #endif /* _LINUX_BLKDEV_H */
-- 
2.30.2
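
[Editor's note, not part of the patch] To illustrate the submission side
described in the commit message: on a queue that advertises
QUEUE_FLAG_BIT_BUCKET, a caller could pad the unwanted head and tail of a
logical block with the shared bit-bucket page so the bio still covers whole
sectors. This is only a sketch built on the helpers added by this patch
(blk_queue_bb(), blk_add_bb_page()); the function name read_partial_block()
and its parameters are invented here, and error handling and bio completion
setup are omitted.

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Sketch only: read "want_len" bytes starting "skip" bytes into one
 * logical block, asking the device to discard the rest of the block.
 */
static struct bio *read_partial_block(struct block_device *bdev,
				      sector_t sector, struct page *page,
				      unsigned int skip, unsigned int want_len)
{
	unsigned int lbs = bdev_logical_block_size(bdev);
	struct bio *bio;

	/* Device can't discard partial data; caller must read the whole block. */
	if (!blk_queue_bb(bdev_get_queue(bdev)))
		return NULL;

	bio = bio_alloc(bdev, 3, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = sector;

	if (skip)
		blk_add_bb_page(bio, skip);		/* unwanted head */
	bio_add_page(bio, page, want_len, 0);		/* the bytes we want */
	if (skip + want_len < lbs)
		blk_add_bb_page(bio, lbs - skip - want_len); /* unwanted tail */

	return bio;
}

The bit-bucket segments keep the bio's total size aligned to the logical
block size, which is the property the __bio_iov_iter_get_pages() change in
this patch relies on.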
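
[Editor's note, not part of the patch] On the driver side, the commit message
says subscribers must check for the special page and set up their
protocol-specific scatter-gather accordingly. A minimal sketch of that check
follows; the nvmeish_* helpers are placeholders invented for this sketch (not
the real NVMe driver functions), while RQF_BIT_BUCKET and blk_is_bit_bucket()
come from this patch.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* Placeholder hooks standing in for controller-specific mapping code. */
static blk_status_t nvmeish_setup_prps(struct request *rq) { return BLK_STS_OK; }
static void nvmeish_add_data_sgl(struct request *rq, struct bio_vec *bv) { }
static void nvmeish_add_bit_bucket_sgl(struct request *rq, unsigned int len) { }

static blk_status_t nvmeish_map_data(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bv;

	/*
	 * RQF_BIT_BUCKET is known before the segments are walked, so the
	 * driver can commit to an SGL layout up front instead of PRPs.
	 */
	if (!(rq->rq_flags & RQF_BIT_BUCKET))
		return nvmeish_setup_prps(rq);

	rq_for_each_segment(bv, rq, iter) {
		if (blk_is_bit_bucket(bv.bv_page))
			/* unwanted gap: tell the device to drop bv.bv_len bytes */
			nvmeish_add_bit_bucket_sgl(rq, bv.bv_len);
		else
			/* wanted data: map as a normal data segment */
			nvmeish_add_data_sgl(rq, &bv);
	}

	return BLK_STS_OK;
}

A real driver would also set QUEUE_FLAG_BIT_BUCKET on its queue only after
confirming the device actually supports discarding partial read data.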