Provide an IO extension handler that attaches PI data from the io extension structure to a kiocb, then teach directio how to attach the pages representing the PI buffer directly to a bio. Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- Documentation/block/data-integrity.txt | 11 ++++ fs/aio.c | 62 +++++++++++++++++++++ fs/bio-integrity.c | 94 +++++++++++++++++++++++++++++++- fs/direct-io.c | 70 +++++++++++++++++++----- include/linux/aio.h | 10 +++ include/linux/bio.h | 15 +++++ include/uapi/linux/aio_abi.h | 6 ++ mm/filemap.c | 6 ++ 8 files changed, 259 insertions(+), 15 deletions(-) diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt index 2d735b0a..1d1f070 100644 --- a/Documentation/block/data-integrity.txt +++ b/Documentation/block/data-integrity.txt @@ -282,6 +282,17 @@ will require extra work due to the application tag. It is up to the receiver to process them and verify data integrity upon completion. + int bio_integrity_prep_buffer(struct bio *bio, int rw, + struct bio_integrity_prep_iter *pi); + + This function should be called before submit_bio; its purpose is to + attach an arbitrary array of struct page * containing integrity data + to an existing bio. Primarily this is intended for AIO/DIO to be + able to attach a userspace buffer to a bio. + + The bio_integrity_prep_iter should contain the page offset and buffer + length of the PI buffer, the number of pages, and the actual array of + pages, as returned by get_user_pages. 
5.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY METADATA diff --git a/fs/aio.c b/fs/aio.c index 0c40bdc..3f932c3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1379,7 +1379,69 @@ struct io_extension_type { int (*destroy_fn)(struct kiocb *); }; +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int destroy_pi_ext(struct kiocb *req) +{ + unsigned int i; + + if (req->ki_ioext->ke_pi_iter.pi_userpages == NULL) + return 0; + + for (i = 0; i < req->ki_ioext->ke_pi_iter.pi_nrpages; i++) + page_cache_release(req->ki_ioext->ke_pi_iter.pi_userpages[i]); + kfree(req->ki_ioext->ke_pi_iter.pi_userpages); + req->ki_ioext->ke_pi_iter.pi_userpages = NULL; + + return 0; +} + +static int setup_pi_ext(struct kiocb *req, int is_write) +{ + struct file *file = req->ki_filp; + struct io_extension *ext = &req->ki_ioext->ke_kern; + void *p; + unsigned long start, end; + int retval; + + if (!(file->f_flags & O_DIRECT)) { + pr_debug("EINVAL: can't use PI without O_DIRECT.\n"); + return -EINVAL; + } + + BUG_ON(req->ki_ioext->ke_pi_iter.pi_userpages); + + end = (((unsigned long)ext->ie_pi_buf) + ext->ie_pi_buflen + + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long)ext->ie_pi_buf) >> PAGE_SHIFT; + req->ki_ioext->ke_pi_iter.pi_offset = offset_in_page(ext->ie_pi_buf); + req->ki_ioext->ke_pi_iter.pi_len = ext->ie_pi_buflen; + req->ki_ioext->ke_pi_iter.pi_nrpages = end - start; + p = kzalloc(req->ki_ioext->ke_pi_iter.pi_nrpages * + sizeof(struct page *), + GFP_NOIO); + if (p == NULL) { + pr_err("%s: no room for page array?\n", __func__); + return -ENOMEM; + } + req->ki_ioext->ke_pi_iter.pi_userpages = p; + + retval = get_user_pages_fast((unsigned long)ext->ie_pi_buf, + req->ki_ioext->ke_pi_iter.pi_nrpages, + is_write, + req->ki_ioext->ke_pi_iter.pi_userpages); + if (retval != req->ki_ioext->ke_pi_iter.pi_nrpages) { + pr_err("%s: couldn't map pages?\n", __func__); + req->ki_ioext->ke_pi_iter.pi_nrpages = retval; + return -ENOMEM; + } + req->ki_flags |= KIOCB_DIO_ONLY; + + return 0; +} 
+#endif + static struct io_extension_type extensions[] = { + {IO_EXT_PI, IO_EXT_SIZE(ie_pi_ret), setup_pi_ext, destroy_pi_ext}, {IO_EXT_INVALID, 0, NULL, NULL}, }; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 413312f..3df9aeb 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -138,7 +138,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct bio_vec *iv; if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + pr_err("%s: bip_vec full\n", __func__); return 0; } @@ -250,7 +250,7 @@ static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, DIV_ROUND_UP(len, bi->tag_size)); if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, + pr_err("%s: tag too big for bio: %u > %u\n", __func__, nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); return -1; } @@ -375,6 +375,96 @@ static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) } /** + * bio_integrity_prep_buffer - Prepare bio for integrity I/O + * @bio: bio to prepare + * @rw: data direction for the bio + * @pi: pi data to attach to bio + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have target device + * and start sector set prior to calling. The pages specified in the + * @pi argument should contain integrity metadata in the WRITE case, + * and should be ready to receive metadata in the READ case. 
+ */ +int bio_integrity_prep_buffer(struct bio *bio, int rw, + struct bio_integrity_prep_iter *pi) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, i; + unsigned int sectors; + int ret; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + end = (pi->pi_offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = pi->pi_offset >> PAGE_SHIFT; + nr_pages = end - start; + + if (pi->pi_len < len) { + pr_err("%s: not enough space left in buffer!\n", __func__); + return -ENOMEM; + } + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + pr_err("could not allocate data integrity bioset\n"); + return -EIO; + } + + bip->bip_owns_buf = 0; + bip->bip_buf = NULL; + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; + + /* Map it */ + for (i = 0 ; i < nr_pages ; i++) { + bytes = PAGE_SIZE - pi->pi_offset; + + if (bytes > pi->pi_len) + bytes = pi->pi_len; + if (bytes > len) + bytes = len; + if (pi->pi_len <= 0 || len == 0) + break; + + ret = bio_integrity_add_page(bio, *pi->pi_userpages, + bytes, pi->pi_offset); + + if (ret == 0) + return -EIO; + + if (ret < bytes) + break; + + len -= bytes; + pi->pi_len -= bytes; + if (pi->pi_offset + bytes == PAGE_SIZE) + pi->pi_userpages++; + pi->pi_offset = (pi->pi_offset + bytes) % PAGE_SIZE; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if ((rw & WRITE) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL(bio_integrity_prep_buffer); + +/** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to 
prepare * diff --git a/fs/direct-io.c b/fs/direct-io.c index 160a548..3f591f8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -111,6 +111,10 @@ struct dio_submit { */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ + +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_prep_iter pi_iter; +#endif }; /* dio_state communicated between submission path and end_io */ @@ -221,6 +225,7 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head++]; } + /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation @@ -385,6 +390,22 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int dio_prep_pi_buffers(struct dio *dio, struct dio_submit *sdio) +{ + struct bio *bio = sdio->bio; + if (sdio->pi_iter.pi_userpages == NULL || !bio_integrity_enabled(bio)) + return 0; + + return bio_integrity_prep_buffer(bio, dio->rw, &sdio->pi_iter); +} +#else +static int dio_prep_pi_buffers(struct dio *dio, struct dio_submit *sdio) +{ + return 0; +} +#endif + /* * In the AIO read case we speculatively dirty the pages before starting IO. * During IO completion, any of these pages which happen to have been written @@ -392,13 +413,18 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * * bios hold a dio reference between submit_bio and ->end_io. 
*/ -static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) +static inline int dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { struct bio *bio = sdio->bio; unsigned long flags; + int ret = 0; bio->bi_private = dio; + ret = dio_prep_pi_buffers(dio, sdio); + if (ret) + return ret; + spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); @@ -415,6 +441,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) sdio->bio = NULL; sdio->boundary = 0; sdio->logical_offset_in_bio = 0; + + return ret; } /* @@ -736,8 +764,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, * have. */ if (sdio->final_block_in_bio != sdio->cur_page_block || - cur_offset != bio_next_offset) - dio_bio_submit(dio, sdio); + cur_offset != bio_next_offset) { + ret = dio_bio_submit(dio, sdio); + if (ret) + goto out; + } } if (sdio->bio == NULL) { @@ -747,7 +778,9 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, } if (dio_bio_add_page(sdio) != 0) { - dio_bio_submit(dio, sdio); + ret = dio_bio_submit(dio, sdio); + if (ret) + goto out; ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { ret = dio_bio_add_page(sdio); @@ -823,8 +856,12 @@ out: * avoid metadata seeks. 
*/ if (sdio->boundary) { + int ret2; + ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + ret2 = dio_bio_submit(dio, sdio); + if (ret == 0) + ret = ret2; page_cache_release(sdio->cur_page); sdio->cur_page = NULL; } @@ -1120,7 +1157,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; - struct dio *dio; + struct dio *dio = NULL; struct dio_submit sdio = { 0, }; unsigned long user_addr; size_t bytes; @@ -1187,8 +1224,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, end - 1); if (retval) { mutex_unlock(&inode->i_mutex); - kmem_cache_free(dio_cache, dio); - goto out; + goto out_dio; } } } @@ -1217,8 +1253,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * We grab i_mutex only for reads so we don't have * to release it here */ - kmem_cache_free(dio_cache, dio); - goto out; + goto out_dio; } } @@ -1228,6 +1263,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, atomic_inc(&inode->i_dio_count); retval = 0; +#ifdef CONFIG_BLK_DEV_INTEGRITY + sdio.pi_iter = iocb->ki_ioext->ke_pi_iter; +#endif sdio.blkbits = blkbits; sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; @@ -1315,8 +1353,12 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, page_cache_release(sdio.cur_page); sdio.cur_page = NULL; } - if (sdio.bio) - dio_bio_submit(dio, &sdio); + if (sdio.bio) { + int ret2; + ret2 = dio_bio_submit(dio, &sdio); + if (retval == 0) + retval = ret2; + } blk_finish_plug(&plug); @@ -1353,7 +1395,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = dio_complete(dio, offset, retval, false); } else BUG_ON(retval != -EIOCBQUEUED); - + return retval; +out_dio: + kmem_cache_free(dio_cache, dio); out: return retval; } diff --git a/include/linux/aio.h b/include/linux/aio.h index 60f4364..3f142b8 
100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -6,6 +6,7 @@ #include <linux/aio_abi.h> #include <linux/uio.h> #include <linux/rcupdate.h> +#include <linux/bio.h> #include <linux/atomic.h> @@ -14,6 +15,8 @@ struct kiocb; #define KIOCB_KEY 0 +#define KIOCB_DIO_ONLY (1) /* don't try buffered if directio fails */ + /* * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either * cancelled or completed (this makes a certain amount of sense because @@ -29,10 +32,15 @@ struct kiocb; typedef int (kiocb_cancel_fn)(struct kiocb *); +/* per-kiocb extension data */ struct kio_extension { struct io_extension __user *ke_user; struct io_extension ke_kern; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_prep_iter ke_pi_iter; /* PI buffers */ +#endif }; + struct kiocb { struct file *ki_filp; struct kioctx *ki_ctx; /* NULL for sync ops */ @@ -59,6 +67,8 @@ struct kiocb { /* Kernel copy of extension descriptors */ struct kio_extension *ki_ioext; + + unsigned int ki_flags; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) diff --git a/include/linux/bio.h b/include/linux/bio.h index 5a4d39b..4729ab1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -635,6 +635,13 @@ struct biovec_slab { struct kmem_cache *slab; }; +struct bio_integrity_prep_iter { + struct page **pi_userpages; /* Pages containing PI data */ + size_t pi_nrpages; /* Number of PI data pages */ + size_t pi_offset; /* Offset into the page */ + size_t pi_len; /* Length of the buffer */ +}; + /* * a small number of entries is fine, not going to be performance critical. 
* basically we just need to survive @@ -663,6 +670,8 @@ extern int bio_integrity_enabled(struct bio *bio); extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); extern int bio_integrity_prep(struct bio *); +extern int bio_integrity_prep_buffer(struct bio *, int rw, + struct bio_integrity_prep_iter *); extern void bio_integrity_endio(struct bio *, int); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); @@ -693,6 +702,12 @@ static inline void bioset_integrity_free (struct bio_set *bs) return; } +static inline int bio_integrity_prep_buffer(struct bio *bio, int rw, + struct bio_integrity_prep_iter *pi) +{ + return 0; +} + static inline int bio_integrity_prep(struct bio *bio) { return 0; diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index 07ffd1f..d7b8c68 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -74,11 +74,17 @@ struct io_event { /* IO extension types */ #define IO_EXT_INVALID (0) +#define IO_EXT_PI (1) /* protection info (checksums, etc) */ /* IO extension descriptor */ struct io_extension { __u64 ie_size; __u64 ie_has; + + /* PI stuff */ + __u64 ie_pi_buf; + __u32 ie_pi_buflen; + __u32 ie_pi_ret; }; /* diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6a..d35ddb3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2477,6 +2477,12 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ppos, count, ocount); if (written < 0 || written == count) goto out; + + if (iocb->ki_flags & KIOCB_DIO_ONLY) { + err = -EINVAL; + goto out; + } + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>