Convert the generic direct-I/O code to use iov_iter_extract_pages() instead
of iov_iter_get_pages().  This will pin pages or leave them unaltered rather
than getting a ref on them as appropriate to the iterator.

The pages need to be pinned for DIO-read rather than having refs taken on
them to prevent VM copy-on-write from malfunctioning during a concurrent
fork() (the result of the I/O would otherwise end up only visible to the
child process and not the parent).

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
cc: Jens Axboe <axboe@xxxxxxxxx>
cc: Jan Kara <jack@xxxxxxx>
cc: Christoph Hellwig <hch@xxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: Logan Gunthorpe <logang@xxxxxxxxxxxx>
cc: linux-fsdevel@xxxxxxxxxxxxxxx
cc: linux-block@xxxxxxxxxxxxxxx
---
 fs/direct-io.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b1e26a706e31..b4d2c9f85a5b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -142,9 +142,11 @@ struct dio {
 
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
-	 * allocation time.  Don't add new fields after pages[] unless you
-	 * wish that they not be zeroed.
+	 * allocation time.  Don't add new fields after pages[] unless you wish
+	 * that they not be zeroed.  Pages may have a ref taken, a pin emplaced
+	 * or no retention measures.
 	 */
+	unsigned int cleanup_mode;	/* How pages should be cleaned up (0/FOLL_GET/PIN) */
 	union {
 		struct page *pages[DIO_PAGES];	/* page buffer */
 		struct work_struct complete_work;/* deferred AIO completion */
@@ -167,12 +169,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
+	unsigned int gup_flags =
+		op_is_write(dio_op) ? FOLL_SOURCE_BUF : FOLL_DEST_BUF;
+	struct page **pages = dio->pages;
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
-				 &sdio->from,
-				 op_is_write(dio_op) ?
-				 FOLL_SOURCE_BUF : FOLL_DEST_BUF);
+	ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, DIO_PAGES,
+				     gup_flags, &sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
 		struct page *page = ZERO_PAGE(0);
@@ -183,7 +186,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		get_page(page);
+		dio->cleanup_mode = 0;
 		dio->pages[0] = page;
 		sdio->head = 0;
 		sdio->tail = 1;
@@ -197,6 +200,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		sdio->head = 0;
 		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
 		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+		dio->cleanup_mode =
+			iov_iter_extract_mode(sdio->iter, gup_flags);
 		return 0;
 	}
 	return ret;
@@ -400,6 +405,10 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	 * we request a valid number of vectors.
 	 */
 	bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
+	if (!(dio->cleanup_mode & FOLL_GET))
+		bio_clear_flag(bio, BIO_PAGE_REFFED);
+	if (dio->cleanup_mode & FOLL_PIN)
+		bio_set_flag(bio, BIO_PAGE_PINNED);
 	bio->bi_iter.bi_sector = first_sector;
 	if (dio->is_async)
 		bio->bi_end_io = dio_bio_end_aio;
@@ -443,13 +452,18 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	sdio->logical_offset_in_bio = 0;
 }
 
+static void dio_cleanup_page(struct dio *dio, struct page *page)
+{
+	page_put_unpin(page, dio->cleanup_mode);
+}
+
 /*
  * Release any resources in case of a failure
  */
 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
 	while (sdio->head < sdio->tail)
-		put_page(dio->pages[sdio->head++]);
+		dio_cleanup_page(dio, dio->pages[sdio->head++]);
 }
 
 /*
@@ -704,7 +718,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
  *
  * Return zero on success. Non-zero means the caller needs to start a new BIO.
  */
-static inline int dio_bio_add_page(struct dio_submit *sdio)
+static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 
@@ -771,11 +785,11 @@
 		goto out;
 	}
 
-	if (dio_bio_add_page(sdio) != 0) {
+	if (dio_bio_add_page(dio, sdio) != 0) {
 		dio_bio_submit(dio, sdio);
 		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
-			ret = dio_bio_add_page(sdio);
+			ret = dio_bio_add_page(dio, sdio);
 			BUG_ON(ret != 0);
 		}
 	}
@@ -832,13 +846,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	 */
 	if (sdio->cur_page) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
-		put_page(sdio->cur_page);
+		dio_cleanup_page(dio, sdio->cur_page);
 		sdio->cur_page = NULL;
 		if (ret)
 			return ret;
 	}
 
-	get_page(page);		/* It is in dio */
+	ret = try_grab_page(page, dio->cleanup_mode);	/* It is in dio */
+	if (ret < 0)
+		return ret;
+
 	sdio->cur_page = page;
 	sdio->cur_page_offset = offset;
 	sdio->cur_page_len = len;
@@ -853,7 +870,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		ret = dio_send_cur_page(dio, sdio, map_bh);
 		if (sdio->bio)
 			dio_bio_submit(dio, sdio);
-		put_page(sdio->cur_page);
+		dio_cleanup_page(dio, sdio->cur_page);
 		sdio->cur_page = NULL;
 	}
 	return ret;
@@ -954,7 +971,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				ret = get_more_blocks(dio, sdio, map_bh);
 				if (ret) {
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					goto out;
 				}
 				if (!buffer_mapped(map_bh))
@@ -999,7 +1016,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
 				if (dio_op == REQ_OP_WRITE) {
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					return -ENOTBLK;
 				}
 
@@ -1012,7 +1029,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 				if (sdio->block_in_file >=
 						i_size_aligned >> blkbits) {
 					/* We hit eof */
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					goto out;
 				}
 				zero_user(page, from, 1 << blkbits);
@@ -1052,7 +1069,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 						  sdio->next_block_for_io,
 						  map_bh);
 			if (ret) {
-				put_page(page);
+				dio_cleanup_page(dio, page);
 				goto out;
 			}
 			sdio->next_block_for_io += this_chunk_blocks;
@@ -1068,7 +1085,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 		}
 
 		/* Drop the ref which was taken in get_user_pages() */
-		put_page(page);
+		dio_cleanup_page(dio, page);
 	}
 out:
 	return ret;
@@ -1288,7 +1305,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
 		if (retval == 0)
 			retval = ret2;
-		put_page(sdio.cur_page);
+		dio_cleanup_page(dio, sdio.cur_page);
 		sdio.cur_page = NULL;
 	}
 	if (sdio.bio)
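
For reference, below is a minimal, self-contained sketch of the
extract/record-mode/release pattern that dio_refill_pages() and
dio_cleanup_page() now follow.  It is illustrative only and not part of the
patch: the function name is made up, error handling is elided, and it assumes
the iov_iter_extract_pages(), iov_iter_extract_mode() and page_put_unpin()
helpers plus the FOLL_SOURCE_BUF/FOLL_DEST_BUF flags introduced earlier in
this series.

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/uio.h>

/*
 * Illustrative only: extract the pages backing an iterator for I/O,
 * remember how they were retained (pin, ref or nothing) and release
 * them the same way once the I/O is done.
 */
static ssize_t demo_extract_and_release(struct iov_iter *iter, bool is_write)
{
	struct page *pages[8], **pagep = pages;
	unsigned int gup_flags = is_write ? FOLL_SOURCE_BUF : FOLL_DEST_BUF;
	unsigned int cleanup_mode;
	size_t offset;
	ssize_t ret;
	int i, npages;

	/* Pin, ref or leave the pages alone, as the iterator type requires. */
	ret = iov_iter_extract_pages(iter, &pagep, LONG_MAX, 8,
				     gup_flags, &offset);
	if (ret <= 0)
		return ret;

	/* Record which retention method was used so cleanup can match it. */
	cleanup_mode = iov_iter_extract_mode(iter, gup_flags);

	/* ... submit the I/O against pagep[0..npages) here ... */

	/* Undo the pin/ref (or do nothing) once the I/O has completed. */
	npages = DIV_ROUND_UP(offset + ret, PAGE_SIZE);
	for (i = 0; i < npages; i++)
		page_put_unpin(pagep[i], cleanup_mode);

	return ret;
}

The point of recording the mode rather than hard-coding put_page() is that
user-backed iterators will now yield pinned pages for a read, whereas
kernel-backed iterators may yield pages with no retention at all; the cleanup
path has to mirror whatever the extraction did.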