Make direct IO code grab the mapping range lock for the range under IO
just before the DIO is submitted, and release the lock once the IO is
complete.

Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 fs/direct-io.c |   67 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3a430f3..1127ca5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -56,10 +56,13 @@
  * blocksize.
  */
 
+struct dio_bio_data;
+
 /* dio_state only used in the submission path */
 
 struct dio_submit {
 	struct bio *bio;		/* bio under assembly */
+	struct dio_bio_data *bio_data;	/* structure to be attached to the bio */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -143,7 +146,17 @@ struct dio {
 	struct page *pages[DIO_PAGES];	/* page buffer */
 } ____cacheline_aligned_in_smp;
 
+/*
+ * Structure associated with each submitted bio to provide back pointer and
+ * lock for the range accessed by the bio.
+ */
+struct dio_bio_data {
+	struct dio *dio;
+	struct range_lock lock;
+};
+
 static struct kmem_cache *dio_cache __read_mostly;
+static struct kmem_cache *dio_bio_data_cache __read_mostly;
 
 /*
  * How many pages are in the queue?
@@ -275,10 +288,13 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
  */
 static void dio_bio_end_aio(struct bio *bio, int error)
 {
-	struct dio *dio = bio->bi_private;
+	struct dio_bio_data *bio_data = bio->bi_private;
+	struct dio *dio = bio_data->dio;
 	unsigned long remaining;
 	unsigned long flags;
 
+	range_unlock(&dio->inode->i_mapping->mapping_lock, &bio_data->lock);
+	kmem_cache_free(dio_bio_data_cache, bio_data);
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
 
@@ -298,14 +314,17 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
- * During I/O bi_private points at the dio.  After I/O, bi_private is used to
- * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ * During I/O bi_private points at the dio_data.  After I/O, bi_private is used
+ * to implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
 static void dio_bio_end_io(struct bio *bio, int error)
 {
-	struct dio *dio = bio->bi_private;
+	struct dio_bio_data *bio_data = bio->bi_private;
+	struct dio *dio = bio_data->dio;
 	unsigned long flags;
 
+	range_unlock(&dio->inode->i_mapping->mapping_lock, &bio_data->lock);
+	kmem_cache_free(dio_bio_data_cache, bio_data);
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	bio->bi_private = dio->bio_list;
 	dio->bio_list = bio;
@@ -325,7 +344,8 @@ static void dio_bio_end_io(struct bio *bio, int error)
  */
 void dio_end_io(struct bio *bio, int error)
 {
-	struct dio *dio = bio->bi_private;
+	struct dio_bio_data *bio_data = bio->bi_private;
+	struct dio *dio = bio_data->dio;
 
 	if (dio->is_async)
 		dio_bio_end_aio(bio, error);
@@ -369,8 +389,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 {
 	struct bio *bio = sdio->bio;
 	unsigned long flags;
-
-	bio->bi_private = dio;
+	loff_t start = sdio->logical_offset_in_bio;
 
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	dio->refcount++;
@@ -380,10 +399,30 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 		bio_set_pages_dirty(bio);
 
 	if (sdio->submit_io)
-		sdio->submit_io(dio->rw, bio, dio->inode,
-				sdio->logical_offset_in_bio);
-	else
+		sdio->submit_io(dio->rw, bio, dio->inode, start);
+	else {
+		struct address_space *mapping = dio->inode->i_mapping;
+		loff_t end = sdio->logical_offset_in_bio + bio->bi_size - 1;
+
+		sdio->bio_data->dio = dio;
+		range_lock_init(&sdio->bio_data->lock,
+			start >> PAGE_CACHE_SHIFT, end >> PAGE_CACHE_SHIFT);
+		range_lock(&mapping->mapping_lock, &sdio->bio_data->lock);
+		/*
+		 * Once we hold the mapping range lock, writeout and
+		 * invalidation cannot race with page faults or buffered IO.
+		 */
+		filemap_write_and_wait_range(mapping, start, end);
+		if (dio->rw == WRITE && mapping->nrpages) {
+			invalidate_inode_pages2_range(mapping,
+					start >> PAGE_CACHE_SHIFT,
+					end >> PAGE_CACHE_SHIFT);
+		}
+		bio->bi_private = sdio->bio_data;
+		sdio->bio_data = NULL;
+
 		submit_bio(dio->rw, bio);
+	}
 
 	sdio->bio = NULL;
 	sdio->boundary = 0;
@@ -397,6 +436,8 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
 	while (dio_pages_present(sdio))
 		page_cache_release(dio_get_page(dio, sdio));
+	if (sdio->bio_data)
+		kmem_cache_free(dio_bio_data_cache, sdio->bio_data);
 }
 
 /*
@@ -600,6 +641,11 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
 	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
 	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
+	sdio->bio_data = kmem_cache_alloc(dio_bio_data_cache, GFP_KERNEL);
+	if (!sdio->bio_data) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
 	sdio->boundary = 0;
 out:
@@ -1307,6 +1353,7 @@ EXPORT_SYMBOL(__blockdev_direct_IO);
 static __init int dio_init(void)
 {
 	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
+	dio_bio_data_cache = KMEM_CACHE(dio_bio_data, SLAB_PANIC);
 	return 0;
 }
 module_init(dio_init)
-- 
1.7.1
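
For readers who want the gist without wading through the hunks, the locking
discipline this patch introduces can be boiled down to the sketch below. This
is a simplified illustration, not the patch itself: the helper names
(dio_submit_bio_locked() and dio_complete_bio_locked()) are made up for the
example, it assumes the range lock API this series adds (range_lock_init(),
range_lock() and range_unlock() on mapping->mapping_lock), and it elides the
dio refcounting, the async/sync completion split, and the allocation failure
handling that the real patch keeps.

/*
 * Submission side: take the mapping range lock over the pages the bio
 * covers, flush the page cache for that range (and invalidate it for
 * writes) under the lock, then point bi_private at the per-bio data so
 * the completion handler can find the lock again.
 */
static void dio_submit_bio_locked(struct dio *dio, struct bio *bio,
				  struct dio_bio_data *bio_data, loff_t start)
{
	struct address_space *mapping = dio->inode->i_mapping;
	loff_t end = start + bio->bi_size - 1;

	bio_data->dio = dio;
	range_lock_init(&bio_data->lock, start >> PAGE_CACHE_SHIFT,
			end >> PAGE_CACHE_SHIFT);
	range_lock(&mapping->mapping_lock, &bio_data->lock);

	/* Buffered IO and page faults cannot touch this range anymore. */
	filemap_write_and_wait_range(mapping, start, end);
	if (dio->rw == WRITE && mapping->nrpages)
		invalidate_inode_pages2_range(mapping,
				start >> PAGE_CACHE_SHIFT,
				end >> PAGE_CACHE_SHIFT);

	bio->bi_private = bio_data;
	submit_bio(dio->rw, bio);
}

/*
 * Completion side: drop the range lock for exactly the pages this bio
 * covered and free the per-bio data, then do the usual dio completion.
 */
static void dio_complete_bio_locked(struct bio *bio)
{
	struct dio_bio_data *bio_data = bio->bi_private;
	struct dio *dio = bio_data->dio;

	range_unlock(&dio->inode->i_mapping->mapping_lock, &bio_data->lock);
	kmem_cache_free(dio_bio_data_cache, bio_data);
	/* ... refcount drop and waking of the waiter elided ... */
}

Two design choices are worth calling out. The lock is taken per bio rather
than per direct IO request, so a large DIO split across many bios only
excludes buffered IO and page faults from the ranges actually in flight.
And because the lock is released from the bio end_io handler, it has to live
in the separately allocated dio_bio_data rather than in dio_submit, which
sits on the submitter's stack and may be long gone by completion time.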