All mutex and semaphore usage is removed from the blockdev_direct_IO paths. Filesystems can either do this locking on their own, or ask for placeholder pages. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 7819e6e3f674 -r 5cd028318654 fs/direct-io.c --- a/fs/direct-io.c Tue Feb 06 19:45:28 2007 -0500 +++ b/fs/direct-io.c Tue Feb 06 20:02:49 2007 -0500 @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <asm/atomic.h> +#include <linux/writeback.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -95,6 +96,22 @@ struct dio { struct buffer_head map_bh; /* last get_block() result */ /* + * kernel page pinning (placeholders) + */ + unsigned long fspages_start_off; /* page index where pinning starts */ + + /* + * end_off is the first page past the end of the pinned range. If + * no pages or placeholders are pinned down, start_off == end_off + */ + unsigned long fspages_end_off; + + /* + * how big of a radix extent are we allowed to insert + */ + unsigned long fspages_span; + + /* * Deferred addition of a page to the dio. These variables are * private to dio_send_cur_page(), submit_page_section() and * dio_bio_add_page(). 
@@ -187,7 +204,50 @@ static int dio_refill_pages(struct dio * ret = 0; } out: - return ret; + return ret; +} + +static void dio_unlock_page_range(struct dio *dio) +{ + if (dio->lock_type != DIO_NO_LOCKING) { + remove_placeholder_pages(dio->inode->i_mapping, + dio->fspages_start_off, + dio->fspages_end_off); + dio->fspages_end_off = dio->fspages_start_off; + } +} + +static int dio_lock_page_range(struct dio *dio, struct buffer_head *map_bh, + unsigned long index, unsigned long end) +{ + struct address_space *mapping = dio->inode->i_mapping; + unsigned long max_size; + int ret = 0; + + if (dio->lock_type == DIO_NO_LOCKING) + return 0; + + while (index >= dio->fspages_end_off) { + unsigned long nr = end - dio->fspages_end_off + 1; + nr = min(nr, dio->fspages_span); + ret = find_or_insert_placeholders(mapping, + dio->fspages_end_off, + dio->fspages_end_off + nr, + GFP_KERNEL, 1); + if (ret) + break; + dio->fspages_end_off += nr; + } + /* + * If we allow the FS to allocate more than we've placeholdered, + * a concurrent readahead operation will find metadata where the + * corresponding data has never been written. Clamp b_size to + * the bytes between index and the end of the placeholdered range. 
+ */ + max_size = (dio->fspages_end_off - index) << PAGE_CACHE_SHIFT; + if (map_bh->b_size > max_size) + map_bh->b_size = max_size; + return ret; } /* @@ -246,9 +306,7 @@ static int dio_complete(struct dio *dio, if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, transferred, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); + dio_unlock_page_range(dio); if (ret == 0) ret = dio->page_errors; @@ -513,6 +571,8 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + unsigned long index; + unsigned long end; int create; /* @@ -540,7 +600,14 @@ static int get_more_blocks(struct dio *d } else if (dio->lock_type == DIO_NO_LOCKING) { create = 0; } - + index = fs_startblk >> (PAGE_CACHE_SHIFT - + dio->inode->i_blkbits); + end = (dio->final_block_in_request >> dio->blkfactor) >> + (PAGE_CACHE_SHIFT - dio->inode->i_blkbits); + BUG_ON(index > end); + ret = dio_lock_page_range(dio, map_bh, index, end); + if (ret) + goto error; /* * For writes inside i_size we forbid block creations: only * overwrites are permitted. 
We fall back to buffered writes @@ -550,6 +617,7 @@ static int get_more_blocks(struct dio *d ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } +error: return ret; } @@ -946,9 +1014,6 @@ out: return ret; } -/* - * Releases both i_mutex and i_alloc_sem - */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, @@ -992,6 +1057,24 @@ direct_io_worker(int rw, struct kiocb *i dio->bio_list = NULL; dio->waiter = NULL; + if (dio->lock_type != DIO_NO_LOCKING) { + dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; + dio->fspages_end_off = dio->fspages_start_off; + + /* if the mapping is mapped, they may be using a mmap'd portion + * of the file as the buffer for this io. That will deadlock + * with placeholders because the placeholder code forces the + * page fault handler to block. The (ugly) solution is to + * limit the span of inserted placeholders to the same + * increment we use for get_user_pages. + */ + if (inode->i_mapping->nrpages || + mapping_mapped(inode->i_mapping)) + dio->fspages_span = DIO_PAGES; + else + dio->fspages_span = ULONG_MAX; + } + /* * In case of non-aligned buffers, we may need 2 more * pages since we need to zero out first and last block. @@ -1074,14 +1157,6 @@ direct_io_worker(int rw, struct kiocb *i dio_cleanup(dio); /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); - - /* * The only time we want to leave bios in flight is when a successful * partial aio read or full aio write have been setup. In that case * bio completion will call aio_complete. 
The only time it's safe to @@ -1130,8 +1205,6 @@ direct_io_worker(int rw, struct kiocb *i * DIO_LOCKING (simple locking for regular files) * For writes we are called under i_mutex and return with i_mutex held, even * though it is internally dropped. - * For reads, i_mutex is not held on entry, but it is taken and dropped before - * returning. * * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) @@ -1156,8 +1229,7 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int release_i_mutex = 0; - int acquire_i_mutex = 0; + struct address_space *mapping = iocb->ki_filp->f_mapping; if (rw & WRITE) rw = WRITE_SYNC; @@ -1186,49 +1258,26 @@ __blockdev_direct_IO(int rw, struct kioc goto out; } } - dio = kmalloc(sizeof(*dio), GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; + /* * For block device access DIO_NO_LOCKING is used, * neither readers nor writers do any locking at all * For regular files using DIO_LOCKING, - * readers need to grab i_mutex and i_alloc_sem - * writers need to grab i_alloc_sem only (i_mutex is already held) + * No locks are taken * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here */ dio->lock_type = dio_lock_type; - if (dio_lock_type != DIO_NO_LOCKING) { - /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end > offset) { - struct address_space *mapping; - - mapping = iocb->ki_filp->f_mapping; - if (dio_lock_type != DIO_OWN_LOCKING) { - mutex_lock(&inode->i_mutex); - release_i_mutex = 1; - } - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - kfree(dio); - goto out; - } - - if (dio_lock_type == DIO_OWN_LOCKING) { - mutex_unlock(&inode->i_mutex); - acquire_i_mutex = 1; - } - } - - if (dio_lock_type == DIO_LOCKING) - /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); + + if 
(dio->lock_type == DIO_NO_LOCKING && end > offset) { + retval = filemap_write_and_wait_range(mapping, offset, end - 1); + if (retval) + goto out; } /* @@ -1242,15 +1291,7 @@ __blockdev_direct_IO(int rw, struct kioc retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); - - if (rw == READ && dio_lock_type == DIO_LOCKING) - release_i_mutex = 0; - out: - if (release_i_mutex) - mutex_unlock(&inode->i_mutex); - else if (acquire_i_mutex) - mutex_lock(&inode->i_mutex); return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html