All mutex and semaphore usage is removed from the blockdev_direct_IO
paths.  Filesystems can either do this locking on their own, or ask for
placeholder pages.

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx>

diff -r 4cac7e560b53 -r 317779b11fe1 fs/direct-io.c
--- a/fs/direct-io.c	Thu Dec 21 15:31:30 2006 -0500
+++ b/fs/direct-io.c	Thu Dec 21 15:31:30 2006 -0500
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <asm/atomic.h>
+#include <linux/writeback.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -95,6 +96,13 @@ struct dio {
 	struct buffer_head map_bh;	/* last get_block() result */
 
 	/*
+	 * kernel page pinning
+	 */
+	struct page *tmppages[DIO_PAGES];
+	unsigned long fspages_start_off;
+	unsigned long fspages_end_off;
+
+	/*
 	 * Deferred addition of a page to the dio.  These variables are
 	 * private to dio_send_cur_page(), submit_page_section() and
 	 * dio_bio_add_page().
 	 */
@@ -190,6 +198,31 @@ out:
 	return ret;
 }
 
+static void unlock_page_range(struct dio *dio, unsigned long start,
+			      unsigned long nr)
+{
+	if (dio->lock_type != DIO_NO_LOCKING) {
+		remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages,
+					 start, start + nr,
+					 ARRAY_SIZE(dio->tmppages));
+	}
+}
+
+static int lock_page_range(struct dio *dio, unsigned long start,
+			   unsigned long nr)
+{
+	struct address_space *mapping = dio->inode->i_mapping;
+	unsigned long end = start + nr;
+
+	if (dio->lock_type == DIO_NO_LOCKING)
+		return 0;
+	return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
+					   ARRAY_SIZE(dio->tmppages),
+					   GFP_KERNEL,
+					   dio->rw == READ);
+}
+
+
 /*
  * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
  * buffered inside the dio so that we can call get_user_pages() against a
@@ -246,9 +279,9 @@ static int dio_complete(struct dio *dio,
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, transferred,
 			    dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
+	unlock_page_range(dio, dio->fspages_start_off,
+			  dio->fspages_end_off - dio->fspages_start_off);
+	dio->fspages_end_off = dio->fspages_start_off;
 
 	if (ret == 0)
 		ret = dio->page_errors;
@@ -513,6 +546,8 @@ static int get_more_blocks(struct dio *d
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
+	unsigned long index;
+	unsigned long end;
 	int create;
 
 	/*
@@ -540,7 +575,24 @@ static int get_more_blocks(struct dio *d
 	} else if (dio->lock_type == DIO_NO_LOCKING) {
 		create = 0;
 	}
-
+	index = fs_startblk >> (PAGE_CACHE_SHIFT -
+				dio->inode->i_blkbits);
+	end = (dio->final_block_in_request >> dio->blkfactor) >>
+	      (PAGE_CACHE_SHIFT - dio->inode->i_blkbits);
+	BUG_ON(index > end);
+	while (index >= dio->fspages_end_off) {
+		unsigned long nr = end - dio->fspages_end_off + 1;
+		/* if we're hitting buffered pages,
+		 * work in smaller chunks.  Otherwise, just
+		 * lock down the whole thing
+		 */
+		if (dio->inode->i_mapping->nrpages)
+			nr = min(nr, (unsigned long)DIO_PAGES);
+		ret = lock_page_range(dio, dio->fspages_end_off, nr);
+		if (ret)
+			goto error;
+		dio->fspages_end_off += nr;
+	}
 	/*
 	 * For writes inside i_size we forbid block creations: only
 	 * overwrites are permitted.  We fall back to buffered writes
@@ -550,6 +602,7 @@ static int get_more_blocks(struct dio *d
 		ret = (*dio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
 	}
+error:
 	return ret;
 }
 
@@ -946,9 +999,6 @@ out:
 	return ret;
 }
 
-/*
- * Releases both i_mutex and i_alloc_sem
- */
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@@ -1074,14 +1124,6 @@ direct_io_worker(int rw, struct kiocb *i
 	dio_cleanup(dio);
 
 	/*
-	 * All block lookups have been performed. For READ requests
-	 * we can let i_mutex go now that its achieved its purpose
-	 * of protecting us from looking up uninitialized blocks.
-	 */
-	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
-		mutex_unlock(&dio->inode->i_mutex);
-
-	/*
 	 * The only time we want to leave bios in flight is when a successful
 	 * partial aio read or full aio write have been setup.  In that case
 	 * bio completion will call aio_complete.  The only time it's safe to
@@ -1130,8 +1172,6 @@ direct_io_worker(int rw, struct kiocb *i
  * DIO_LOCKING (simple locking for regular files)
  * For writes we are called under i_mutex and return with i_mutex held, even
  * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
  *
  * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
  * uninitialised data, allowing parallel direct readers and writers)
@@ -1156,8 +1196,7 @@ __blockdev_direct_IO(int rw, struct kioc
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
-	int acquire_i_mutex = 0;
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
 
 	if (rw & WRITE)
 		rw = WRITE_SYNC;
@@ -1186,49 +1225,28 @@ __blockdev_direct_IO(int rw, struct kioc
 				goto out;
 		}
 	}
-
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!dio)
 		goto out;
+	dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
+	dio->fspages_end_off = dio->fspages_start_off;
+
 	/*
 	 * For block device access DIO_NO_LOCKING is used,
 	 *	neither readers nor writers do any locking at all
 	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_mutex and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
+	 *	No locks are taken
 	 * For regular files using DIO_OWN_LOCKING,
 	 *	neither readers nor writers take any locks here
 	 */
 	dio->lock_type = dio_lock_type;
-	if (dio_lock_type != DIO_NO_LOCKING) {
-		/* watch out for a 0 len io from a tricksy fs */
-		if (rw == READ && end > offset) {
-			struct address_space *mapping;
-
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
-
-			retval = filemap_write_and_wait_range(mapping, offset,
-							      end - 1);
-			if (retval) {
-				kfree(dio);
-				goto out;
-			}
-
-			if (dio_lock_type == DIO_OWN_LOCKING) {
-				mutex_unlock(&inode->i_mutex);
-				acquire_i_mutex = 1;
-			}
-		}
-
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
+
+	if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
+		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+		if (retval)
+			goto out;
 	}
 
 	/*
@@ -1242,15 +1260,7 @@ __blockdev_direct_IO(int rw, struct kioc
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io, dio);
-
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
-
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
-		mutex_lock(&inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
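
For reference, and not part of the patch itself: the index/end math in
get_more_blocks() above only converts filesystem block numbers into page
cache indexes, so lock_page_range() can cover the affected page range with
placeholder pages.  Below is a minimal userspace sketch of that conversion,
assuming 4K pages and 1K filesystem blocks; the real code derives the shift
from PAGE_CACHE_SHIFT and inode->i_blkbits, and first shifts
final_block_in_request down by blkfactor to get filesystem blocks.

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* assumed: 4K page cache pages */

int main(void)
{
	unsigned int i_blkbits = 10;		/* assumed: 1K filesystem blocks */
	unsigned long fs_startblk = 9;		/* first fs block of the request */
	unsigned long final_fs_block = 24;	/* last fs block of the request */

	/* same shift as the patch: fs blocks per page = 1 << (PAGE_CACHE_SHIFT - i_blkbits) */
	unsigned long index = fs_startblk >> (PAGE_CACHE_SHIFT - i_blkbits);
	unsigned long end = final_fs_block >> (PAGE_CACHE_SHIFT - i_blkbits);

	/* fs block 9 sits in page 2 (blocks 8-11), fs block 24 in page 6 (blocks 24-27) */
	printf("placeholders cover page indexes %lu through %lu\n", index, end);
	return 0;
}

With those example numbers, and assuming the request starts at that block so
fspages_end_off begins at 2, the while loop in get_more_blocks() would take
placeholders over page indexes 2 through 6 and leave fspages_end_off at 7,
working in DIO_PAGES-sized chunks whenever the mapping already has cached
pages.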