All mutex and semaphore usage is removed from fs/direct-io.c. Callers can ask for placeholder pages if they want help protecting against races with buffered io. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r e3fe4a6b9355 -r 4486b1f7011a fs/direct-io.c --- a/fs/direct-io.c Tue Oct 31 10:01:36 2006 -0500 +++ b/fs/direct-io.c Wed Nov 01 10:18:44 2006 -0500 @@ -35,6 +35,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <asm/atomic.h> +#include <linux/writeback.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -94,6 +95,14 @@ struct dio { struct buffer_head map_bh; /* last get_block() result */ /* + * kernel page pinning + */ + struct page fake; + struct page *tmppages[DIO_PAGES]; + unsigned long fspages_start_off; + unsigned long fspages_end_off; + + /* * Deferred addition of a page to the dio. These variables are * private to dio_send_cur_page(), submit_page_section() and * dio_bio_add_page(). @@ -190,6 +199,33 @@ out: return ret; } +static void unlock_page_range(struct dio *dio, unsigned long start, + unsigned long nr) +{ + if (dio->lock_type != DIO_NO_LOCKING) { + remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages, + &dio->fake, + start, start + nr, + ARRAY_SIZE(dio->tmppages)); + } +} + +static int lock_page_range(struct dio *dio, unsigned long start, + unsigned long nr) +{ + struct address_space *mapping = dio->inode->i_mapping; + struct page *fake = &dio->fake; + unsigned long end = start + nr; + + if (dio->lock_type == DIO_NO_LOCKING) + return 0; + return find_or_insert_placeholders(mapping, dio->tmppages, start, end, + ARRAY_SIZE(dio->tmppages), + GFP_KERNEL, fake, + dio->rw == READ); +} + + /* * Get another userspace page. Returns an ERR_PTR on error. Pages are * buffered inside the dio so that we can call get_user_pages() against a @@ -219,9 +255,9 @@ static void dio_complete(struct dio *dio { if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); + unlock_page_range(dio, dio->fspages_start_off, + dio->fspages_end_off - dio->fspages_start_off); + dio->fspages_end_off = dio->fspages_start_off; } /* @@ -517,6 +553,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + unsigned long index; int create; /* @@ -544,7 +581,19 @@ static int get_more_blocks(struct dio *d } else if (dio->lock_type == DIO_NO_LOCKING) { create = 0; } - + index = fs_startblk >> (PAGE_CACHE_SHIFT - + dio->inode->i_blkbits); + end = (dio->final_block_in_request >> dio->blkfactor) >> + (PAGE_CACHE_SHIFT - dio->inode->i_blkbits); + BUG_ON(index > end); + while (index >= dio->fspages_end_off) { + unsigned long nr; + nr = min(end - index + 1, (unsigned long)DIO_PAGES); + ret = lock_page_range(dio, dio->fspages_end_off, nr); + if (ret) + goto error; + dio->fspages_end_off += nr; + } /* * For writes inside i_size we forbid block creations: only * overwrites are permitted. We fall back to buffered writes @@ -554,6 +603,7 @@ static int get_more_blocks(struct dio *d ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } +error: return ret; } @@ -943,9 +993,6 @@ out: return ret; } -/* - * Releases both i_mutex and i_alloc_sem - */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, @@ -1074,14 +1121,6 @@ direct_io_worker(int rw, struct kiocb *i * In that case, we need to release all the pages we got hold on. */ dio_cleanup(dio); - - /* - * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose - * of protecting us from looking up uninitialized blocks. - */ - if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); /* * OK, all BIOs are submitted, so we can decrement bio_count to truly @@ -1165,8 +1204,6 @@ direct_io_worker(int rw, struct kiocb *i * DIO_LOCKING (simple locking for regular files) * For writes we are called under i_mutex and return with i_mutex held, even * though it is internally dropped. - * For reads, i_mutex is not held on entry, but it is taken and dropped before - * returning. * * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) @@ -1191,8 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int release_i_mutex = 0; - int acquire_i_mutex = 0; + struct address_space *mapping = iocb->ki_filp->f_mapping; if (rw & WRITE) rw = WRITE_SYNC; @@ -1221,49 +1257,29 @@ __blockdev_direct_IO(int rw, struct kioc goto out; } } - dio = kmalloc(sizeof(*dio), GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; + set_page_placeholder(&dio->fake); + dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; + dio->fspages_end_off = dio->fspages_start_off; + /* * For block device access DIO_NO_LOCKING is used, * neither readers nor writers do any locking at all * For regular files using DIO_LOCKING, - * readers need to grab i_mutex and i_alloc_sem - * writers need to grab i_alloc_sem only (i_mutex is already held) + * No locks are taken * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here */ dio->lock_type = dio_lock_type; - if (dio_lock_type != DIO_NO_LOCKING) { - /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end > offset) { - struct address_space *mapping; - - mapping = iocb->ki_filp->f_mapping; - if (dio_lock_type != DIO_OWN_LOCKING) { - mutex_lock(&inode->i_mutex); - release_i_mutex = 1; - } - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - kfree(dio); - goto out; - } - - if (dio_lock_type == DIO_OWN_LOCKING) { - mutex_unlock(&inode->i_mutex); - acquire_i_mutex = 1; - } - } - - if (dio_lock_type == DIO_LOCKING) - /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); + + if (dio->lock_type == DIO_NO_LOCKING && end > offset) { + retval = filemap_write_and_wait_range(mapping, offset, end - 1); + if (retval) + goto out; } /* @@ -1277,15 +1293,7 @@ __blockdev_direct_IO(int rw, struct kioc retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); - - if (rw == READ && dio_lock_type == DIO_LOCKING) - release_i_mutex = 0; - out: - if (release_i_mutex) - mutex_unlock(&inode->i_mutex); - else if (acquire_i_mutex) - mutex_lock(&inode->i_mutex); return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html