This changes O_DIRECT to lock the regions under direct I/O by taking page locks or inserting placeholder pages. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 18a9e9f5c707 fs/direct-io.c --- a/fs/direct-io.c Thu Oct 19 08:30:00 2006 +0700 +++ b/fs/direct-io.c Fri Oct 20 12:38:24 2006 -0400 @@ -35,6 +35,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <asm/atomic.h> +#include <linux/writeback.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -94,6 +95,14 @@ struct dio { struct buffer_head map_bh; /* last get_block() result */ /* + * kernel page pinning + */ + struct page fake; + struct page **fspages; + unsigned long nr_fspages; + loff_t fs_start_off; + + /* * Deferred addition of a page to the dio. These variables are * private to dio_send_cur_page(), submit_page_section() and * dio_bio_add_page(). @@ -190,6 +199,66 @@ out: return ret; } +static void unlock_page_range(struct address_space *mapping, + struct page **pages, + unsigned long start, + unsigned long nr) +{ + unsigned long i; + struct page *p; + struct page *placeholder = NULL; + for (i = 0; i < nr; i++) { + p = pages[i]; + if (PagePlaceHolder(p)) { + placeholder = p; + remove_placeholder_page(mapping, p, start + i); + } else { + unlock_page(p); + page_cache_release(p); + } + } + if (placeholder) + wake_up_placeholder_page(placeholder); +} + +static int lock_page_range(struct address_space *mapping, + struct page **pages, + unsigned long start, + unsigned long nr, + struct page *fake) +{ + struct page *p; + unsigned long numlock = 0; + unsigned long end = start + nr; + loff_t end_bytes = end << PAGE_CACHE_SHIFT; + unsigned long i; + for (i = start ; i < end; i++) { + p = find_or_insert_page(mapping, i, GFP_KERNEL, fake); + if (!p) + goto fail; + if (PageDirty(p)) { + /* this page was dirty, so someone raced in and + * did a write. 
Start IO on the whole region + * and try again + */ + unlock_page(p); + page_cache_release(p); + __filemap_fdatawrite_range(mapping, + i << PAGE_CACHE_SHIFT, + end_bytes, WB_SYNC_ALL); + continue; + } + pages[numlock++] = p; + } + /* now that we have all the pages locked, wait for any io */ + wait_on_page_writeback_range(mapping, start, end); + return 0; +fail: + unlock_page_range(mapping, pages, start, numlock); + return -1; +} + + /* * Get another userspace page. Returns an ERR_PTR on error. Pages are * buffered inside the dio so that we can call get_user_pages() against a @@ -219,9 +288,8 @@ static void dio_complete(struct dio *dio { if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); + unlock_page_range(dio->inode->i_mapping, dio->fspages, + dio->fs_start_off, dio->nr_fspages); } /* @@ -944,7 +1012,7 @@ out: } /* - * Releases both i_mutex and i_alloc_sem + * Releases i_mutex */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, @@ -1191,8 +1259,9 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int release_i_mutex = 0; int acquire_i_mutex = 0; + struct page **pages = NULL; + unsigned long nrpages; if (rw & WRITE) rw = WRITE_SYNC; @@ -1221,12 +1290,21 @@ __blockdev_direct_IO(int rw, struct kioc goto out; } } - dio = kmalloc(sizeof(*dio), GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; + memset(&dio->fake, 0, sizeof(struct page)); + SetPagePlaceHolder(&dio->fake); + nrpages = (end + PAGE_CACHE_SIZE - 1 - offset) >> PAGE_CACHE_SHIFT; + dio->fs_start_off = offset >> PAGE_CACHE_SHIFT; + pages = kmalloc(sizeof(struct page *) * nrpages, GFP_KERNEL); + dio->fspages = pages; + dio->nr_fspages = nrpages; + if (lock_page_range(inode->i_mapping, pages, dio->fs_start_off, nrpages, + &dio->fake)) + goto out; /* * For 
block device access DIO_NO_LOCKING is used, * neither readers nor writers do any locking at all @@ -1240,30 +1318,11 @@ __blockdev_direct_IO(int rw, struct kioc if (dio_lock_type != DIO_NO_LOCKING) { /* watch out for a 0 len io from a tricksy fs */ if (rw == READ && end > offset) { - struct address_space *mapping; - - mapping = iocb->ki_filp->f_mapping; - if (dio_lock_type != DIO_OWN_LOCKING) { - mutex_lock(&inode->i_mutex); - release_i_mutex = 1; - } - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - kfree(dio); - goto out; - } - if (dio_lock_type == DIO_OWN_LOCKING) { mutex_unlock(&inode->i_mutex); acquire_i_mutex = 1; } } - - if (dio_lock_type == DIO_LOCKING) - /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); } /* @@ -1278,13 +1337,8 @@ __blockdev_direct_IO(int rw, struct kioc retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); - if (rw == READ && dio_lock_type == DIO_LOCKING) - release_i_mutex = 0; - out: - if (release_i_mutex) - mutex_unlock(&inode->i_mutex); - else if (acquire_i_mutex) + if (acquire_i_mutex) mutex_lock(&inode->i_mutex); return retval; } - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html