This changes O_DIRECT to take page locks or insert placeholder pages to lock regions under direct io. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 18a9e9f5c707 fs/direct-io.c --- a/fs/direct-io.c Thu Oct 19 08:30:00 2006 +0700 +++ b/fs/direct-io.c Tue Oct 24 15:10:48 2006 -0400 @@ -35,6 +35,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <asm/atomic.h> +#include <linux/writeback.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -94,6 +95,14 @@ struct dio { struct buffer_head map_bh; /* last get_block() result */ /* + * kernel page pinning + */ + struct page fake; + struct page *tmppages[DIO_PAGES]; + unsigned long fspages_start_off; + unsigned long fspages_end_off; + + /* * Deferred addition of a page to the dio. These variables are * private to dio_send_cur_page(), submit_page_section() and * dio_bio_add_page(). @@ -190,6 +199,28 @@ out: return ret; } +static void unlock_page_range(struct dio *dio, unsigned long start, + unsigned long nr) +{ + remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages, + &dio->fake, + start, start + nr, + ARRAY_SIZE(dio->tmppages)); +} + +static int lock_page_range(struct dio *dio, unsigned long start, + unsigned long nr) +{ + struct address_space *mapping = dio->inode->i_mapping; + struct page *fake = &dio->fake; + unsigned long end = start + nr; + return find_or_insert_placeholders(mapping, dio->tmppages, start, end, + ARRAY_SIZE(dio->tmppages), + GFP_KERNEL, fake, + dio->rw == READ); +} + + /* * Get another userspace page. Returns an ERR_PTR on error. 
Pages are * buffered inside the dio so that we can call get_user_pages() against a @@ -219,9 +250,9 @@ static void dio_complete(struct dio *dio { if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); + unlock_page_range(dio, dio->fspages_start_off, + dio->fspages_end_off - dio->fspages_start_off); + dio->fspages_end_off = dio->fspages_start_off; } /* @@ -517,6 +548,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + unsigned long index; int create; /* @@ -544,7 +576,21 @@ static int get_more_blocks(struct dio *d } else if (dio->lock_type == DIO_NO_LOCKING) { create = 0; } - + index = fs_startblk >> (PAGE_CACHE_SHIFT - + dio->inode->i_blkbits); + if (index >= dio->fspages_end_off) { + unsigned long end; + unsigned long nr; + end = (dio->final_block_in_request >> + dio->blkfactor) >> + (PAGE_CACHE_SHIFT - dio->inode->i_blkbits); + nr = min(end - index + 1, (unsigned long)DIO_PAGES); + ret = lock_page_range(dio, dio->fspages_end_off, nr); + if (ret) + goto error; + dio->fspages_end_off += nr; + BUG_ON(index >= dio->fspages_end_off); + } /* * For writes inside i_size we forbid block creations: only * overwrites are permitted. 
We fall back to buffered writes @@ -554,6 +600,7 @@ static int get_more_blocks(struct dio *d ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } +error: return ret; } @@ -944,7 +991,7 @@ out: } /* - * Releases both i_mutex and i_alloc_sem + * Releases i_mutex */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, @@ -1191,7 +1238,6 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int release_i_mutex = 0; int acquire_i_mutex = 0; if (rw & WRITE) @@ -1221,11 +1267,14 @@ __blockdev_direct_IO(int rw, struct kioc goto out; } } - dio = kmalloc(sizeof(*dio), GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; + + set_page_placeholder(&dio->fake); + dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; + dio->fspages_end_off = dio->fspages_start_off; /* * For block device access DIO_NO_LOCKING is used, @@ -1240,30 +1289,11 @@ __blockdev_direct_IO(int rw, struct kioc if (dio_lock_type != DIO_NO_LOCKING) { /* watch out for a 0 len io from a tricksy fs */ if (rw == READ && end > offset) { - struct address_space *mapping; - - mapping = iocb->ki_filp->f_mapping; - if (dio_lock_type != DIO_OWN_LOCKING) { - mutex_lock(&inode->i_mutex); - release_i_mutex = 1; - } - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - kfree(dio); - goto out; - } - if (dio_lock_type == DIO_OWN_LOCKING) { mutex_unlock(&inode->i_mutex); acquire_i_mutex = 1; } } - - if (dio_lock_type == DIO_LOCKING) - /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); } /* @@ -1278,13 +1308,8 @@ __blockdev_direct_IO(int rw, struct kioc retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); - if (rw == READ && dio_lock_type == DIO_LOCKING) - release_i_mutex = 0; - out: - if (release_i_mutex) - mutex_unlock(&inode->i_mutex); - else if (acquire_i_mutex) + if (acquire_i_mutex) 
mutex_lock(&inode->i_mutex); return retval; } - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html