On Thu, Nov 02, 2006 at 09:58:58AM +1100, David Chinner wrote: > > + if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) { > > + struct address_space *mapping = iocb->ki_filp->f_mapping; > > retval = filemap_write_and_wait_range(mapping, offset, end - 1); > > if (retval) > > goto out; > > So that means XFS will now do three cache flushes on write (one in > xfs_write(), one in generic_file_direct_IO() and yet another here.... > > Given the unconditional flush in generic_file_direct_IO(), is this > flush here needed at all? Ok, here's a new patch that replaces #4 of 7, and tries to rework the API a bit to be more flexible....or easier to cut and paste, depending on how you look at it. -chris -- Turn the DIO lock_type parameter into a flags field This creates a number of flags so that filesystems can control blockdev_direct_IO. It is based on code from Russell Cettelan. The new flags are: DIO_CREATE -- always pass create=1 to get_block on writes. This allows DIO to fill holes in the file. DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered io and truncates. DIO_EXTEND -- use truncate to grow the file instead of falling back to buffered io. DIO_DROP_I_MUTEX -- drop i_mutex before starting the mapping, io submission, or io waiting. The mutex is still dropped for AIO as well. Some API changes are made so that filesystems can have more control over the DIO features. __blockdev_direct_IO is more or less renamed to blockdev_direct_IO_flags. All waiting and invalidating of page cache data is pushed down into blockdev_direct_IO_flags (and removed from mm/filemap.c) direct_io_worker is exported into the wild. Filesystems that want to be special can pull out the bits of blockdev_direct_IO_flags they care about and then call direct_io_worker directly. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 98112f002089 fs/direct-io.c --- a/fs/direct-io.c Wed Nov 08 10:33:48 2006 -0500 +++ b/fs/direct-io.c Wed Nov 08 11:46:26 2006 -0500 @@ -53,13 +53,6 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. - * - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. - * This determines whether we need to do the fancy locking which prevents - * direct-IO from being able to read uninitialised disk blocks. If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is - * not held for the entire direct write (taken briefly, initially, during a - * direct read though, but its never held for the duration of a direct-IO). */ struct dio { @@ -68,8 +61,7 @@ struct dio { struct inode *inode; int rw; loff_t i_size; /* i_size when submitted */ - int lock_type; /* doesn't change */ - int reacquire_i_mutex; /* should we get i_mutex when done? */ + unsigned flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -203,7 +195,7 @@ static void unlock_page_range(struct dio static void unlock_page_range(struct dio *dio, unsigned long start, unsigned long nr) { - if (dio->lock_type != DIO_NO_LOCKING) { + if (dio->flags & DIO_PLACEHOLDERS) { remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages, &dio->fake, start, start + nr, @@ -218,11 +210,13 @@ static int lock_page_range(struct dio *d struct page *fake = &dio->fake; unsigned long end = start + nr; - if (dio->lock_type == DIO_NO_LOCKING) - return 0; - return find_or_insert_placeholders(mapping, dio->tmppages, start, end, - ARRAY_SIZE(dio->tmppages), - GFP_KERNEL, fake, 1); + if (dio->flags & DIO_PLACEHOLDERS) { + return find_or_insert_placeholders(mapping, dio->tmppages, + start, end, + ARRAY_SIZE(dio->tmppages), + GFP_KERNEL, fake, 1); + } + return 0; } @@ -258,8 +252,6 @@ static void dio_complete(struct dio *dio unlock_page_range(dio, dio->fspages_start_off, dio->fspages_end_off - dio->fspages_start_off); dio->fspages_end_off = dio->fspages_start_off; - if (dio->reacquire_i_mutex) - mutex_lock(&dio->inode->i_mutex); } /* @@ -556,6 +548,7 @@ static int get_more_blocks(struct dio *d unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; unsigned long index; + unsigned long end; int create; /* @@ -575,8 +568,9 @@ static int get_more_blocks(struct dio *d map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; - create = dio->rw & WRITE; - if (dio->lock_type == DIO_NO_LOCKING) + if (dio->flags & DIO_CREATE) + create = dio->rw & WRITE; + else create = 0; index = fs_startblk >> (PAGE_CACHE_SHIFT - dio->inode->i_blkbits); @@ -990,18 +984,44 @@ out: return ret; } -static ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, +/* + * This does all the real work of the direct io. Most filesystems want to + * call blockdev_direct_IO_flags instead, but if you have exotic locking + * routines you can call this directly. + * + * The flags parameter is a bitmask of: + * + * DIO_PLACEHOLDERS (use placeholder pages for locking) + * DIO_CREATE (pass create=1 to get_block for filling holes) + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) + * DIO_EXTEND (use truncate to expand the file if needed) + */ +ssize_t +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - struct dio *dio) -{ - unsigned long user_addr; + int is_async, unsigned flags) +{ + unsigned long user_addr; int seg; ssize_t ret = 0; ssize_t ret2; size_t bytes; - + struct dio *dio; + + if (rw & WRITE) + rw = WRITE_SYNC; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + ret = -ENOMEM; + if (!dio) + goto out; + + set_page_placeholder(&dio->fake); + dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; + dio->fspages_end_off = dio->fspages_start_off; + dio->flags = flags; + dio->is_async = is_async; dio->bio = NULL; dio->inode = inode; dio->rw = rw; @@ -1188,33 +1208,24 @@ direct_io_worker(int rw, struct kiocb *i aio_complete(iocb, ret, 0); kfree(dio); } +out: return ret; } - -/* - * This is a library function for use by filesystem drivers. - * The locking rules are governed by the dio_lock_type parameter. - * - * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_mutex is not held on entry; it is never taken. - * - * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even - * though it is internally dropped. - * - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of - * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_mutex, return without it, never touch it. - * For reads we are called under i_mutex and return with i_mutex held, even - * though it may be internally dropped. - * - * Additional i_alloc_sem locking requirements described inline below. - */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int dio_lock_type) +EXPORT_SYMBOL(direct_io_worker); + +/* + * A utility function fro blockdev_direct_IO_flags, this checks + * alignment of a O_DIRECT iovec against filesystem and blockdevice + * requirements. + * + * It returns a blkbits value that will work for the io, and returns the + * end offset of the io (via blkbits_ret and end_ret). + * + * The function returns 0 if everything will work or -EINVAL on error + */ +int check_dio_alignment(struct inode *inode, struct block_device *bdev, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + unsigned *blkbits_ret, loff_t *end_ret) { int seg; size_t size; @@ -1222,13 +1233,7 @@ __blockdev_direct_IO(int rw, struct kioc unsigned blkbits = inode->i_blkbits; unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; - ssize_t retval = -EINVAL; loff_t end = offset; - struct dio *dio; - struct address_space *mapping = iocb->ki_filp->f_mapping; - - if (rw & WRITE) - rw = WRITE_SYNC; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1238,7 +1243,7 @@ __blockdev_direct_IO(int rw, struct kioc blkbits = bdev_blkbits; blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) - goto out; + return -EINVAL; } /* Check the memory alignment. Blocks cannot straddle pages */ @@ -1250,43 +1255,42 @@ __blockdev_direct_IO(int rw, struct kioc if (bdev) blkbits = bdev_blkbits; blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - } - } - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - retval = -ENOMEM; - if (!dio) + if ((addr & blocksize_mask) || (size & blocksize_mask)) + return -EINVAL; + } + } + *end_ret = end; + *blkbits_ret = blkbits; + return 0; +} +EXPORT_SYMBOL(check_dio_alignment); + +/* + * This is a library function for use by filesystem drivers. + * The flags parameter is a bitmask of: + * + * DIO_PLACEHOLDERS (use placeholder pages for locking) + * DIO_CREATE (pass create=1 to get_block for filling holes) + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) + * DIO_EXTEND (use truncate to expand the file if needed) + */ +ssize_t +blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + unsigned flags) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + unsigned blkbits = 0; + ssize_t retval = -EINVAL; + loff_t end = 0; + int is_async; + int grab_i_mutex = 0; + + + if (check_dio_alignment(inode, bdev, iov, offset, nr_segs, + &blkbits, &end)) goto out; - - set_page_placeholder(&dio->fake); - dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; - dio->fspages_end_off = dio->fspages_start_off; - - /* - * For block device access DIO_NO_LOCKING is used, - * neither readers nor writers do any locking at all - * For regular files using DIO_LOCKING, - * No locks are taken - * For regular files using DIO_OWN_LOCKING, - * neither readers nor writers take any locks here - */ - dio->lock_type = dio_lock_type; - - if (dio->lock_type == DIO_NO_LOCKING && end > offset) { - retval = filemap_write_and_wait_range(mapping, offset, end - 1); - if (retval) - goto out; - } - - /* - * For file extending writes updating i_size before data - * writeouts complete can expose uninitialized blocks. So - * even for AIO, we need to wait for i/o to complete before - * returning in this case. - */ - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && - (end > i_size_read(inode))); /* * extend the file if needed, and drop i_mutex for non-aio writes. @@ -1295,12 +1299,12 @@ __blockdev_direct_IO(int rw, struct kioc * are inserted, the locking rules end up being the same as * mmap'd writes using writepage to fill holes */ - dio->reacquire_i_mutex = 0; - if ((rw & WRITE) && dio_lock_type == DIO_LOCKING) { + if (rw & WRITE) { /* if our write goes past i_size, do an expanding * truncate to fill it before dropping i_mutex */ - if (end > i_size_read(inode) && iocb->ki_filp) { + if ((flags & DIO_EXTEND) && end > i_size_read(inode) && + iocb->ki_filp) { struct iattr newattrs; newattrs.ia_size = end; newattrs.ia_file = iocb->ki_filp; @@ -1310,14 +1314,59 @@ __blockdev_direct_IO(int rw, struct kioc if (retval) goto out; } - if (is_sync_kiocb(iocb)) { - dio->reacquire_i_mutex = 1; + /* + * If it's a write, unmap all mmappings of the file up-front. + * This will cause any pte dirty bits to be propagated into + * the pageframes for the subsequent filemap_write_and_wait(). + */ + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, offset, end - offset, 0); + if (flags & DIO_DROP_I_MUTEX) { mutex_unlock(&inode->i_mutex); - } - } + grab_i_mutex = 1; + } + } + + /* + * the placeholder code does filemap_write_and_wait, so if we + * aren't using placeholders we have to do it here + */ + if (!(flags & DIO_PLACEHOLDERS) && end > offset) { + retval = filemap_write_and_wait_range(mapping, offset, end - 1); + if (retval) + goto out; + } + + /* + * For file extending writes updating i_size before data + * writeouts complete can expose uninitialized blocks. So + * even for AIO, we need to wait for i/o to complete before + * returning in this case. + */ + is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && + (end > i_size_read(inode))); + retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, dio); + nr_segs, blkbits, get_block, end_io, is_async, + flags); out: + if (grab_i_mutex) + mutex_lock(&inode->i_mutex); + + if ((rw & WRITE) && mapping->nrpages) { + int err; + /* O_DIRECT is allowed to drop i_mutex, so more data + * could have been dirtied by others. Start io one more + * time + */ + err = filemap_write_and_wait_range(mapping, offset, end - 1); + if (!err) + err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + if (!retval && err) + retval = err; + } return retval; } -EXPORT_SYMBOL(__blockdev_direct_IO); +EXPORT_SYMBOL(blockdev_direct_IO_flags); diff -r 98112f002089 include/linux/fs.h --- a/include/linux/fs.h Wed Nov 08 10:33:48 2006 -0500 +++ b/include/linux/fs.h Wed Nov 08 11:37:36 2006 -0500 @@ -1798,24 +1798,29 @@ static inline void do_generic_file_read( } #ifdef CONFIG_BLOCK -ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +int check_dio_alignment(struct inode *inode, struct block_device *bdev, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + unsigned *blkbits_ret, loff_t *end_ret); + +ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int lock_type); - -enum { - DIO_LOCKING = 1, /* need locking between buffered and direct access */ - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ -}; + unsigned int dio_flags); + +#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */ +#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */ +#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */ +#define DIO_EXTEND (1 << 3) /* extend the file w/truncate if needed */ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_LOCKING); + /* locking is on, FS wants to fill holes w/get_block */ + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, DIO_PLACEHOLDERS | + DIO_CREATE | DIO_DROP_I_MUTEX | DIO_EXTEND); } static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, @@ -1823,17 +1828,9 @@ static inline ssize_t blockdev_direct_IO loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_NO_LOCKING); -} - -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) -{ - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_OWN_LOCKING); + /* locking is off, create is off */ + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, 0); } #endif diff -r 98112f002089 mm/filemap.c --- a/mm/filemap.c Wed Nov 08 10:33:48 2006 -0500 +++ b/mm/filemap.c Wed Nov 08 11:49:14 2006 -0500 @@ -40,7 +40,7 @@ #include <asm/mman.h> -static ssize_t +static inline ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); @@ -2735,46 +2735,12 @@ EXPORT_SYMBOL(generic_file_aio_write); * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. */ -static ssize_t +static inline ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len = 0; - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) { - int err; - pgoff_t end = (offset + write_len - 1) - >> PAGE_CACHE_SHIFT; - - /* O_DIRECT is allowed to drop i_mutex, so more data - * could have been dirtied by others. Start io one more - * time - */ - err = filemap_fdatawrite_range(mapping, offset, - offset + write_len - 1); - if (!err) - err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err) - retval = err; - } - return retval; + return iocb->ki_filp->f_mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); } /** - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html