This creates a number of flags so that filesystems can control blockdev_direct_IO. It is based on code from Russell Cattelan. The new flags are: DIO_CREATE -- always pass create=1 to get_block on writes. This allows DIO to fill holes in the file. DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered io and truncates. DIO_DROP_I_MUTEX -- drop i_mutex before starting the mapping, io submission, or io waiting. The mutex is still dropped for AIO as well. Some API changes are made so that filesystems can have more control over the DIO features. __blockdev_direct_IO is more or less renamed to blockdev_direct_IO_flags. All waiting and invalidating of page cache data is pushed down into blockdev_direct_IO_flags (and removed from mm/filemap.c). direct_io_worker is exported into the wild. Filesystems that want to be special can pull out the bits of blockdev_direct_IO_flags they care about and then call direct_io_worker directly. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 1a7105ab9c19 -r 04dd7ddd593e fs/direct-io.c --- a/fs/direct-io.c Tue Feb 06 20:02:55 2007 -0500 +++ b/fs/direct-io.c Tue Feb 06 20:02:56 2007 -0500 @@ -1,4 +1,3 @@ - GFP_KERNEL, 1); /* * fs/direct-io.c * @@ -55,13 +54,6 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. - * - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. - * This determines whether we need to do the fancy locking which prevents - * direct-IO from being able to read uninitialised disk blocks. If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is - * not held for the entire direct write (taken briefly, initially, during a - * direct read though, but its never held for the duration of a direct-IO). 
*/ struct dio { @@ -70,8 +62,7 @@ struct dio { struct inode *inode; int rw; loff_t i_size; /* i_size when submitted */ - int lock_type; /* doesn't change */ - int reacquire_i_mutex; /* should we get i_mutex when done? */ + unsigned flags; /* locking and get_block flags */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -211,7 +202,7 @@ out: static void dio_unlock_page_range(struct dio *dio) { - if (dio->lock_type != DIO_NO_LOCKING) { + if (dio->flags & DIO_PLACEHOLDERS) { remove_placeholder_pages(dio->inode->i_mapping, dio->fspages_start_off, dio->fspages_end_off); @@ -226,7 +217,7 @@ static int dio_lock_page_range(struct di unsigned long max_size; int ret = 0; - if (dio->lock_type == DIO_NO_LOCKING) + if (!(dio->flags & DIO_PLACEHOLDERS)) return 0; while (index >= dio->fspages_end_off) { @@ -310,9 +301,6 @@ static int dio_complete(struct dio *dio, dio->map_bh.b_private); dio_unlock_page_range(dio); - if (dio->reacquire_i_mutex) - mutex_lock(&dio->inode->i_mutex); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -597,8 +585,9 @@ static int get_more_blocks(struct dio *d map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; - create = dio->rw & WRITE; - if (dio->lock_type == DIO_NO_LOCKING) + if (dio->flags & DIO_CREATE) + create = dio->rw & WRITE; + else create = 0; index = fs_startblk >> (PAGE_CACHE_SHIFT - dio->inode->i_blkbits); @@ -1014,19 +1003,41 @@ out: return ret; } -static ssize_t -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, - const struct iovec *iov, loff_t offset, unsigned long nr_segs, +/* + * This does all the real work of the direct io. Most filesystems want to + * call blockdev_direct_IO_flags instead, but if you have exotic locking + * routines you can call this directly. 
+ * + * The flags parameter is a bitmask of: + * + * DIO_PLACEHOLDERS (use placeholder pages for locking) + * DIO_CREATE (pass create=1 to get_block for filling holes or extending) + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) + */ +ssize_t +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - struct dio *dio) -{ - unsigned long user_addr; + int is_async, unsigned dioflags) +{ + unsigned long user_addr; unsigned long flags; int seg; ssize_t ret = 0; ssize_t ret2; size_t bytes; - + struct dio *dio; + + if (rw & WRITE) + rw = WRITE_SYNC; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + ret = -ENOMEM; + if (!dio) + goto out; + + dio->flags = dioflags; + dio->is_async = is_async; dio->bio = NULL; dio->inode = inode; dio->rw = rw; @@ -1057,7 +1068,7 @@ direct_io_worker(int rw, struct kiocb *i dio->bio_list = NULL; dio->waiter = NULL; - if (dio->lock_type != DIO_NO_LOCKING) { + if (dio->flags & DIO_PLACEHOLDERS) { dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; dio->fspages_end_off = dio->fspages_start_off; @@ -1192,33 +1203,24 @@ direct_io_worker(int rw, struct kiocb *i } else BUG_ON(ret != -EIOCBQUEUED); +out: return ret; } - -/* - * This is a library function for use by filesystem drivers. - * The locking rules are governed by the dio_lock_type parameter. - * - * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_mutex is not held on entry; it is never taken. - * - * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even - * though it is internally dropped. - * - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of - * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_mutex, return without it, never touch it. 
- * For reads we are called under i_mutex and return with i_mutex held, even - * though it may be internally dropped. - * - * Additional i_alloc_sem locking requirements described inline below. - */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int dio_lock_type) +EXPORT_SYMBOL(direct_io_worker); + +/* + * A utility function for blockdev_direct_IO_flags, this checks + * alignment of an O_DIRECT iovec against filesystem and blockdevice + * requirements. + * + * It returns a blkbits value that will work for the io, and returns the + * end offset of the io (via blkbits_ret and end_ret). + * + * The function returns 0 if everything will work or -EINVAL on error + */ +int check_dio_alignment(struct inode *inode, struct block_device *bdev, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + unsigned *blkbits_ret, loff_t *end_ret) { int seg; size_t size; @@ -1226,13 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc unsigned blkbits = inode->i_blkbits; unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; - ssize_t retval = -EINVAL; loff_t end = offset; - struct dio *dio; - struct address_space *mapping = iocb->ki_filp->f_mapping; - - if (rw & WRITE) - rw = WRITE_SYNC; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1242,7 +1238,7 @@ __blockdev_direct_IO(int rw, struct kioc blkbits = bdev_blkbits; blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) - goto out; + return -EINVAL; } /* Check the memory alignment. 
Blocks cannot straddle pages */ @@ -1254,27 +1250,60 @@ __blockdev_direct_IO(int rw, struct kioc if (bdev) blkbits = bdev_blkbits; blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + return -EINVAL; } } - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - retval = -ENOMEM; - if (!dio) + *end_ret = end; + *blkbits_ret = blkbits; + return 0; +} +EXPORT_SYMBOL(check_dio_alignment); + +/* + * This is a library function for use by filesystem drivers. + * The flags parameter is a bitmask of: + * + * DIO_PLACEHOLDERS (use placeholder pages for locking) + * DIO_CREATE (pass create=1 to get_block for filling holes) + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) + */ +ssize_t +blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + unsigned flags) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + unsigned blkbits = 0; + ssize_t retval = -EINVAL; + loff_t end = 0; + int is_async; + int grab_i_mutex = 0; + + + if (check_dio_alignment(inode, bdev, iov, offset, nr_segs, + &blkbits, &end)) goto out; - - /* - * For block device access DIO_NO_LOCKING is used, - * neither readers nor writers do any locking at all - * For regular files using DIO_LOCKING, - * No locks are taken - * For regular files using DIO_OWN_LOCKING, - * neither readers nor writers take any locks here - */ - dio->lock_type = dio_lock_type; - - if (dio->lock_type == DIO_NO_LOCKING && end > offset) { + if (rw & WRITE) { + /* + * If it's a write, unmap all mmappings of the file up-front. + * This will cause any pte dirty bits to be propagated into + * the pageframes for the subsequent filemap_write_and_wait(). 
+ */ + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, offset, end - offset, 0); + if (end <= i_size_read(inode) && (flags & DIO_DROP_I_MUTEX)) { + mutex_unlock(&inode->i_mutex); + grab_i_mutex = 1; + } + } + /* + * the placeholder code does filemap_write_and_wait, so if we + * aren't using placeholders we have to do it here + */ + if (!(flags & DIO_PLACEHOLDERS) && end > offset) { retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) goto out; @@ -1286,19 +1315,30 @@ __blockdev_direct_IO(int rw, struct kioc * even for AIO, we need to wait for i/o to complete before * returning in this case. */ - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && + is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - /* if our write is inside i_size, we can drop i_mutex */ - dio->reacquire_i_mutex = 0; - if ((rw & WRITE) && dio_lock_type == DIO_LOCKING && - end <= i_size_read(inode) && is_sync_kiocb(iocb)) { - dio->reacquire_i_mutex = 1; - mutex_unlock(&inode->i_mutex); - } retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, dio); + nr_segs, blkbits, get_block, end_io, is_async, + flags); out: + if (grab_i_mutex) + mutex_lock(&inode->i_mutex); + + if ((rw & WRITE) && mapping->nrpages) { + int err; + /* O_DIRECT is allowed to drop i_mutex, so more data + * could have been dirtied by others. 
Start io one more + * time + */ + err = filemap_write_and_wait_range(mapping, offset, end - 1); + if (!err) + err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + if (!retval && err) + retval = err; + } return retval; } -EXPORT_SYMBOL(__blockdev_direct_IO); +EXPORT_SYMBOL(blockdev_direct_IO_flags); diff -r 1a7105ab9c19 -r 04dd7ddd593e include/linux/fs.h --- a/include/linux/fs.h Tue Feb 06 20:02:55 2007 -0500 +++ b/include/linux/fs.h Tue Feb 06 20:02:56 2007 -0500 @@ -1776,24 +1776,28 @@ static inline void do_generic_file_read( } #ifdef CONFIG_BLOCK -ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +int check_dio_alignment(struct inode *inode, struct block_device *bdev, + const struct iovec *iov, loff_t offset, unsigned long nr_segs, + unsigned *blkbits_ret, loff_t *end_ret); + +ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int lock_type); - -enum { - DIO_LOCKING = 1, /* need locking between buffered and direct access */ - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ -}; + unsigned int dio_flags); + +#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */ +#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */ +#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_LOCKING); + /* locking is on, FS wants to fill holes w/get_block */ + return 
blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, DIO_PLACEHOLDERS | + DIO_CREATE | DIO_DROP_I_MUTEX); } static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, @@ -1801,17 +1805,9 @@ static inline ssize_t blockdev_direct_IO loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_NO_LOCKING); -} - -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) -{ - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_OWN_LOCKING); + /* locking is off, create is off */ + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, 0); } #endif diff -r 1a7105ab9c19 -r 04dd7ddd593e mm/filemap.c --- a/mm/filemap.c Tue Feb 06 20:02:55 2007 -0500 +++ b/mm/filemap.c Tue Feb 06 20:02:56 2007 -0500 @@ -40,7 +40,7 @@ #include <asm/mman.h> -static ssize_t +static inline ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); @@ -2817,46 +2817,12 @@ EXPORT_SYMBOL(generic_file_aio_write); * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. */ -static ssize_t +static inline ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len = 0; - - /* - * If it's a write, unmap all mmappings of the file up-front. 
This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) { - int err; - pgoff_t end = (offset + write_len - 1) - >> PAGE_CACHE_SHIFT; - - /* O_DIRECT is allowed to drop i_mutex, so more data - * could have been dirtied by others. Start io one more - * time - */ - err = filemap_fdatawrite_range(mapping, offset, - offset + write_len - 1); - if (!err) - err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err) - retval = err; - } - return retval; + return iocb->ki_filp->f_mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); } /** - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html