On Tue, Feb 06, 2007 at 08:32:49PM -0400, Chris Mason wrote: > This creates a number of flags so that filesystems can control > blockdev_direct_IO. It is based on code from Russell Cettelan. > > The new flags are: > DIO_CREATE -- always pass create=1 to get_block on writes. This allows > DIO to fill holes in the file. > DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered > io and truncates. > DIO_DROP_I_MUTEX -- drop i_mutex before starting the mapping, io submission, > or io waiting. The mutex is still dropped for AIO > as well. > > Some API changes are made so that filesystems can have more control > over the DIO features. > > __blockdev_direct_IO is more or less renamed to blockdev_direct_IO_flags. > All waiting and invalidating of page cache data is pushed down into > blockdev_direct_IO_flags (and removed from mm/filemap.c) > > direct_io_worker is exported into the wild. Filesystems that want to be > special can pull out the bits of blockdev_direct_IO_flags they care about > and then call direct_io_worker directly. > > Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> > > diff -r 1a7105ab9c19 -r 04dd7ddd593e fs/direct-io.c > --- a/fs/direct-io.c Tue Feb 06 20:02:55 2007 -0500 > +++ b/fs/direct-io.c Tue Feb 06 20:02:56 2007 -0500 > @@ -1,4 +1,3 @@ > - GFP_KERNEL, 1); > /* > * fs/direct-io.c > * > @@ -55,13 +54,6 @@ > * > * If blkfactor is zero then the user's request was aligned to the filesystem's > * blocksize. > - * > - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. > - * This determines whether we need to do the fancy locking which prevents > - * direct-IO from being able to read uninitialised disk blocks. If its zero > - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is > - * not held for the entire direct write (taken briefly, initially, during a > - * direct read though, but its never held for the duration of a direct-IO). > */ > > struct dio { > @@ -70,8 +62,7 @@ struct dio { > struct inode *inode; > int rw; > loff_t i_size; /* i_size when submitted */ > - int lock_type; /* doesn't change */ > - int reacquire_i_mutex; /* should we get i_mutex when done? */ > + unsigned flags; /* locking and get_block flags */ > unsigned blkbits; /* doesn't change */ > unsigned blkfactor; /* When we're using an alignment which > is finer than the filesystem's soft > @@ -211,7 +202,7 @@ out: > > static void dio_unlock_page_range(struct dio *dio) > { > - if (dio->lock_type != DIO_NO_LOCKING) { > + if (dio->flags & DIO_PLACEHOLDERS) { > remove_placeholder_pages(dio->inode->i_mapping, > dio->fspages_start_off, > dio->fspages_end_off); > @@ -226,7 +217,7 @@ static int dio_lock_page_range(struct di > unsigned long max_size; > int ret = 0; > > - if (dio->lock_type == DIO_NO_LOCKING) > + if (!(dio->flags & DIO_PLACEHOLDERS)) > return 0; > > while (index >= dio->fspages_end_off) { > @@ -310,9 +301,6 @@ static int dio_complete(struct dio *dio, > dio->map_bh.b_private); > dio_unlock_page_range(dio); > > - if (dio->reacquire_i_mutex) > - mutex_lock(&dio->inode->i_mutex); > - > if (ret == 0) > ret = dio->page_errors; > if (ret == 0) > @@ -597,8 +585,9 @@ static int get_more_blocks(struct dio *d > map_bh->b_state = 0; > map_bh->b_size = fs_count << dio->inode->i_blkbits; > > - create = dio->rw & WRITE; > - if (dio->lock_type == DIO_NO_LOCKING) > + if (dio->flags & DIO_CREATE) > + create = dio->rw & WRITE; > + else > create = 0; > index = fs_startblk >> (PAGE_CACHE_SHIFT - > dio->inode->i_blkbits); > @@ -1014,19 +1003,41 @@ out: > return ret; > } > > -static ssize_t > -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, > - const struct iovec *iov, loff_t offset, unsigned long nr_segs, > +/* > + * This does all the real work of the direct io. Most filesystems want to > + * call blockdev_direct_IO_flags instead, but if you have exotic locking > + * routines you can call this directly. > + * > + * The flags parameter is a bitmask of: > + * > + * DIO_PLACEHOLDERS (use placeholder pages for locking) > + * DIO_CREATE (pass create=1 to get_block for filling holes or extending) A little more explanation about why these options are needed, and examples of when one would specify each of these options would be good. Regards Suparna > + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) > + */ > +ssize_t > +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, > + const struct iovec *iov, loff_t offset, unsigned long nr_segs, > unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, > - struct dio *dio) > -{ > - unsigned long user_addr; > + int is_async, unsigned dioflags) > +{ > + unsigned long user_addr; > unsigned long flags; > int seg; > ssize_t ret = 0; > ssize_t ret2; > size_t bytes; > - > + struct dio *dio; > + > + if (rw & WRITE) > + rw = WRITE_SYNC; > + > + dio = kmalloc(sizeof(*dio), GFP_KERNEL); > + ret = -ENOMEM; > + if (!dio) > + goto out; > + > + dio->flags = dioflags; > + dio->is_async = is_async; > dio->bio = NULL; > dio->inode = inode; > dio->rw = rw; > @@ -1057,7 +1068,7 @@ direct_io_worker(int rw, struct kiocb *i > dio->bio_list = NULL; > dio->waiter = NULL; > > - if (dio->lock_type != DIO_NO_LOCKING) { > + if (dio->flags & DIO_PLACEHOLDERS) { > dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT; > dio->fspages_end_off = dio->fspages_start_off; > > @@ -1192,33 +1203,24 @@ direct_io_worker(int rw, struct kiocb *i > } else > BUG_ON(ret != -EIOCBQUEUED); > > +out: > return ret; > } > - > -/* > - * This is a library function for use by filesystem drivers. > - * The locking rules are governed by the dio_lock_type parameter. > - * > - * DIO_NO_LOCKING (no locking, for raw block device access) > - * For writes, i_mutex is not held on entry; it is never taken. > - * > - * DIO_LOCKING (simple locking for regular files) > - * For writes we are called under i_mutex and return with i_mutex held, even > - * though it is internally dropped. > - * > - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of > - * uninitialised data, allowing parallel direct readers and writers) > - * For writes we are called without i_mutex, return without it, never touch it. > - * For reads we are called under i_mutex and return with i_mutex held, even > - * though it may be internally dropped. > - * > - * Additional i_alloc_sem locking requirements described inline below. > - */ > -ssize_t > -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, > - struct block_device *bdev, const struct iovec *iov, loff_t offset, > - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, > - int dio_lock_type) > +EXPORT_SYMBOL(direct_io_worker); > + > +/* > + * A utility function fro blockdev_direct_IO_flags, this checks > + * alignment of a O_DIRECT iovec against filesystem and blockdevice > + * requirements. > + * > + * It returns a blkbits value that will work for the io, and returns the > + * end offset of the io (via blkbits_ret and end_ret). > + * > + * The function returns 0 if everything will work or -EINVAL on error > + */ > +int check_dio_alignment(struct inode *inode, struct block_device *bdev, > + const struct iovec *iov, loff_t offset, unsigned long nr_segs, > + unsigned *blkbits_ret, loff_t *end_ret) > { > int seg; > size_t size; > @@ -1226,13 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc > unsigned blkbits = inode->i_blkbits; > unsigned bdev_blkbits = 0; > unsigned blocksize_mask = (1 << blkbits) - 1; > - ssize_t retval = -EINVAL; > loff_t end = offset; > - struct dio *dio; > - struct address_space *mapping = iocb->ki_filp->f_mapping; > - > - if (rw & WRITE) > - rw = WRITE_SYNC; > > if (bdev) > bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); > @@ -1242,7 +1238,7 @@ __blockdev_direct_IO(int rw, struct kioc > blkbits = bdev_blkbits; > blocksize_mask = (1 << blkbits) - 1; > if (offset & blocksize_mask) > - goto out; > + return -EINVAL; > } > > /* Check the memory alignment. Blocks cannot straddle pages */ > @@ -1254,27 +1250,60 @@ __blockdev_direct_IO(int rw, struct kioc > if (bdev) > blkbits = bdev_blkbits; > blocksize_mask = (1 << blkbits) - 1; > - if ((addr & blocksize_mask) || (size & blocksize_mask)) > - goto out; > + if ((addr & blocksize_mask) || (size & blocksize_mask)) > + return -EINVAL; > } > } > - dio = kmalloc(sizeof(*dio), GFP_KERNEL); > - retval = -ENOMEM; > - if (!dio) > + *end_ret = end; > + *blkbits_ret = blkbits; > + return 0; > +} > +EXPORT_SYMBOL(check_dio_alignment); > + > +/* > + * This is a library function for use by filesystem drivers. > + * The flags parameter is a bitmask of: > + * > + * DIO_PLACEHOLDERS (use placeholder pages for locking) > + * DIO_CREATE (pass create=1 to get_block for filling holes) > + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) > + */ > +ssize_t > +blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode, > + struct block_device *bdev, const struct iovec *iov, loff_t offset, > + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, > + unsigned flags) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + unsigned blkbits = 0; > + ssize_t retval = -EINVAL; > + loff_t end = 0; > + int is_async; > + int grab_i_mutex = 0; > + > + > + if (check_dio_alignment(inode, bdev, iov, offset, nr_segs, > + &blkbits, &end)) > goto out; > > - > - /* > - * For block device access DIO_NO_LOCKING is used, > - * neither readers nor writers do any locking at all > - * For regular files using DIO_LOCKING, > - * No locks are taken > - * For regular files using DIO_OWN_LOCKING, > - * neither readers nor writers take any locks here > - */ > - dio->lock_type = dio_lock_type; > - > - if (dio->lock_type == DIO_NO_LOCKING && end > offset) { > + if (rw & WRITE) { > + /* > + * If it's a write, unmap all mmappings of the file up-front. > + * This will cause any pte dirty bits to be propagated into > + * the pageframes for the subsequent filemap_write_and_wait(). > + */ > + if (mapping_mapped(mapping)) > + unmap_mapping_range(mapping, offset, end - offset, 0); > + if (end <= i_size_read(inode) && (flags & DIO_DROP_I_MUTEX)) { > + mutex_unlock(&inode->i_mutex); > + grab_i_mutex = 1; > + } > + } > + /* > + * the placeholder code does filemap_write_and_wait, so if we > + * aren't using placeholders we have to do it here > + */ > + if (!(flags & DIO_PLACEHOLDERS) && end > offset) { > retval = filemap_write_and_wait_range(mapping, offset, end - 1); > if (retval) > goto out; > @@ -1286,19 +1315,30 @@ __blockdev_direct_IO(int rw, struct kioc > * even for AIO, we need to wait for i/o to complete before > * returning in this case. > */ > - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && > + is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && > (end > i_size_read(inode))); > > - /* if our write is inside i_size, we can drop i_mutex */ > - dio->reacquire_i_mutex = 0; > - if ((rw & WRITE) && dio_lock_type == DIO_LOCKING && > - end <= i_size_read(inode) && is_sync_kiocb(iocb)) { > - dio->reacquire_i_mutex = 1; > - mutex_unlock(&inode->i_mutex); > - } > retval = direct_io_worker(rw, iocb, inode, iov, offset, > - nr_segs, blkbits, get_block, end_io, dio); > + nr_segs, blkbits, get_block, end_io, is_async, > + flags); > out: > + if (grab_i_mutex) > + mutex_lock(&inode->i_mutex); > + > + if ((rw & WRITE) && mapping->nrpages) { > + int err; > + /* O_DIRECT is allowed to drop i_mutex, so more data > + * could have been dirtied by others. Start io one more > + * time > + */ > + err = filemap_write_and_wait_range(mapping, offset, end - 1); > + if (!err) > + err = invalidate_inode_pages2_range(mapping, > + offset >> PAGE_CACHE_SHIFT, > + (end - 1) >> PAGE_CACHE_SHIFT); > + if (!retval && err) > + retval = err; > + } > return retval; > } > -EXPORT_SYMBOL(__blockdev_direct_IO); > +EXPORT_SYMBOL(blockdev_direct_IO_flags); > diff -r 1a7105ab9c19 -r 04dd7ddd593e include/linux/fs.h > --- a/include/linux/fs.h Tue Feb 06 20:02:55 2007 -0500 > +++ b/include/linux/fs.h Tue Feb 06 20:02:56 2007 -0500 > @@ -1776,24 +1776,28 @@ static inline void do_generic_file_read( > } > > #ifdef CONFIG_BLOCK > -ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, > +int check_dio_alignment(struct inode *inode, struct block_device *bdev, > + const struct iovec *iov, loff_t offset, unsigned long nr_segs, > + unsigned *blkbits_ret, loff_t *end_ret); > + > +ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode, > struct block_device *bdev, const struct iovec *iov, loff_t offset, > unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, > - int lock_type); > - > -enum { > - DIO_LOCKING = 1, /* need locking between buffered and direct access */ > - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ > - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ > -}; > + unsigned int dio_flags); > + > +#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */ > +#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */ > +#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */ > > static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, > struct inode *inode, struct block_device *bdev, const struct iovec *iov, > loff_t offset, unsigned long nr_segs, get_block_t get_block, > dio_iodone_t end_io) > { > - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, > - nr_segs, get_block, end_io, DIO_LOCKING); > + /* locking is on, FS wants to fill holes w/get_block */ > + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset, > + nr_segs, get_block, end_io, DIO_PLACEHOLDERS | > + DIO_CREATE | DIO_DROP_I_MUTEX); > } > > static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, > @@ -1801,17 +1805,9 @@ static inline ssize_t blockdev_direct_IO > loff_t offset, unsigned long nr_segs, get_block_t get_block, > dio_iodone_t end_io) > { > - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, > - nr_segs, get_block, end_io, DIO_NO_LOCKING); > -} > - > -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, > - struct inode *inode, struct block_device *bdev, const struct iovec *iov, > - loff_t offset, unsigned long nr_segs, get_block_t get_block, > - dio_iodone_t end_io) > -{ > - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, > - nr_segs, get_block, end_io, DIO_OWN_LOCKING); > + /* locking is off, create is off */ > + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset, > + nr_segs, get_block, end_io, 0); > } > #endif > > diff -r 1a7105ab9c19 -r 04dd7ddd593e mm/filemap.c > --- a/mm/filemap.c Tue Feb 06 20:02:55 2007 -0500 > +++ b/mm/filemap.c Tue Feb 06 20:02:56 2007 -0500 > @@ -40,7 +40,7 @@ > > #include <asm/mman.h> > > -static ssize_t > +static inline ssize_t > generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, > loff_t offset, unsigned long nr_segs); > > @@ -2817,46 +2817,12 @@ EXPORT_SYMBOL(generic_file_aio_write); > * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something > * went wrong during pagecache shootdown. > */ > -static ssize_t > +static inline ssize_t > generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, > loff_t offset, unsigned long nr_segs) > { > - struct file *file = iocb->ki_filp; > - struct address_space *mapping = file->f_mapping; > - ssize_t retval; > - size_t write_len = 0; > - > - /* > - * If it's a write, unmap all mmappings of the file up-front. This > - * will cause any pte dirty bits to be propagated into the pageframes > - * for the subsequent filemap_write_and_wait(). > - */ > - if (rw == WRITE) { > - write_len = iov_length(iov, nr_segs); > - if (mapping_mapped(mapping)) > - unmap_mapping_range(mapping, offset, write_len, 0); > - } > - > - retval = mapping->a_ops->direct_IO(rw, iocb, iov, > - offset, nr_segs); > - if (rw == WRITE && mapping->nrpages) { > - int err; > - pgoff_t end = (offset + write_len - 1) > - >> PAGE_CACHE_SHIFT; > - > - /* O_DIRECT is allowed to drop i_mutex, so more data > - * could have been dirtied by others. Start io one more > - * time > - */ > - err = filemap_fdatawrite_range(mapping, offset, > - offset + write_len - 1); > - if (!err) > - err = invalidate_inode_pages2_range(mapping, > - offset >> PAGE_CACHE_SHIFT, end); > - if (err) > - retval = err; > - } > - return retval; > + return iocb->ki_filp->f_mapping->a_ops->direct_IO(rw, iocb, iov, > + offset, nr_segs); > } > > /** > > > - > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Suparna Bhattacharya (suparna@xxxxxxxxxx) Linux Technology Center IBM Software Lab, India - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html