This creates a number of flags so that filesystems can control blockdev_direct_IO. It is based on code from Russell Cattelan. The new flags are: DIO_CREATE -- always pass create=1 to get_block on writes. This allows DIO to fill holes in the file. DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered IO and truncates. DIO_EXTEND -- use truncate to grow the file instead of falling back to buffered IO. DIO_DROP_I_MUTEX -- drop i_mutex before starting the IO on writes. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 3fa8c25ec60f -r f84d3216430d fs/direct-io.c --- a/fs/direct-io.c Wed Nov 01 10:22:34 2006 -0500 +++ b/fs/direct-io.c Wed Nov 01 10:24:03 2006 -0500 @@ -53,13 +53,6 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. - * - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. - * This determines whether we need to do the fancy locking which prevents - * direct-IO from being able to read uninitialised disk blocks. If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is - * not held for the entire direct write (taken briefly, initially, during a - * direct read though, but its never held for the duration of a direct-IO). */ struct dio { @@ -68,7 +61,7 @@ struct dio { struct inode *inode; int rw; loff_t i_size; /* i_size when submitted */ - int lock_type; /* doesn't change */ + unsigned flags; /* doesn't change */ int reacquire_i_mutex; /* should we get i_mutex when done? 
*/ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which @@ -203,7 +196,7 @@ static void unlock_page_range(struct dio static void unlock_page_range(struct dio *dio, unsigned long start, unsigned long nr) { - if (dio->lock_type != DIO_NO_LOCKING) { + if (dio->flags & DIO_PLACEHOLDERS) { remove_placeholder_pages(dio->inode->i_mapping, dio->tmppages, &dio->fake, start, start + nr, @@ -218,11 +211,13 @@ static int lock_page_range(struct dio *d struct page *fake = &dio->fake; unsigned long end = start + nr; - if (dio->lock_type == DIO_NO_LOCKING) - return 0; - return find_or_insert_placeholders(mapping, dio->tmppages, start, end, - ARRAY_SIZE(dio->tmppages), - GFP_KERNEL, fake, 1); + if (dio->flags & DIO_PLACEHOLDERS) { + return find_or_insert_placeholders(mapping, dio->tmppages, + start, end, + ARRAY_SIZE(dio->tmppages), + GFP_KERNEL, fake, 1); + } + return 0; } @@ -556,6 +551,7 @@ static int get_more_blocks(struct dio *d unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; unsigned long index; + unsigned long end; int create; /* @@ -575,8 +571,9 @@ static int get_more_blocks(struct dio *d map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; - create = dio->rw & WRITE; - if (dio->lock_type == DIO_NO_LOCKING) + if (dio->flags & DIO_CREATE) + create = dio->rw & WRITE; + else create = 0; index = fs_startblk >> (PAGE_CACHE_SHIFT - dio->inode->i_blkbits); @@ -1193,28 +1190,17 @@ direct_io_worker(int rw, struct kiocb *i /* * This is a library function for use by filesystem drivers. - * The locking rules are governed by the dio_lock_type parameter. - * - * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_mutex is not held on entry; it is never taken. - * - * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even - * though it is internally dropped. 
- * - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of - * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_mutex, return without it, never touch it. - * For reads we are called under i_mutex and return with i_mutex held, even - * though it may be internally dropped. - * - * Additional i_alloc_sem locking requirements described inline below. + * The flags parameter is a bitmask of: + * + * DIO_PLACEHOLDERS (use placeholder pages for locking) + * DIO_CREATE (pass create=1 to get_block for filling holes) + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes) */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int dio_lock_type) + unsigned flags) { int seg; size_t size; @@ -1225,7 +1211,6 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - struct address_space *mapping = iocb->ki_filp->f_mapping; if (rw & WRITE) rw = WRITE_SYNC; @@ -1271,9 +1256,14 @@ __blockdev_direct_IO(int rw, struct kioc * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here */ - dio->lock_type = dio_lock_type; - - if (dio->lock_type == DIO_NO_LOCKING && end > offset) { + dio->flags = flags; + + /* + * the placeholder code does filemap_write_and_wait, so if we + * aren't using placeholders we have to do it here + */ + if (!(dio->flags & DIO_PLACEHOLDERS) && end > offset) { + struct address_space *mapping = iocb->ki_filp->f_mapping; retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) goto out; @@ -1296,11 +1286,12 @@ __blockdev_direct_IO(int rw, struct kioc * mmap'd writes using writepage to fill holes */ dio->reacquire_i_mutex = 0; - if ((rw & WRITE) && dio_lock_type == DIO_LOCKING) { + if (rw & WRITE) { /* if our write goes past i_size, 
do an expanding * truncate to fill it before dropping i_mutex */ - if (end > i_size_read(inode) && iocb->ki_filp) { + if ((dio->flags & DIO_EXTEND) && end > i_size_read(inode) && + iocb->ki_filp) { struct iattr newattrs; newattrs.ia_size = end; newattrs.ia_file = iocb->ki_filp; @@ -1310,7 +1301,7 @@ __blockdev_direct_IO(int rw, struct kioc if (retval) goto out; } - if (is_sync_kiocb(iocb)) { + if ((dio->flags & DIO_DROP_I_MUTEX) && is_sync_kiocb(iocb)) { dio->reacquire_i_mutex = 1; mutex_unlock(&inode->i_mutex); } diff -r 3fa8c25ec60f -r f84d3216430d include/linux/fs.h --- a/include/linux/fs.h Wed Nov 01 10:22:34 2006 -0500 +++ b/include/linux/fs.h Wed Nov 01 10:24:03 2006 -0500 @@ -1801,21 +1801,32 @@ ssize_t __blockdev_direct_IO(int rw, str ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int lock_type); - -enum { - DIO_LOCKING = 1, /* need locking between buffered and direct access */ - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ -}; + unsigned int dio_flags); + +#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */ +#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */ +#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */ +#define DIO_EXTEND (1 << 3) /* extend the file w/truncate if needed */ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { + /* locking is on, FS wants to fill holes w/get_block */ return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_LOCKING); + nr_segs, get_block, end_io, DIO_PLACEHOLDERS | + DIO_CREATE | DIO_DROP_I_MUTEX | 
DIO_EXTEND); +} + +static inline ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_block_t get_block, + dio_iodone_t end_io, unsigned int flags) +{ + /* file system dictates locking and create behavior */ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, flags); } static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, @@ -1823,17 +1834,9 @@ static inline ssize_t blockdev_direct_IO loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io) { + /* locking is off, create is off */ return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_NO_LOCKING); -} - -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) -{ - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, DIO_OWN_LOCKING); + nr_segs, get_block, end_io, 0); } #endif - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html