Re: [PATCH 4 of 8] Add flags to control direct IO helpers

Suparna Bhattacharya <suparna@xxxxxxxxxx> · Wed, 7 Feb 2007 22:38:45 +0530

On Tue, Feb 06, 2007 at 08:32:49PM -0400, Chris Mason wrote:
> This creates a number of flags so that filesystems can control
> blockdev_direct_IO.  It is based on code from Russell Cattelan.
> 
> The new flags are:
> DIO_CREATE -- always pass create=1 to get_block on writes.  This allows
> 	      DIO to fill holes in the file.
> DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered
> 	            io and truncates.
> DIO_DROP_I_MUTEX -- drop i_mutex before starting the mapping, io submission,
> 		    or io waiting.  The mutex is still dropped for AIO
> 		    as well.
> 
> Some API changes are made so that filesystems can have more control
> over the DIO features.
> 
> __blockdev_direct_IO is more or less renamed to blockdev_direct_IO_flags.
> All waiting and invalidating of page cache data is pushed down into
> blockdev_direct_IO_flags (and removed from mm/filemap.c)
> 
> direct_io_worker is exported into the wild.  Filesystems that want to be
> special can pull out the bits of blockdev_direct_IO_flags they care about
> and then call direct_io_worker directly.
> 
> Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx>
> 
> diff -r 1a7105ab9c19 -r 04dd7ddd593e fs/direct-io.c
> --- a/fs/direct-io.c	Tue Feb 06 20:02:55 2007 -0500
> +++ b/fs/direct-io.c	Tue Feb 06 20:02:56 2007 -0500
> @@ -1,4 +1,3 @@
> -					  GFP_KERNEL, 1);
>  /*
>   * fs/direct-io.c
>   *
> @@ -55,13 +54,6 @@
>   *
>   * If blkfactor is zero then the user's request was aligned to the filesystem's
>   * blocksize.
> - *
> - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
> - * This determines whether we need to do the fancy locking which prevents
> - * direct-IO from being able to read uninitialised disk blocks.  If its zero
> - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
> - * not held for the entire direct write (taken briefly, initially, during a
> - * direct read though, but its never held for the duration of a direct-IO).
>   */
> 
>  struct dio {
> @@ -70,8 +62,7 @@ struct dio {
>  	struct inode *inode;
>  	int rw;
>  	loff_t i_size;			/* i_size when submitted */
> -	int lock_type;			/* doesn't change */
> -	int reacquire_i_mutex;		/* should we get i_mutex when done? */
> +	unsigned flags;			/* locking and get_block flags */
>  	unsigned blkbits;		/* doesn't change */
>  	unsigned blkfactor;		/* When we're using an alignment which
>  					   is finer than the filesystem's soft
> @@ -211,7 +202,7 @@ out:
> 
>  static void dio_unlock_page_range(struct dio *dio)
>  {
> -	if (dio->lock_type != DIO_NO_LOCKING) {
> +	if (dio->flags & DIO_PLACEHOLDERS) {
>  		remove_placeholder_pages(dio->inode->i_mapping,
>  					 dio->fspages_start_off,
>  					 dio->fspages_end_off);
> @@ -226,7 +217,7 @@ static int dio_lock_page_range(struct di
>  	unsigned long max_size;
>  	int ret = 0;
> 
> -	if (dio->lock_type == DIO_NO_LOCKING)
> +	if (!(dio->flags & DIO_PLACEHOLDERS))
>  		return 0;
> 
>  	while (index >= dio->fspages_end_off) {
> @@ -310,9 +301,6 @@ static int dio_complete(struct dio *dio,
>  			    dio->map_bh.b_private);
>  	dio_unlock_page_range(dio);
> 
> -	if (dio->reacquire_i_mutex)
> -		mutex_lock(&dio->inode->i_mutex);
> -
>  	if (ret == 0)
>  		ret = dio->page_errors;
>  	if (ret == 0)
> @@ -597,8 +585,9 @@ static int get_more_blocks(struct dio *d
>  		map_bh->b_state = 0;
>  		map_bh->b_size = fs_count << dio->inode->i_blkbits;
> 
> -		create = dio->rw & WRITE;
> -		if (dio->lock_type == DIO_NO_LOCKING)
> +		if (dio->flags & DIO_CREATE)
> +			create = dio->rw & WRITE;
> +		else
>  			create = 0;
>  	        index = fs_startblk >> (PAGE_CACHE_SHIFT -
>  		                        dio->inode->i_blkbits);
> @@ -1014,19 +1003,41 @@ out:
>  	return ret;
>  }
> 
> -static ssize_t
> -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
> -	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
> +/*
> + * This does all the real work of the direct io.  Most filesystems want to
> + * call blockdev_direct_IO_flags instead, but if you have exotic locking
> + * routines you can call this directly.
> + *
> + * The flags parameter is a bitmask of:
> + *
> + * DIO_PLACEHOLDERS (use placeholder pages for locking)
> + * DIO_CREATE (pass create=1 to get_block for filling holes or extending)

A little more explanation about why these options are needed, and examples
of when one would specify each of these options would be good.

Regards
Suparna

> + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
> + */
> +ssize_t
> +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
> +	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
>  	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
> -	struct dio *dio)
> -{
> -	unsigned long user_addr; 
> +	int is_async, unsigned dioflags)
> +{
> +	unsigned long user_addr;
>  	unsigned long flags;
>  	int seg;
>  	ssize_t ret = 0;
>  	ssize_t ret2;
>  	size_t bytes;
> -
> +	struct dio *dio;
> +
> +	if (rw & WRITE)
> +		rw = WRITE_SYNC;
> +
> +	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
> +	ret = -ENOMEM;
> +	if (!dio)
> +		goto out;
> +
> +	dio->flags = dioflags;
> +	dio->is_async = is_async;
>  	dio->bio = NULL;
>  	dio->inode = inode;
>  	dio->rw = rw;
> @@ -1057,7 +1068,7 @@ direct_io_worker(int rw, struct kiocb *i
>  	dio->bio_list = NULL;
>  	dio->waiter = NULL;
> 
> -	if (dio->lock_type != DIO_NO_LOCKING) {
> +	if (dio->flags & DIO_PLACEHOLDERS) {
>  		dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
>  		dio->fspages_end_off = dio->fspages_start_off;
> 
> @@ -1192,33 +1203,24 @@ direct_io_worker(int rw, struct kiocb *i
>  	} else
>  		BUG_ON(ret != -EIOCBQUEUED);
> 
> +out:
>  	return ret;
>  }
> -
> -/*
> - * This is a library function for use by filesystem drivers.
> - * The locking rules are governed by the dio_lock_type parameter.
> - *
> - * DIO_NO_LOCKING (no locking, for raw block device access)
> - * For writes, i_mutex is not held on entry; it is never taken.
> - *
> - * DIO_LOCKING (simple locking for regular files)
> - * For writes we are called under i_mutex and return with i_mutex held, even
> - * though it is internally dropped.
> - *
> - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
> - *	uninitialised data, allowing parallel direct readers and writers)
> - * For writes we are called without i_mutex, return without it, never touch it.
> - * For reads we are called under i_mutex and return with i_mutex held, even
> - * though it may be internally dropped.
> - *
> - * Additional i_alloc_sem locking requirements described inline below.
> - */
> -ssize_t
> -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> -	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
> -	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> -	int dio_lock_type)
> +EXPORT_SYMBOL(direct_io_worker);
> +
> +/*
> + * A utility function for blockdev_direct_IO_flags; this checks
> + * alignment of a O_DIRECT iovec against filesystem and blockdevice
> + * requirements.
> + *
> + * It returns a blkbits value that will work for the io, and returns the
> + * end offset of the io (via blkbits_ret and end_ret).
> + *
> + * The function returns 0 if everything will work or -EINVAL on error
> + */
> +int check_dio_alignment(struct inode *inode, struct block_device *bdev,
> +			const struct iovec *iov, loff_t offset, unsigned long nr_segs,
> +			unsigned *blkbits_ret, loff_t *end_ret)
>  {
>  	int seg;
>  	size_t size;
> @@ -1226,13 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc
>  	unsigned blkbits = inode->i_blkbits;
>  	unsigned bdev_blkbits = 0;
>  	unsigned blocksize_mask = (1 << blkbits) - 1;
> -	ssize_t retval = -EINVAL;
>  	loff_t end = offset;
> -	struct dio *dio;
> -	struct address_space *mapping = iocb->ki_filp->f_mapping;
> -
> -	if (rw & WRITE)
> -		rw = WRITE_SYNC;
> 
>  	if (bdev)
>  		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
> @@ -1242,7 +1238,7 @@ __blockdev_direct_IO(int rw, struct kioc
>  			 blkbits = bdev_blkbits;
>  		blocksize_mask = (1 << blkbits) - 1;
>  		if (offset & blocksize_mask)
> -			goto out;
> +			return -EINVAL;
>  	}
> 
>  	/* Check the memory alignment.  Blocks cannot straddle pages */
> @@ -1254,27 +1250,60 @@ __blockdev_direct_IO(int rw, struct kioc
>  			if (bdev)
>  				 blkbits = bdev_blkbits;
>  			blocksize_mask = (1 << blkbits) - 1;
> -			if ((addr & blocksize_mask) || (size & blocksize_mask))  
> -				goto out;
> +			if ((addr & blocksize_mask) || (size & blocksize_mask))
> +				return -EINVAL;
>  		}
>  	}
> -	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
> -	retval = -ENOMEM;
> -	if (!dio)
> +	*end_ret = end;
> +	*blkbits_ret = blkbits;
> +	return 0;
> +}
> +EXPORT_SYMBOL(check_dio_alignment);
> +
> +/*
> + * This is a library function for use by filesystem drivers.
> + * The flags parameter is a bitmask of:
> + *
> + * DIO_PLACEHOLDERS (use placeholder pages for locking)
> + * DIO_CREATE (pass create=1 to get_block for filling holes)
> + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
> + */
> +ssize_t
> +blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode,
> +	struct block_device *bdev, const struct iovec *iov, loff_t offset,
> +	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> +	unsigned flags)
> +{
> +	struct address_space *mapping = iocb->ki_filp->f_mapping;
> +	unsigned blkbits = 0;
> +	ssize_t retval = -EINVAL;
> +	loff_t end = 0;
> +	int is_async;
> +	int grab_i_mutex = 0;
> +
> +
> +	if (check_dio_alignment(inode, bdev, iov, offset, nr_segs,
> +				&blkbits, &end))
>  		goto out;
> 
> -
> -	/*
> -	 * For block device access DIO_NO_LOCKING is used,
> -	 *	neither readers nor writers do any locking at all
> -	 * For regular files using DIO_LOCKING,
> -	 *	No locks are taken
> -	 * For regular files using DIO_OWN_LOCKING,
> -	 *	neither readers nor writers take any locks here
> -	 */
> -	dio->lock_type = dio_lock_type;
> -
> -	if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
> +	if (rw & WRITE) {
> +		/*
> +		 * If it's a write, unmap all mmappings of the file up-front.
> +		 * This will cause any pte dirty bits to be propagated into
> +		 * the pageframes for the subsequent filemap_write_and_wait().
> +		 */
> +		if (mapping_mapped(mapping))
> +			unmap_mapping_range(mapping, offset, end - offset, 0);
> +		if (end <= i_size_read(inode) && (flags & DIO_DROP_I_MUTEX)) {
> +			mutex_unlock(&inode->i_mutex);
> +			grab_i_mutex = 1;
> +		}
> +	}
> +	/*
> +	 * the placeholder code does filemap_write_and_wait, so if we
> +	 * aren't using placeholders we have to do it here
> +	 */
> +	if (!(flags & DIO_PLACEHOLDERS) && end > offset) {
>  		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
>  		if (retval)
>  			goto out;
> @@ -1286,19 +1315,30 @@ __blockdev_direct_IO(int rw, struct kioc
>  	 * even for AIO, we need to wait for i/o to complete before
>  	 * returning in this case.
>  	 */
> -	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
> +	is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
>  		(end > i_size_read(inode)));
> 
> -	/* if our write is inside i_size, we can drop i_mutex */
> -	dio->reacquire_i_mutex = 0;
> -	if ((rw & WRITE) && dio_lock_type == DIO_LOCKING &&
> -	   end <= i_size_read(inode) && is_sync_kiocb(iocb)) {
> -		dio->reacquire_i_mutex = 1;
> -		mutex_unlock(&inode->i_mutex);
> -	}
>  	retval = direct_io_worker(rw, iocb, inode, iov, offset,
> -				nr_segs, blkbits, get_block, end_io, dio);
> +				nr_segs, blkbits, get_block, end_io, is_async,
> +				flags);
>  out:
> +	if (grab_i_mutex)
> +		mutex_lock(&inode->i_mutex);
> +
> +	if ((rw & WRITE) && mapping->nrpages) {
> +		int err;
> +		/* O_DIRECT is allowed to drop i_mutex, so more data
> +		 * could have been dirtied by others.  Start io one more
> +		 * time
> +		 */
> +		err = filemap_write_and_wait_range(mapping, offset, end - 1);
> +		if (!err)
> +			err = invalidate_inode_pages2_range(mapping,
> +					offset >> PAGE_CACHE_SHIFT,
> +					(end - 1) >> PAGE_CACHE_SHIFT);
> +		if (!retval && err)
> +			retval = err;
> +	}
>  	return retval;
>  }
> -EXPORT_SYMBOL(__blockdev_direct_IO);
> +EXPORT_SYMBOL(blockdev_direct_IO_flags);
> diff -r 1a7105ab9c19 -r 04dd7ddd593e include/linux/fs.h
> --- a/include/linux/fs.h	Tue Feb 06 20:02:55 2007 -0500
> +++ b/include/linux/fs.h	Tue Feb 06 20:02:56 2007 -0500
> @@ -1776,24 +1776,28 @@ static inline void do_generic_file_read(
>  }
> 
>  #ifdef CONFIG_BLOCK
> -ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> +int check_dio_alignment(struct inode *inode, struct block_device *bdev,
> +                        const struct iovec *iov, loff_t offset, unsigned long nr_segs,
> +			                        unsigned *blkbits_ret, loff_t *end_ret);
> +
> +ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode,
>  	struct block_device *bdev, const struct iovec *iov, loff_t offset,
>  	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> -	int lock_type);
> -
> -enum {
> -	DIO_LOCKING = 1, /* need locking between buffered and direct access */
> -	DIO_NO_LOCKING,  /* bdev; no locking at all between buffered/direct */
> -	DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
> -};
> +	unsigned int dio_flags);
> +
> +#define DIO_PLACEHOLDERS (1 << 0)  /* insert placeholder pages */
> +#define DIO_CREATE	(1 << 1)  /* pass create=1 to get_block when writing */
> +#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */
> 
>  static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
>  	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
>  	loff_t offset, unsigned long nr_segs, get_block_t get_block,
>  	dio_iodone_t end_io)
>  {
> -	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> -				nr_segs, get_block, end_io, DIO_LOCKING);
> +	/* locking is on, FS wants to fill holes w/get_block */
> +	return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset,
> +				nr_segs, get_block, end_io, DIO_PLACEHOLDERS |
> +				DIO_CREATE | DIO_DROP_I_MUTEX);
>  }
> 
>  static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
> @@ -1801,17 +1805,9 @@ static inline ssize_t blockdev_direct_IO
>  	loff_t offset, unsigned long nr_segs, get_block_t get_block,
>  	dio_iodone_t end_io)
>  {
> -	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> -				nr_segs, get_block, end_io, DIO_NO_LOCKING);
> -}
> -
> -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
> -	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
> -	loff_t offset, unsigned long nr_segs, get_block_t get_block,
> -	dio_iodone_t end_io)
> -{
> -	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> -				nr_segs, get_block, end_io, DIO_OWN_LOCKING);
> +	/* locking is off, create is off */
> +	return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset,
> +				nr_segs, get_block, end_io, 0);
>  }
>  #endif
> 
> diff -r 1a7105ab9c19 -r 04dd7ddd593e mm/filemap.c
> --- a/mm/filemap.c	Tue Feb 06 20:02:55 2007 -0500
> +++ b/mm/filemap.c	Tue Feb 06 20:02:56 2007 -0500
> @@ -40,7 +40,7 @@
> 
>  #include <asm/mman.h>
> 
> -static ssize_t
> +static inline ssize_t
>  generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
>  	loff_t offset, unsigned long nr_segs);
> 
> @@ -2817,46 +2817,12 @@ EXPORT_SYMBOL(generic_file_aio_write);
>   * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
>   * went wrong during pagecache shootdown.
>   */
> -static ssize_t
> +static inline ssize_t
>  generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
>  	loff_t offset, unsigned long nr_segs)
>  {
> -	struct file *file = iocb->ki_filp;
> -	struct address_space *mapping = file->f_mapping;
> -	ssize_t retval;
> -	size_t write_len = 0;
> -
> -	/*
> -	 * If it's a write, unmap all mmappings of the file up-front.  This
> -	 * will cause any pte dirty bits to be propagated into the pageframes
> -	 * for the subsequent filemap_write_and_wait().
> -	 */
> -	if (rw == WRITE) {
> -		write_len = iov_length(iov, nr_segs);
> -	       	if (mapping_mapped(mapping))
> -			unmap_mapping_range(mapping, offset, write_len, 0);
> -	}
> -
> -	retval = mapping->a_ops->direct_IO(rw, iocb, iov,
> -					offset, nr_segs);
> -	if (rw == WRITE && mapping->nrpages) {
> -		int err;
> -		pgoff_t end = (offset + write_len - 1)
> -					>> PAGE_CACHE_SHIFT;
> -
> -		/* O_DIRECT is allowed to drop i_mutex, so more data
> -		 * could have been dirtied by others.  Start io one more
> -		 * time
> -		 */
> -		err = filemap_fdatawrite_range(mapping, offset,
> -		                               offset + write_len - 1);
> -		if (!err)
> -			err = invalidate_inode_pages2_range(mapping,
> -					offset >> PAGE_CACHE_SHIFT, end);
> -		if (err)
> -			retval = err;
> -	}
> -	return retval;
> +	return iocb->ki_filp->f_mapping->a_ops->direct_IO(rw, iocb, iov,
> +							  offset, nr_segs);
>  }
> 
>  /**
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Suparna Bhattacharya (suparna@xxxxxxxxxx)
Linux Technology Center
IBM Software Lab, India

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html