Re: [PATCH v11 20/21] ext4: Add DAX functionality

Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> · Thu, 16 Oct 2014 14:56:25 +0200

On 25-Sep-2014 04:33:37 PM, Matthew Wilcox wrote:
> From: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
> 
> This is a port of the DAX functionality found in the current version of
> ext2.
> 
> Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
> Reviewed-by: Andreas Dilger <andreas.dilger@xxxxxxxxx>
> [heavily tweaked]
> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@xxxxxxxxx>
> ---
>  Documentation/filesystems/dax.txt  |  1 +
>  Documentation/filesystems/ext4.txt |  2 +
>  fs/ext4/ext4.h                     |  6 +++
>  fs/ext4/file.c                     | 49 ++++++++++++++++++++-
>  fs/ext4/indirect.c                 | 18 +++++---
>  fs/ext4/inode.c                    | 89 ++++++++++++++++++++++++++------------
>  fs/ext4/namei.c                    | 10 ++++-
>  fs/ext4/super.c                    | 39 ++++++++++++++++-
>  8 files changed, 177 insertions(+), 37 deletions(-)
> 
> diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
> index ebcd97f..be376d9 100644
> --- a/Documentation/filesystems/dax.txt
> +++ b/Documentation/filesystems/dax.txt
> @@ -73,6 +73,7 @@ or a write()) work correctly.
>  
>  These filesystems may be used for inspiration:
>  - ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
> +- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
>  
>  
>  Shortcomings
> diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
> index 919a329..9c511c4 100644
> --- a/Documentation/filesystems/ext4.txt
> +++ b/Documentation/filesystems/ext4.txt
> @@ -386,6 +386,8 @@ max_dir_size_kb=n	This limits the size of directories so that any
>  i_version		Enable 64-bit inode version support. This option is
>  			off by default.
>  
> +dax			Use direct access if possible
> +
>  Data Mode
>  =========
>  There are 3 different data modes:
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index b0c225c..5b38569 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -969,6 +969,11 @@ struct ext4_inode_info {
>  #define EXT4_MOUNT_ERRORS_MASK		0x00070
>  #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
>  #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
> +#ifdef CONFIG_FS_DAX
> +#define EXT4_MOUNT_DAX			0x00200	/* Execute in place */

The comment above still says "Execute in place"; it should say "Direct Access" (DAX) instead.

> +#else
> +#define EXT4_MOUNT_DAX			0
> +#endif
>  #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
>  #define EXT4_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
>  #define EXT4_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
> @@ -2574,6 +2579,7 @@ extern const struct file_operations ext4_dir_operations;
>  /* file.c */
>  extern const struct inode_operations ext4_file_inode_operations;
>  extern const struct file_operations ext4_file_operations;
> +extern const struct file_operations ext4_dax_file_operations;
>  extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
>  
>  /* inline.c */
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index aca7b24..9c7bde5 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  	struct mutex *aio_mutex = NULL;
>  	struct blk_plug plug;
> -	int o_direct = file->f_flags & O_DIRECT;
> +	int o_direct = io_is_direct(file);
>  	int overwrite = 0;
>  	size_t length = iov_iter_count(from);
>  	ssize_t ret;
> @@ -191,6 +191,27 @@ errout:
>  	return ret;
>  }
>  
> +#ifdef CONFIG_FS_DAX
> +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_fault(vma, vmf, ext4_get_block);
> +					/* Is this the right get_block? */

Perhaps this needs a TODO, FIXME or XXX marker to make sure an ext4
maintainer does not miss this question.

> +}
> +
> +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_mkwrite(vma, vmf, ext4_get_block);
> +}
> +
> +static const struct vm_operations_struct ext4_dax_vm_ops = {
> +	.fault		= ext4_dax_fault,
> +	.page_mkwrite	= ext4_dax_mkwrite,
> +	.remap_pages	= generic_file_remap_pages,
> +};
> +#else
> +#define ext4_dax_vm_ops	ext4_file_vm_ops
> +#endif
> +
>  static const struct vm_operations_struct ext4_file_vm_ops = {
>  	.fault		= filemap_fault,
>  	.map_pages	= filemap_map_pages,
> @@ -201,7 +222,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
>  static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	file_accessed(file);
> -	vma->vm_ops = &ext4_file_vm_ops;
> +	if (IS_DAX(file_inode(file))) {
> +		vma->vm_ops = &ext4_dax_vm_ops;
> +		vma->vm_flags |= VM_MIXEDMAP;
> +	} else {
> +		vma->vm_ops = &ext4_file_vm_ops;
> +	}
>  	return 0;
>  }
>  
> @@ -600,6 +626,25 @@ const struct file_operations ext4_file_operations = {
>  	.fallocate	= ext4_fallocate,
>  };
>  
> +#ifdef CONFIG_FS_DAX
> +const struct file_operations ext4_dax_file_operations = {
> +	.llseek		= ext4_llseek,
> +	.read		= new_sync_read,
> +	.write		= new_sync_write,
> +	.read_iter	= generic_file_read_iter,
> +	.write_iter	= ext4_file_write_iter,
> +	.unlocked_ioctl = ext4_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= ext4_compat_ioctl,
> +#endif
> +	.mmap		= ext4_file_mmap,
> +	.open		= ext4_file_open,
> +	.release	= ext4_release_file,
> +	.fsync		= ext4_sync_file,
> +	.fallocate	= ext4_fallocate,

Perhaps adding comments saying that .splice_read and .splice_write are
unavailable here would help understanding why we need a different file
operations structure.

> +};
> +#endif
> +
>  const struct inode_operations ext4_file_inode_operations = {
>  	.setattr	= ext4_setattr,
>  	.getattr	= ext4_getattr,
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index e75f840..fa9ec8d 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -691,14 +691,22 @@ retry:
>  			inode_dio_done(inode);
>  			goto locked;
>  		}
> -		ret = __blockdev_direct_IO(rw, iocb, inode,
> -				 inode->i_sb->s_bdev, iter, offset,
> -				 ext4_get_block, NULL, NULL, 0);
> +		if (IS_DAX(inode))
> +			ret = dax_do_io(rw, iocb, inode, iter, offset,
> +					ext4_get_block, NULL, 0);
> +		else
> +			ret = __blockdev_direct_IO(rw, iocb, inode,
> +					inode->i_sb->s_bdev, iter, offset,
> +					ext4_get_block, NULL, NULL, 0);
>  		inode_dio_done(inode);
>  	} else {
>  locked:
> -		ret = blockdev_direct_IO(rw, iocb, inode, iter,
> -				 offset, ext4_get_block);
> +		if (IS_DAX(inode))
> +			ret = dax_do_io(rw, iocb, inode, iter, offset,
> +					ext4_get_block, NULL, DIO_LOCKING);
> +		else
> +			ret = blockdev_direct_IO(rw, iocb, inode, iter,
> +					offset, ext4_get_block);
>  
>  		if (unlikely((rw & WRITE) && ret < 0)) {
>  			loff_t isize = i_size_read(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3aa26e9..542205f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -676,6 +676,18 @@ has_zeroout:
>  	return retval;
>  }
>  
> +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> +{
> +	struct inode *inode = bh->b_assoc_map->host;
> +	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */

Good question! It would be interesting to get an answer :)

> +	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> +	int err;

Missing blank line between the variable declarations and the code.

> +	if (!uptodate)
> +		return;
> +	WARN_ON(!buffer_unwritten(bh));
> +	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);

err is assigned but never checked here; that does not look good (silent failure).

> +}
> +
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -713,6 +725,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> +			bh->b_assoc_map = inode->i_mapping;
> +			bh->b_private = (void *)(unsigned long)iblock;
> +			bh->b_end_io = ext4_end_io_unwritten;
> +		}
>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
>  		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
> @@ -3043,13 +3060,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  		get_block_func = ext4_get_block_write;
>  		dio_flags = DIO_LOCKING;
>  	}
> -	ret = __blockdev_direct_IO(rw, iocb, inode,
> -				   inode->i_sb->s_bdev, iter,
> -				   offset,
> -				   get_block_func,
> -				   ext4_end_io_dio,
> -				   NULL,
> -				   dio_flags);
> +	if (IS_DAX(inode))
> +		ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
> +				ext4_end_io_dio, dio_flags);
> +	else
> +		ret = __blockdev_direct_IO(rw, iocb, inode,
> +					   inode->i_sb->s_bdev, iter, offset,
> +					   get_block_func,
> +					   ext4_end_io_dio, NULL, dio_flags);
>  
>  	/*
>  	 * Put our reference to io_end. This can free the io_end structure e.g.
> @@ -3213,19 +3231,12 @@ void ext4_set_aops(struct inode *inode)
>  		inode->i_mapping->a_ops = &ext4_aops;
>  }
>  
> -/*
> - * ext4_block_zero_page_range() zeros out a mapping of length 'length'
> - * starting from file offset 'from'.  The range to be zero'd must
> - * be contained with in one block.  If the specified range exceeds
> - * the end of the block it will be shortened to end of the block
> - * that cooresponds to 'from'
> - */
> -static int ext4_block_zero_page_range(handle_t *handle,
> +static int __ext4_block_zero_page_range(handle_t *handle,
>  		struct address_space *mapping, loff_t from, loff_t length)
>  {
>  	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
>  	unsigned offset = from & (PAGE_CACHE_SIZE-1);
> -	unsigned blocksize, max, pos;
> +	unsigned blocksize, pos;
>  	ext4_lblk_t iblock;
>  	struct inode *inode = mapping->host;
>  	struct buffer_head *bh;
> @@ -3238,14 +3249,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
>  		return -ENOMEM;
>  
>  	blocksize = inode->i_sb->s_blocksize;
> -	max = blocksize - (offset & (blocksize - 1));
> -
> -	/*
> -	 * correct length if it does not fall between
> -	 * 'from' and the end of the block
> -	 */
> -	if (length > max || length < 0)
> -		length = max;
>  
>  	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
>  
> @@ -3311,6 +3314,33 @@ unlock:
>  }
>  
>  /*
> + * ext4_block_zero_page_range() zeros out a mapping of length 'length'
> + * starting from file offset 'from'.  The range to be zero'd must
> + * be contained with in one block.  If the specified range exceeds
> + * the end of the block it will be shortened to end of the block
> + * that cooresponds to 'from'
> + */
> +static int ext4_block_zero_page_range(handle_t *handle,
> +		struct address_space *mapping, loff_t from, loff_t length)
> +{
> +	struct inode *inode = mapping->host;
> +	unsigned offset = from & (PAGE_CACHE_SIZE-1);
> +	unsigned blocksize = inode->i_sb->s_blocksize;
> +	unsigned max = blocksize - (offset & (blocksize - 1));
> +
> +	/*
> +	 * correct length if it does not fall between
> +	 * 'from' and the end of the block
> +	 */

Shouldn't a length < 0 be treated as an error instead?

> +	if (length > max || length < 0)
> +		length = max;
> +
> +	if (IS_DAX(inode))
> +		return dax_zero_page_range(inode, from, length, ext4_get_block);
> +	return __ext4_block_zero_page_range(handle, mapping, from, length);
> +}
> +
> +/*
>   * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
>   * up to the end of the block which corresponds to `from'.
>   * This required during truncate. We need to physically zero the tail end
> @@ -3831,8 +3861,10 @@ void ext4_set_inode_flags(struct inode *inode)
>  		new_fl |= S_NOATIME;
>  	if (flags & EXT4_DIRSYNC_FL)
>  		new_fl |= S_DIRSYNC;
> +	if (test_opt(inode->i_sb, DAX))
> +		new_fl |= S_DAX;
>  	inode_set_flags(inode, new_fl,
> -			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
> +			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
>  }
>  
>  /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
> @@ -4086,7 +4118,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>  
>  	if (S_ISREG(inode->i_mode)) {
>  		inode->i_op = &ext4_file_inode_operations;
> -		inode->i_fop = &ext4_file_operations;
> +		if (test_opt(inode->i_sb, DAX))
> +			inode->i_fop = &ext4_dax_file_operations;
> +		else
> +			inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(inode);
>  	} else if (S_ISDIR(inode->i_mode)) {
>  		inode->i_op = &ext4_dir_inode_operations;
> @@ -4556,7 +4591,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>  		 * Truncate pagecache after we've waited for commit
>  		 * in data=journal mode to make pages freeable.
>  		 */
> -			truncate_pagecache(inode, inode->i_size);
> +		truncate_pagecache(inode, inode->i_size);
>  	}
>  	/*
>  	 * We want to call ext4_truncate() even if attr->ia_size ==
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 603e4eb..8d744a5 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2264,7 +2264,10 @@ retry:
>  	err = PTR_ERR(inode);
>  	if (!IS_ERR(inode)) {
>  		inode->i_op = &ext4_file_inode_operations;
> -		inode->i_fop = &ext4_file_operations;
> +		if (test_opt(inode->i_sb, DAX))
> +			inode->i_fop = &ext4_dax_file_operations;
> +		else
> +			inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(inode);
>  		err = ext4_add_nondir(handle, dentry, inode);
>  		if (!err && IS_DIRSYNC(dir))
> @@ -2328,7 +2331,10 @@ retry:
>  	err = PTR_ERR(inode);
>  	if (!IS_ERR(inode)) {
>  		inode->i_op = &ext4_file_inode_operations;
> -		inode->i_fop = &ext4_file_operations;
> +		if (test_opt(inode->i_sb, DAX))
> +			inode->i_fop = &ext4_dax_file_operations;
> +		else
> +			inode->i_fop = &ext4_file_operations;
>  		ext4_set_aops(inode);
>  		d_tmpfile(dentry, inode);
>  		err = ext4_orphan_add(handle, inode);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 0b28b36..b94b6b9 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1162,7 +1162,7 @@ enum {
>  	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
>  	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
>  	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
> -	Opt_usrquota, Opt_grpquota, Opt_i_version,
> +	Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
>  	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
>  	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
>  	Opt_inode_readahead_blks, Opt_journal_ioprio,
> @@ -1224,6 +1224,7 @@ static const match_table_t tokens = {
>  	{Opt_barrier, "barrier"},
>  	{Opt_nobarrier, "nobarrier"},
>  	{Opt_i_version, "i_version"},
> +	{Opt_dax, "dax"},
>  	{Opt_stripe, "stripe=%u"},
>  	{Opt_delalloc, "delalloc"},
>  	{Opt_nodelalloc, "nodelalloc"},
> @@ -1406,6 +1407,7 @@ static const struct mount_opts {
>  	{Opt_min_batch_time, 0, MOPT_GTE0},
>  	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
>  	{Opt_init_itable, 0, MOPT_GTE0},
> +	{Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
>  	{Opt_stripe, 0, MOPT_GTE0},
>  	{Opt_resuid, 0, MOPT_GTE0},
>  	{Opt_resgid, 0, MOPT_GTE0},
> @@ -1642,6 +1644,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
>  		}
>  		sbi->s_jquota_fmt = m->mount_opt;
>  #endif
> +#ifndef CONFIG_FS_DAX
> +	} else if (token == Opt_dax) {
> +		ext4_msg(sb, KERN_INFO, "dax option not supported");
> +		return -1;
> +#endif
>  	} else {
>  		if (!args->from)
>  			arg = 1;
> @@ -3572,6 +3579,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  				 "both data=journal and dioread_nolock");
>  			goto failed_mount;
>  		}
> +		if (test_opt(sb, DAX)) {
> +			ext4_msg(sb, KERN_ERR, "can't mount with "
> +				 "both data=journal and dax");

This limitation (ext4 cannot be mounted with both data=journal and dax)
should be documented in the dax documentation.

Thanks,

Mathieu

> +			goto failed_mount;
> +		}
>  		if (test_opt(sb, DELALLOC))
>  			clear_opt(sb, DELALLOC);
>  	}
> @@ -3635,6 +3647,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  		goto failed_mount;
>  	}
>  
> +	if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
> +		if (blocksize != PAGE_SIZE) {
> +			ext4_msg(sb, KERN_ERR,
> +					"error: unsupported blocksize for dax");
> +			goto failed_mount;
> +		}
> +		if (!sb->s_bdev->bd_disk->fops->direct_access) {
> +			ext4_msg(sb, KERN_ERR,
> +					"error: device does not support dax");
> +			goto failed_mount;
> +		}
> +	}
> +
>  	if (sb->s_blocksize != blocksize) {
>  		/* Validate the filesystem blocksize */
>  		if (!sb_set_blocksize(sb, blocksize)) {
> @@ -4837,6 +4862,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
>  			err = -EINVAL;
>  			goto restore_opts;
>  		}
> +		if (test_opt(sb, DAX)) {
> +			ext4_msg(sb, KERN_ERR, "can't mount with "
> +				 "both data=journal and dax");
> +			err = -EINVAL;
> +			goto restore_opts;
> +		}
> +	}
> +
> +	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
> +		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
> +			"dax flag with busy inodes while remounting");
> +		sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
>  	}
>  
>  	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
> -- 
> 2.1.0
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>
> 
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
Key fingerprint: 2A0B 4ED9 15F2 D3FA 45F5  B162 1728 0A97 8118 6ACF
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html