On Tue, 13 May 2014, Namjae Jeon wrote:

> Date: Tue, 13 May 2014 09:19:17 +0900
> From: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
> To: Theodore Ts'o <tytso@xxxxxxx>
> Cc: linux-ext4 <linux-ext4@xxxxxxxxxxxxxxx>,
>     Lukáš Czerner <lczerner@xxxxxxxxxx>,
>     Ashish Sangwan <a.sangwan@xxxxxxxxxxx>
> Subject: [PATCH 1/2] ext4: introduce new i_write_mutex to protect fallocate
>
> Introduce new i_write_mutex to protect new writes from coming while doing
> fallocate operations. Also, get rid of aio_mutex as it is covered by
> i_write_mutex.

I wonder what the performance impact of this change is, especially since
we're no longer taking the lock only in the unaligned aio/dio case but in
all cases (a rough way to measure that is sketched at the bottom of this
mail). Also, against what tree is this patch?

The description is quite sparse and I would like to see the reasoning for
this change, because it's completely missing!

Thanks!
-Lukas

>
> Signed-off-by: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
> Signed-off-by: Ashish Sangwan <a.sangwan@xxxxxxxxxxx>
> ---
>  fs/ext4/ext4.h    |  6 +++---
>  fs/ext4/extents.c | 18 +++++++++++++++---
>  fs/ext4/file.c    | 18 +++++++++++-------
>  fs/ext4/inode.c   |  7 ++++++-
>  fs/ext4/super.c   |  3 +--
>  5 files changed, 36 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6b45afa..77e5705 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -943,6 +943,9 @@ struct ext4_inode_info {
>
>         /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
>         __u32 i_csum_seed;
> +
> +       /* protects fallocate operations racing with new writes */
> +       struct mutex i_write_mutex;
>  };
>
>  /*
> @@ -2827,10 +2830,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
>  #define EXT4_WQ_HASH_SZ 37
>  #define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
>                                              EXT4_WQ_HASH_SZ])
> -#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
> -                                             EXT4_WQ_HASH_SZ])
>  extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
> -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
>
>  #define EXT4_RESIZING 0
>  extern int ext4_resize_begin(struct super_block *sb);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 086baa9..5262750 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4741,6 +4741,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>         if (!S_ISREG(inode->i_mode))
>                 return -EINVAL;
>
> +       mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>         /*
>          * Write out all dirty pages to avoid race conditions
>          * Then release them.
> @@ -4748,8 +4750,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>         if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>                 ret = filemap_write_and_wait_range(mapping, offset,
>                                                    offset + len - 1);
> -               if (ret)
> +               if (ret) {
> +                       mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>                         return ret;
> +               }
>         }
>
>         /*
> @@ -4761,8 +4765,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
>         start = round_up(offset, 1 << blkbits);
>         end = round_down((offset + len), 1 << blkbits);
>
> -       if (start < offset || end > offset + len)
> +       if (start < offset || end > offset + len) {
> +               mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>                 return -EINVAL;
> +       }
>         partial = (offset + len) & ((1 << blkbits) - 1);
>
>         lblk = start >> blkbits;
> @@ -4859,6 +4865,7 @@ out_dio:
>         ext4_inode_resume_unlocked_dio(inode);
>  out_mutex:
>         mutex_unlock(&inode->i_mutex);
> +       mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>         return ret;
>  }
>
> @@ -5428,11 +5435,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
>         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
>
> +       mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>         /* Call ext4_force_commit to flush all data in case of data=journal. */
>         if (ext4_should_journal_data(inode)) {
>                 ret = ext4_force_commit(inode->i_sb);
> -               if (ret)
> +               if (ret) {
> +                       mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>                         return ret;
> +               }
>         }
>
>         /*
> @@ -5518,5 +5529,6 @@ out_dio:
>         ext4_inode_resume_unlocked_dio(inode);
>  out_mutex:
>         mutex_unlock(&inode->i_mutex);
> +       mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>         return ret;
>  }
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 708aad7..557b4ac 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -93,7 +93,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>         struct file *file = iocb->ki_filp;
>         struct inode *inode = file_inode(iocb->ki_filp);
> -       struct mutex *aio_mutex = NULL;
> +       bool unaligned_direct_aio = false;
>         struct blk_plug plug;
>         int o_direct = file->f_flags & O_DIRECT;
>         int overwrite = 0;
> @@ -101,6 +101,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         ssize_t ret;
>         loff_t pos = iocb->ki_pos;
>
> +       mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>         /*
>          * Unaligned direct AIO must be serialized; see comment above
>          * In the case of O_APPEND, assume that we must always serialize
> @@ -110,8 +112,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>             !is_sync_kiocb(iocb) &&
>             (file->f_flags & O_APPEND ||
>              ext4_unaligned_aio(inode, from, pos))) {
> -               aio_mutex = ext4_aio_mutex(inode);
> -               mutex_lock(aio_mutex);
> +               unaligned_direct_aio = true;
>                 ext4_unwritten_wait(inode);
>         }
>
> @@ -143,8 +144,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         iocb->private = &overwrite;
>
>         /* check whether we do a DIO overwrite or not */
> -       if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
> -           !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
> +       if (ext4_should_dioread_nolock(inode) &&
> +           !unaligned_direct_aio && !file->f_mapping->nrpages &&
> +           pos + length <= i_size_read(inode)) {
>                 struct ext4_map_blocks map;
>                 unsigned int blkbits = inode->i_blkbits;
>                 int err, len;
> @@ -174,6 +176,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>
>         ret = __generic_file_write_iter(iocb, from);
>         mutex_unlock(&inode->i_mutex);
> +       if (!unaligned_direct_aio)
> +               mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>
>         if (ret > 0) {
>                 ssize_t err;
> @@ -186,8 +190,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 blk_finish_plug(&plug);
>
>  errout:
> -       if (aio_mutex)
> -               mutex_unlock(aio_mutex);
> +       if (unaligned_direct_aio)
> +               mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>         return ret;
>  }
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index b1dc334..d804120 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3528,6 +3528,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
>
>         trace_ext4_punch_hole(inode, offset, length, 0);
>
> +       mutex_lock(&EXT4_I(inode)->i_write_mutex);
> +
>         /*
>          * Write out all dirty pages to avoid race conditions
>          * Then release them.
> @@ -3535,8 +3537,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
>         if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
>                 ret = filemap_write_and_wait_range(mapping, offset,
>                                                    offset + length - 1);
> -               if (ret)
> +               if (ret) {
> +                       mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>                         return ret;
> +               }
>         }
>
>         mutex_lock(&inode->i_mutex);
> @@ -3637,6 +3641,7 @@ out_dio:
>         ext4_inode_resume_unlocked_dio(inode);
>  out_mutex:
>         mutex_unlock(&inode->i_mutex);
> +       mutex_unlock(&EXT4_I(inode)->i_write_mutex);
>         return ret;
>  }
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 1f8cb18..e236c85 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -904,6 +904,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
>         atomic_set(&ei->i_ioend_count, 0);
>         atomic_set(&ei->i_unwritten, 0);
>         INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
> +       mutex_init(&ei->i_write_mutex);
>
>         return &ei->vfs_inode;
>  }
> @@ -5505,7 +5506,6 @@ static void ext4_exit_feat_adverts(void)
>
>  /* Shared across all ext4 file systems */
>  wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
> -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
>
>  static int __init ext4_init_fs(void)
>  {
> @@ -5518,7 +5518,6 @@ static int __init ext4_init_fs(void)
>         ext4_check_flag_values();
>
>         for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
> -               mutex_init(&ext4__aio_mutex[i]);
>                 init_waitqueue_head(&ext4__ioend_wq[i]);
>         }
>
>
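
One more question, mostly to check that I am reading the locking right.
If I follow the patch, the resulting ordering in all of the affected paths
is (just a sketch of the call sequences from the patch above, not new
code):

    ext4_file_write_iter():
        mutex_lock(&EXT4_I(inode)->i_write_mutex);
        mutex_lock(&inode->i_mutex);
        __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);
        mutex_unlock(&EXT4_I(inode)->i_write_mutex);
            /* deferred to errout for unaligned direct AIO */

    ext4_zero_range() / ext4_collapse_range() / ext4_punch_hole():
        mutex_lock(&EXT4_I(inode)->i_write_mutex);
        mutex_lock(&inode->i_mutex);
        ... fallocate work ...
        mutex_unlock(&inode->i_mutex);
        mutex_unlock(&EXT4_I(inode)->i_write_mutex);

so i_write_mutex always nests outside i_mutex. If that is the intended
lock ordering, please spell it out in the commit description as well.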
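
As for the performance question above: the case I would worry about most
is parallel O_DIRECT overwrites of the same file, which today (with
dioread_nolock and the overwrite check in ext4_file_write_iter()) as far
as I can tell do not have to hold i_mutex for the duration of the I/O, but
with this patch will all queue up behind i_write_mutex for the whole
write. A rough, purely illustrative test program for that case is below -
the file path, thread count and iteration count are just placeholders, the
file has to live on the ext4 mount under test, and it builds with
"gcc -O2 -pthread":

    /*
     * Illustrative only: NTHREADS threads doing 4k O_DIRECT overwrites at
     * disjoint, block-aligned offsets of one small preallocated file.
     */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/time.h>
    #include <unistd.h>

    #define NTHREADS 4
    #define BLKSZ    4096
    #define WRITES   20000

    static int fd;

    static void *writer(void *arg)
    {
            long id = (long)arg;
            void *buf;
            int i;

            /* O_DIRECT needs an aligned buffer */
            if (posix_memalign(&buf, BLKSZ, BLKSZ))
                    return NULL;
            memset(buf, 'a' + id, BLKSZ);

            /* each thread overwrites its own, already-allocated block */
            for (i = 0; i < WRITES; i++)
                    if (pwrite(fd, buf, BLKSZ, (off_t)id * BLKSZ) != BLKSZ)
                            perror("pwrite");

            free(buf);
            return NULL;
    }

    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : "/mnt/ext4/dio-test.dat";
            char block[BLKSZ];
            pthread_t thr[NTHREADS];
            struct timeval t0, t1;
            long i;

            /* preallocate the blocks so the timed phase is pure overwrite */
            fd = open(path, O_CREAT | O_RDWR, 0644);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            memset(block, 'x', BLKSZ);
            for (i = 0; i < NTHREADS; i++)
                    pwrite(fd, block, BLKSZ, (off_t)i * BLKSZ);
            fsync(fd);
            close(fd);

            /* reopen with O_DIRECT for the timed overwrite phase */
            fd = open(path, O_RDWR | O_DIRECT);
            if (fd < 0) {
                    perror("open O_DIRECT");
                    return 1;
            }

            gettimeofday(&t0, NULL);
            for (i = 0; i < NTHREADS; i++)
                    pthread_create(&thr[i], NULL, writer, (void *)i);
            for (i = 0; i < NTHREADS; i++)
                    pthread_join(thr[i], NULL);
            gettimeofday(&t1, NULL);

            printf("%d threads x %d 4k O_DIRECT overwrites: %.3f s\n",
                   NTHREADS, WRITES,
                   (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6);
            close(fd);
            return 0;
    }

Comparing its runtime on kernels with and without the patch (and doing the
same with a plain single-threaded buffered write loop) would at least put
a number on the new serialization.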