Re: [PATCH 3/3] ceph: fix potential races in ceph_uninline_data

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Jeff Layton <jlayton@xxxxxxxxxx> writes:

> The current code will do the uninlining but it relies on the caller to
> set the i_inline_version appropriately afterward. There are several
> potential races here.
>
> Protect against competing uninlining attempts by having the callers
> take the i_truncate_mutex and then have them update the version
> themselves before dropping it.
>
> Other callers can then re-check the i_inline_version after acquiring the
> mutex and if it has changed to CEPH_INLINE_NONE, they can just drop it
> and do nothing.
>
> Finally since we are doing a lockless check first in all cases, just
> move that into ceph_uninline_data as well, and have the callers call
> it unconditionally.
>
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
>  fs/ceph/addr.c | 33 ++++++++++++++++++++++++---------
>  fs/ceph/file.c | 18 ++++++------------
>  2 files changed, 30 insertions(+), 21 deletions(-)
>
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 5f1e2b6577fb..e9700c997d12 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1541,11 +1541,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  
>  	ceph_block_sigs(&oldset);
>  
> -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> -		err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> -		if (err < 0)
> -			goto out_free;
> -	}
> +	err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> +	if (err < 0)
> +		goto out_free;
>  
>  	if (off + PAGE_SIZE <= size)
>  		len = PAGE_SIZE;
> @@ -1593,7 +1591,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  	    ci->i_inline_version != CEPH_INLINE_NONE) {
>  		int dirty;
>  		spin_lock(&ci->i_ceph_lock);
> -		ci->i_inline_version = CEPH_INLINE_NONE;
>  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
>  					       &prealloc_cf);
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -1656,6 +1653,10 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
>  	}
>  }
>  
> +/*
> + * We borrow the i_truncate_mutex to serialize callers that may be racing to
> + * uninline the data.
> + */
>  int ceph_uninline_data(struct inode *inode, struct page *page)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -1665,15 +1666,23 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
>  	int err = 0;
>  	bool from_pagecache = false;
>  
> -	spin_lock(&ci->i_ceph_lock);
> -	inline_version = ci->i_inline_version;
> -	spin_unlock(&ci->i_ceph_lock);
> +	/* Do a lockless check first -- paired with i_ceph_lock for changes */
> +	inline_version = READ_ONCE(ci->i_inline_version);
>  
>  	dout("uninline_data %p %llx.%llx inline_version %llu\n",
>  	     inode, ceph_vinop(inode), inline_version);
>  
>  	if (inline_version == 1 || /* initial version, no data */
>  	    inline_version == CEPH_INLINE_NONE)
> +		return 0;

We may need to call unlock_page(page) before returning here: the "out:"
path below unlocks the page when one was passed in, but this early return
skips it.

> +
> +	mutex_lock(&ci->i_truncate_mutex);
> +
> +	/* Double check the version after taking mutex */
> +	spin_lock(&ci->i_ceph_lock);
> +	inline_version = ci->i_inline_version;
> +	spin_unlock(&ci->i_ceph_lock);
> +	if (inline_version == CEPH_INLINE_NONE)
>  		goto out;
>  
>  	if (page) {
> @@ -1770,11 +1779,17 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
>  	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>  	if (!err)
>  		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> +	if (!err) {
> +		spin_lock(&ci->i_ceph_lock);
> +		inline_version = CEPH_INLINE_NONE;

Shouldn't this be ci->i_inline_version = CEPH_INLINE_NONE ?  Or maybe
both since the dout() below uses inline_version.

Cheers,
-- 
Luis

> +		spin_unlock(&ci->i_ceph_lock);
> +	}
>  out_put:
>  	ceph_osdc_put_request(req);
>  	if (err == -ECANCELED)
>  		err = 0;
>  out:
> +	mutex_unlock(&ci->i_truncate_mutex);
>  	if (page) {
>  		unlock_page(page);
>  		if (from_pagecache)
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 7bb090fa99d3..3ff83135562c 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1438,11 +1438,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  
>  	inode_inc_iversion_raw(inode);
>  
> -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> -		err = ceph_uninline_data(inode, NULL);
> -		if (err < 0)
> -			goto out;
> -	}
> +	err = ceph_uninline_data(inode, NULL);
> +	if (err < 0)
> +		goto out;
>  
>  	/* FIXME: not complete since it doesn't account for being at quota */
>  	if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) {
> @@ -1513,7 +1511,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		int dirty;
>  
>  		spin_lock(&ci->i_ceph_lock);
> -		ci->i_inline_version = CEPH_INLINE_NONE;
>  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
>  					       &prealloc_cf);
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -1762,11 +1759,9 @@ static long ceph_fallocate(struct file *file, int mode,
>  		goto unlock;
>  	}
>  
> -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> -		ret = ceph_uninline_data(inode, NULL);
> -		if (ret < 0)
> -			goto unlock;
> -	}
> +	ret = ceph_uninline_data(inode, NULL);
> +	if (ret < 0)
> +		goto unlock;
>  
>  	size = i_size_read(inode);
>  
> @@ -1790,7 +1785,6 @@ static long ceph_fallocate(struct file *file, int mode,
>  
>  	if (!ret) {
>  		spin_lock(&ci->i_ceph_lock);
> -		ci->i_inline_version = CEPH_INLINE_NONE;
>  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
>  					       &prealloc_cf);
>  		spin_unlock(&ci->i_ceph_lock);



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux