Re: [PATCH 4/4] NFS: Always wait for I/O completion before unlock

Trond Myklebust <trondmy@xxxxxxxxxxxxxxx> · Fri, 17 Feb 2017 19:00:44 +0000

On Fri, 2017-02-17 at 13:46 -0500, Benjamin Coddington wrote:
> NFS attempts to wait for read and write completion before unlocking
> in
> order to ensure that the data returned was protected by the
> lock.  When
> this waiting is interrupted by a signal, the unlock may never be
> sent, and
> messages similar to the following are seen in the kernel ring buffer:
> 
> [20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:
> [20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1 fl_type=0x0
> fl_pid=20183
> [20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1 fl_type=0x0
> fl_pid=20185
> 
> For NFSv3, the missing unlock will cause the server to refuse
> conflicting
> locks indefinitely.  For NFSv4, the leftover lock will be removed by
> the
> server after the lease timeout.
> 
> This patch fixes this for NFSv3 by skipping the wait in order to
> immediately send the unlock if the FL_CLOSE flag is set when
> signaled.  For
> NFSv4, this approach may cause the server to see the I/O as arriving
> with
> an old stateid, so, for the v4 case the fix is different: the wait on
> I/O
> completion is moved into nfs4_locku_ops' rpc_call_prepare().  This
> will
> cause the sleep to happen in rpciod context, and a signal to the
> originally
> waiting process will not cause the unlock to be skipped.

NACK. I/O waits in rpciod contexts are NOT acceptable. rpciod is part
of the memory reclaim chain, so having it sleep on I/O is deadlock
prone.

Why is there a need to wait for I/O completion in the first place if
the user has killed the task that held the lock? 'kill -9' will cause
corruption; that's a fact that no amount of paper will cover over.

> 
> Signed-off-by: Benjamin Coddington <bcodding@xxxxxxxxxx>
> ---
>  fs/nfs/file.c     | 13 -------------
>  fs/nfs/nfs3proc.c | 13 +++++++++++++
>  fs/nfs/nfs4proc.c |  7 +++++++
>  fs/nfs/pagelist.c |  1 +
>  4 files changed, 21 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index a490f45df4db..adc67fe762e3 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -684,7 +684,6 @@ static int
>  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int
> is_local)
>  {
>  	struct inode *inode = filp->f_mapping->host;
> -	struct nfs_lock_context *l_ctx;
>  	int status;
>  
>  	/*
> @@ -693,18 +692,6 @@ do_unlk(struct file *filp, int cmd, struct
> file_lock *fl, int is_local)
>  	 */
>  	vfs_fsync(filp, 0);
>  
> -	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
> -	if (!IS_ERR(l_ctx)) {
> -		status = nfs_iocounter_wait(l_ctx);
> -		nfs_put_lock_context(l_ctx);
> -		if (status < 0)
> -			return status;
> -	}
> -
> -	/* NOTE: special case
> -	 * 	If we're signalled while cleaning up locks on
> process exit, we
> -	 * 	still need to complete the unlock.
> -	 */
>  	/*
>  	 * Use local locking if mounted with "-onolock" or with
> appropriate
>  	 * "-olocal_lock="
> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
> index dc925b531f32..ec3f12571c82 100644
> --- a/fs/nfs/nfs3proc.c
> +++ b/fs/nfs/nfs3proc.c
> @@ -869,6 +869,19 @@ static int
>  nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
>  {
>  	struct inode *inode = file_inode(filp);
> +	int status;
> +	struct nfs_lock_context *l_ctx;
> +
> +	/* For v3, always send the unlock on FL_CLOSE */
> +	if (fl->fl_type == F_UNLCK) {
> +		l_ctx =
> nfs_get_lock_context(nfs_file_open_context(filp));
> +		if (!IS_ERR(l_ctx)) {
> +			status = nfs_iocounter_wait(l_ctx);
> +			nfs_put_lock_context(l_ctx);
> +			if (status < 0 && !(fl->fl_flags &
> FL_CLOSE))
> +				return status;
> +		}
> +	}
>  
>  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
>  }
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 91f88bfbbe79..052b97fd653d 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5906,6 +5906,7 @@ struct nfs4_unlockdata {
>  	struct nfs_locku_res res;
>  	struct nfs4_lock_state *lsp;
>  	struct nfs_open_context *ctx;
> +	struct nfs_lock_context *l_ctx;
>  	struct file_lock fl;
>  	struct nfs_server *server;
>  	unsigned long timestamp;
> @@ -5930,6 +5931,7 @@ static struct nfs4_unlockdata
> *nfs4_alloc_unlockdata(struct file_lock *fl,
>  	atomic_inc(&lsp->ls_count);
>  	/* Ensure we don't close file until we're done freeing
> locks! */
>  	p->ctx = get_nfs_open_context(ctx);
> +	p->l_ctx = nfs_get_lock_context(ctx);
>  	memcpy(&p->fl, fl, sizeof(p->fl));
>  	p->server = NFS_SERVER(inode);
>  	return p;
> @@ -5988,6 +5990,11 @@ static void nfs4_locku_prepare(struct rpc_task
> *task, void *data)
>  		/* Note: exit _without_ running nfs4_locku_done */
>  		goto out_no_action;
>  	}
> +
> +	if (!IS_ERR(calldata->l_ctx)) {
> +		nfs_iocounter_wait(calldata->l_ctx);
> +		nfs_put_lock_context(calldata->l_ctx);
> +	}
>  	calldata->timestamp = jiffies;
>  	if (nfs4_setup_sequence(calldata->server,
>  				&calldata->arg.seq_args,
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index 6e629b856a00..8bf3cefdaa96 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -114,6 +114,7 @@ nfs_iocounter_wait(struct nfs_lock_context
> *l_ctx)
>  	return wait_on_atomic_t(&l_ctx->io_count,
> nfs_wait_atomic_killable,
>  			TASK_KILLABLE);
>  }
> +EXPORT_SYMBOL(nfs_iocounter_wait);
>  
>  /*
>   * nfs_page_group_lock - lock the head of the page group
-- 

Trond Myklebust
Principal System Architect
4300 El Camino Real | Suite 100
Los Altos, CA  94022
W: 650-422-3800
C: 801-921-4583 
www.primarydata.com

��.n��������+%������w��{.n�����{��w���jg��������ݢj����G�������j:+v���w�m������w�������h�����٥