On 17 Feb 2017, at 14:00, Trond Myklebust wrote:

> On Fri, 2017-02-17 at 13:46 -0500, Benjamin Coddington wrote:
>> NFS attempts to wait for read and write completion before unlocking in
>> order to ensure that the data returned was protected by the lock. When
>> this waiting is interrupted by a signal, the unlock may never be sent, and
>> messages similar to the following are seen in the kernel ring buffer:
>>
>> [20.167876] Leaked locks on dev=0x0:0x2b ino=0x8dd4c3:
>> [20.168286] POSIX: fl_owner=ffff880078b06940 fl_flags=0x1 fl_type=0x0 fl_pid=20183
>> [20.168727] POSIX: fl_owner=ffff880078b06680 fl_flags=0x1 fl_type=0x0 fl_pid=20185
>>
>> For NFSv3, the missing unlock will cause the server to refuse conflicting
>> locks indefinitely. For NFSv4, the leftover lock will be removed by the
>> server after the lease timeout.
>>
>> This patch fixes this for NFSv3 by skipping the wait in order to
>> immediately send the unlock if the FL_CLOSE flag is set when signaled. For
>> NFSv4, this approach may cause the server to see the I/O as arriving with
>> an old stateid, so for the v4 case the fix is different: the wait on I/O
>> completion is moved into nfs4_locku_ops' rpc_call_prepare(). This will
>> cause the sleep to happen in rpciod context, and a signal to the originally
>> waiting process will not cause the unlock to be skipped.
>
> NACK. I/O waits in rpciod contexts are NOT acceptable. rpciod is part
> of the memory reclaim chain, so having it sleep on I/O is deadlock
> prone.
>
> Why is there a need to wait for I/O completion in the first place if
> the user has killed the task that held the lock? 'kill -9' will cause
> corruption; that's a fact that no amount of paper will cover over.

To avoid an unnecessary recovery situation where the server asks us to
resend I/O due to an invalid stateid. I'm fine with skipping the wait on
signaled FL_CLOSE if that's an acceptable trade-off. I can send a v3.
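For instance, something along these lines in do_unlk() (an untested sketch
only, not the actual v3; it just reuses nfs_iocounter_wait() and the
FL_CLOSE check already in this patch), so an interrupted wait on close no
longer skips the unlock:

	/* Sketch only: wait for I/O under the lock to drain, but if we're
	 * signalled while cleaning up locks on process exit (FL_CLOSE),
	 * still send the unlock rather than leaking the lock on the server.
	 */
	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
	if (!IS_ERR(l_ctx)) {
		status = nfs_iocounter_wait(l_ctx);
		nfs_put_lock_context(l_ctx);
		if (status < 0 && !(fl->fl_flags & FL_CLOSE))
			return status;
	}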
Ben

>>
>> Signed-off-by: Benjamin Coddington <bcodding@xxxxxxxxxx>
>> ---
>>  fs/nfs/file.c     | 13 -------------
>>  fs/nfs/nfs3proc.c | 13 +++++++++++++
>>  fs/nfs/nfs4proc.c |  7 +++++++
>>  fs/nfs/pagelist.c |  1 +
>>  4 files changed, 21 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index a490f45df4db..adc67fe762e3 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -684,7 +684,6 @@ static int
>>  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
>>  {
>>  	struct inode *inode = filp->f_mapping->host;
>> -	struct nfs_lock_context *l_ctx;
>>  	int status;
>>
>>  	/*
>> @@ -693,18 +692,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
>>  	 */
>>  	vfs_fsync(filp, 0);
>>
>> -	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
>> -	if (!IS_ERR(l_ctx)) {
>> -		status = nfs_iocounter_wait(l_ctx);
>> -		nfs_put_lock_context(l_ctx);
>> -		if (status < 0)
>> -			return status;
>> -	}
>> -
>> -	/* NOTE: special case
>> -	 * 	If we're signalled while cleaning up locks on process exit, we
>> -	 * 	still need to complete the unlock.
>> -	 */
>>  	/*
>>  	 * Use local locking if mounted with "-onolock" or with appropriate
>>  	 * "-olocal_lock="
>> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
>> index dc925b531f32..ec3f12571c82 100644
>> --- a/fs/nfs/nfs3proc.c
>> +++ b/fs/nfs/nfs3proc.c
>> @@ -869,6 +869,19 @@ static int
>>  nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
>>  {
>>  	struct inode *inode = file_inode(filp);
>> +	int status;
>> +	struct nfs_lock_context *l_ctx;
>> +
>> +	/* For v3, always send the unlock on FL_CLOSE */
>> +	if (fl->fl_type == F_UNLCK) {
>> +		l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
>> +		if (!IS_ERR(l_ctx)) {
>> +			status = nfs_iocounter_wait(l_ctx);
>> +			nfs_put_lock_context(l_ctx);
>> +			if (status < 0 && !(fl->fl_flags & FL_CLOSE))
>> +				return status;
>> +		}
>> +	}
>>
>>  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
>>  }
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index 91f88bfbbe79..052b97fd653d 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -5906,6 +5906,7 @@ struct nfs4_unlockdata {
>>  	struct nfs_locku_res res;
>>  	struct nfs4_lock_state *lsp;
>>  	struct nfs_open_context *ctx;
>> +	struct nfs_lock_context *l_ctx;
>>  	struct file_lock fl;
>>  	struct nfs_server *server;
>>  	unsigned long timestamp;
>> @@ -5930,6 +5931,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
>>  	atomic_inc(&lsp->ls_count);
>>  	/* Ensure we don't close file until we're done freeing locks! */
>>  	p->ctx = get_nfs_open_context(ctx);
>> +	p->l_ctx = nfs_get_lock_context(ctx);
>>  	memcpy(&p->fl, fl, sizeof(p->fl));
>>  	p->server = NFS_SERVER(inode);
>>  	return p;
>> @@ -5988,6 +5990,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
>>  		/* Note: exit _without_ running nfs4_locku_done */
>>  		goto out_no_action;
>>  	}
>> +
>> +	if (!IS_ERR(calldata->l_ctx)) {
>> +		nfs_iocounter_wait(calldata->l_ctx);
>> +		nfs_put_lock_context(calldata->l_ctx);
>> +	}
>>  	calldata->timestamp = jiffies;
>>  	if (nfs4_setup_sequence(calldata->server,
>>  				&calldata->arg.seq_args,
>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> index 6e629b856a00..8bf3cefdaa96 100644
>> --- a/fs/nfs/pagelist.c
>> +++ b/fs/nfs/pagelist.c
>> @@ -114,6 +114,7 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
>>  	return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
>>  			TASK_KILLABLE);
>>  }
>> +EXPORT_SYMBOL(nfs_iocounter_wait);
>>
>>  /*
>>   * nfs_page_group_lock - lock the head of the page group
> --
> Trond Myklebust
> Principal System Architect
> 4300 El Camino Real | Suite 100
> Los Altos, CA 94022
> W: 650-422-3800
> C: 801-921-4583
> www.primarydata.com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html