from: Huang Le <huangle1@xxxxxx> In move_to_close_lru(), which is only called on the nfsd4 CLOSE op path, the code could wait for its stid ref count to drop to 2 while holding its state owner's replay mutex. However, the other stid ref holder (normally a parallel CLOSE op) that move_to_close_lru() is waiting for might be trying to acquire the same replay mutex, resulting in a deadlock. This patch fixes the issue by clearing the replay owner (which releases the mutex) before waiting, and assigning it back afterwards. Signed-off-by: Huang Le <huangle1@xxxxxx> --- I guess we should cc this patch to the stable tree, since a malicious client could craft parallel CLOSE ops to quickly put all nfsd tasks into D state. diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 618e660..5f6a48f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3829,12 +3829,12 @@ static void nfs4_free_openowner(struct nfs4_stateowner *so) * them before returning however. */ static void -move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) +move_to_close_lru(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *s, + struct net *net) { struct nfs4_ol_stateid *last; struct nfs4_openowner *oo = openowner(s->st_stateowner); - struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net, - nfsd_net_id); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); @@ -3846,8 +3846,19 @@ static void nfs4_free_openowner(struct nfs4_stateowner *so) * Wait for the refcount to drop to 2. Since it has been unhashed, * there should be no danger of the refcount going back up again at * this point. + * + * Before waiting, we clear cstate->replay_owner to release its + * so_replay.rp_mutex, since other reference holder might be accquiring + * the same mutex before they could drop the references. The replay_owner + * can be assigned back safely after they done their jobs. 
*/ - wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); + if (refcount_read(&s->st_stid.sc_count) != 2) { + struct nfs4_stateowner *so = cstate->replay_owner; + + nfsd4_cstate_clear_replay(cstate); + wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); + nfsd4_cstate_assign_replay(cstate, so); + } release_all_access(s); if (s->st_stid.sc_file) { @@ -5531,7 +5542,8 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac return status; } -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +static void nfsd4_close_open_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; @@ -5549,7 +5561,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); if (unhashed) - move_to_close_lru(s, clp->net); + move_to_close_lru(cstate, s, clp->net); } } @@ -5587,7 +5599,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - nfsd4_close_open_stateid(stp); + nfsd4_close_open_stateid(cstate, stp); mutex_unlock(&stp->st_mutex); /* v4.1+ suggests that we send a special stateid in here, since the