Hi Dai- Some comments and questions below: > On Dec 6, 2021, at 12:59 PM, Dai Ngo <dai.ngo@xxxxxxxxxx> wrote: > > Currently an NFSv4 client must maintain its lease by using the at least > one of the state tokens or if nothing else, by issuing a RENEW (4.0), or > a singleton SEQUENCE (4.1) at least once during each lease period. If the > client fails to renew the lease, for any reason, the Linux server expunges > the state tokens immediately upon detection of the "failure to renew the > lease" condition and begins returning NFS4ERR_EXPIRED if the client should > reconnect and attempt to use the (now) expired state. > > The default lease period for the Linux server is 90 seconds. The typical > client cuts that in half and will issue a lease renewing operation every > 45 seconds. The 90 second lease period is very short considering the > potential for moderately long term network partitions. A network partition > refers to any loss of network connectivity between the NFS client and the > NFS server, regardless of its root cause. This includes NIC failures, NIC > driver bugs, network misconfigurations & administrative errors, routers & > switches crashing and/or having software updates applied, even down to > cables being physically pulled. In most cases, these network failures are > transient, although the duration is unknown. > > A server which does not immediately expunge the state on lease expiration > is known as a Courteous Server. A Courteous Server continues to recognize > previously generated state tokens as valid until conflict arises between > the expired state and the requests from another client, or the server > reboots. > > The initial implementation of the Courteous Server will do the following: > > . when the laundromat thread detects an expired client and if that client > still has established states on the Linux server and there is no waiters > for the client's locks then mark the client as a COURTESY_CLIENT and skip > destroying the client and all its states, otherwise destroy the client as > usual. > > . detects conflict of OPEN request with COURTESY_CLIENT, destroys the > expired client and all its states, skips the delegation recall then allows > the conflicting request to succeed. > > . detects conflict of LOCK/LOCKT, NLM LOCK and TEST, and local locks > requests with COURTESY_CLIENT, destroys the expired client and all its > states then allows the conflicting request to succeed. > > . detects conflict of LOCK/LOCKT, NLM LOCK and TEST, and local locks > requests with COURTESY_CLIENT, destroys the expired client and all its > states then allows the conflicting request to succeed. > > Signed-off-by: Dai Ngo <dai.ngo@xxxxxxxxxx> > --- > fs/nfsd/nfs4state.c | 293 +++++++++++++++++++++++++++++++++++++++++++++++++++- > fs/nfsd/state.h | 3 + > 2 files changed, 293 insertions(+), 3 deletions(-) > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 3f4027a5de88..759f61dc6685 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -125,6 +125,11 @@ static void free_session(struct nfsd4_session *); > static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; > static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; > > +static struct workqueue_struct *laundry_wq; > +static void laundromat_main(struct work_struct *); > + > +static int courtesy_client_expiry = (24 * 60 * 60); /* in secs */ > + > static bool is_session_dead(struct nfsd4_session *ses) > { > return ses->se_flags & NFS4_SESSION_DEAD; > @@ -172,6 +177,7 @@ renew_client_locked(struct nfs4_client *clp) > > list_move_tail(&clp->cl_lru, &nn->client_lru); > clp->cl_time = ktime_get_boottime_seconds(); > + clear_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags); > } > > static void put_client_renew_locked(struct nfs4_client *clp) > @@ -2389,6 +2395,10 @@ static int client_info_show(struct seq_file *m, void *v) > seq_puts(m, "status: confirmed\n"); > else > seq_puts(m, "status: unconfirmed\n"); > + seq_printf(m, "courtesy client: %s\n", > + test_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags) ? "yes" : "no"); > + seq_printf(m, "seconds from last renew: %lld\n", > + ktime_get_boottime_seconds() - clp->cl_time); > seq_printf(m, "name: "); > seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); > seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); > @@ -4662,6 +4672,33 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) > nfsd4_run_cb(&dp->dl_recall); > } > > +/* > + * This function is called when a file is opened and there is a > + * delegation conflict with another client. If the other client > + * is a courtesy client then kick start the laundromat to destroy > + * it. > + */ > +static bool > +nfsd_check_courtesy_client(struct nfs4_delegation *dp) > +{ > + struct svc_rqst *rqst; > + struct nfs4_client *clp = dp->dl_recall.cb_clp; > + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); > + > + if (!i_am_nfsd()) > + goto out; > + rqst = kthread_data(current); > + if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4) > + return false; > +out: > + if (test_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags)) { > + set_bit(NFSD4_DESTROY_COURTESY_CLIENT, &clp->cl_flags); > + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); > + return true; > + } > + return false; > +} > + > /* Called from break_lease() with i_lock held. */ > static bool > nfsd_break_deleg_cb(struct file_lock *fl) > @@ -4670,6 +4707,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) > struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; > struct nfs4_file *fp = dp->dl_stid.sc_file; > > + if (nfsd_check_courtesy_client(dp)) > + return false; > trace_nfsd_cb_recall(&dp->dl_stid); > > /* > @@ -4912,6 +4951,136 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, > return nfsd_setattr(rqstp, fh, &iattr, 0, (time64_t)0); > } > > +static bool > +__nfs4_check_deny_bmap(struct nfs4_ol_stateid *stp, u32 access, > + bool share_access) > +{ > + if (share_access) { > + if (!stp->st_deny_bmap) > + return false; > + > + if ((stp->st_deny_bmap & (1 << NFS4_SHARE_DENY_BOTH)) || Aren't the NFS4_SHARE_DENY macros already bit masks? NFS4_SHARE_DENY_BOTH is (NFS4_SHARE_DENY_READ | NFS4_SHARE_DENY_WRITE). > + (access & NFS4_SHARE_ACCESS_READ && > + stp->st_deny_bmap & (1 << NFS4_SHARE_DENY_READ)) || > + (access & NFS4_SHARE_ACCESS_WRITE && > + stp->st_deny_bmap & (1 << NFS4_SHARE_DENY_WRITE))) { > + return true; > + } > + return false; > + } > + if ((access & NFS4_SHARE_DENY_BOTH) || > + (access & NFS4_SHARE_DENY_READ && > + stp->st_access_bmap & (1 << NFS4_SHARE_ACCESS_READ)) || > + (access & NFS4_SHARE_DENY_WRITE && > + stp->st_access_bmap & (1 << NFS4_SHARE_ACCESS_WRITE))) { > + return true; > + } > + return false; > +} > + > +/* > + * access: if share_access is true then check access mode else check deny mode > + */ > +static bool > +nfs4_check_deny_bmap(struct nfs4_client *clp, struct nfs4_file *fp, > + struct nfs4_ol_stateid *st, u32 access, bool share_access) > +{ > + int i; > + struct nfs4_openowner *oo; > + struct nfs4_stateowner *so, *tmp; > + struct nfs4_ol_stateid *stp, *stmp; > + > + spin_lock(&clp->cl_lock); > + for (i = 0; i < OWNER_HASH_SIZE; i++) { > + list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], > + so_strhash) { > + if (!so->so_is_open_owner) > + continue; > + oo = openowner(so); > + list_for_each_entry_safe(stp, stmp, > + &oo->oo_owner.so_stateids, st_perstateowner) { > + if (stp == st || stp->st_stid.sc_file != fp) > + continue; > + if (__nfs4_check_deny_bmap(stp, access, > + share_access)) { > + spin_unlock(&clp->cl_lock); > + return true; > + } > + } > + } > + } > + spin_unlock(&clp->cl_lock); > + return false; > +} > + > +/* > + * Function to check if the nfserr_share_denied error for 'fp' resulted > + * from conflict with courtesy clients then release their state to resolve > + * the conflict. > + * > + * Function returns: > + * 0 - no conflict with courtesy clients > + * >0 - conflict with courtesy clients resolved, try access/deny check again > + * -1 - conflict with courtesy clients being resolved in background > + * return nfserr_jukebox to NFS client > + */ > +static int > +nfs4_destroy_clnts_with_sresv_conflict(struct svc_rqst *rqstp, > + struct nfs4_file *fp, struct nfs4_ol_stateid *stp, > + u32 access, bool share_access) > +{ > + int cnt = 0; > + int async_cnt = 0; > + bool no_retry = false; > + struct nfs4_client *cl; > + struct list_head *pos, *next, reaplist; > + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); > + > + INIT_LIST_HEAD(&reaplist); > + spin_lock(&nn->client_lock); > + list_for_each_safe(pos, next, &nn->client_lru) { > + cl = list_entry(pos, struct nfs4_client, cl_lru); > + /* > + * check all nfs4_ol_stateid of this client > + * for conflicts with 'access'mode. > + */ > + if (nfs4_check_deny_bmap(cl, fp, stp, access, share_access)) { > + if (!test_bit(NFSD4_COURTESY_CLIENT, &cl->cl_flags)) { > + /* conflict with non-courtesy client */ > + no_retry = true; > + cnt = 0; > + goto out; > + } > + /* > + * if too many to resolve synchronously > + * then do the rest in background > + */ > + if (cnt > 100) { > + set_bit(NFSD4_DESTROY_COURTESY_CLIENT, &cl->cl_flags); > + async_cnt++; > + continue; > + } > + if (mark_client_expired_locked(cl)) > + continue; > + cnt++; > + list_add(&cl->cl_lru, &reaplist); > + } > + } Bruce suggested simply returning NFS4ERR_DELAY for all cases. That would simplify this quite a bit for what is a rare edge case. > +out: > + spin_unlock(&nn->client_lock); > + list_for_each_safe(pos, next, &reaplist) { > + cl = list_entry(pos, struct nfs4_client, cl_lru); > + list_del_init(&cl->cl_lru); > + expire_client(cl); > + } A slightly nicer construct here would be something like this: while ((pos = list_del_first(&reaplist))) expire_client(list_entry(pos, struct nfs4_client, cl_lru)); > + if (async_cnt) { > + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); > + if (!no_retry) > + cnt = -1; > + } > + return cnt; > +} > + > static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, > struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, > struct nfsd4_open *open) > @@ -4921,6 +5090,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, > int oflag = nfs4_access_to_omode(open->op_share_access); > int access = nfs4_access_to_access(open->op_share_access); > unsigned char old_access_bmap, old_deny_bmap; > + int cnt = 0; > > spin_lock(&fp->fi_lock); > > @@ -4928,16 +5098,38 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, > * Are we trying to set a deny mode that would conflict with > * current access? > */ > +chk_deny: > status = nfs4_file_check_deny(fp, open->op_share_deny); > if (status != nfs_ok) { > spin_unlock(&fp->fi_lock); > + if (status != nfserr_share_denied) > + goto out; > + cnt = nfs4_destroy_clnts_with_sresv_conflict(rqstp, fp, > + stp, open->op_share_deny, false); > + if (cnt > 0) { > + spin_lock(&fp->fi_lock); > + goto chk_deny; I'm pondering whether a distributed set of clients can cause this loop to never terminate. > + } > + if (cnt == -1) > + status = nfserr_jukebox; > goto out; > } > > /* set access to the file */ > +get_access: > status = nfs4_file_get_access(fp, open->op_share_access); > if (status != nfs_ok) { > spin_unlock(&fp->fi_lock); > + if (status != nfserr_share_denied) > + goto out; > + cnt = nfs4_destroy_clnts_with_sresv_conflict(rqstp, fp, > + stp, open->op_share_access, true); > + if (cnt > 0) { > + spin_lock(&fp->fi_lock); > + goto get_access; Ditto. > + } > + if (cnt == -1) > + status = nfserr_jukebox; > goto out; > } > > @@ -5289,6 +5481,22 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, > */ > } > > +static bool > +nfs4_destroy_courtesy_client(struct nfs4_client *clp) > +{ > + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); > + > + spin_lock(&nn->client_lock); > + if (!test_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags) || > + mark_client_expired_locked(clp)) { > + spin_unlock(&nn->client_lock); > + return false; > + } > + spin_unlock(&nn->client_lock); > + expire_client(clp); > + return true; > +} > + Perhaps nfs4_destroy_courtesy_client() could be merged into nfsd4_fl_expire_lock(), it's only caller. > __be32 > nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) > { > @@ -5572,6 +5780,47 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) > } > #endif > > +static > +bool nfs4_anylock_conflict(struct nfs4_client *clp) > +{ > + int i; > + struct nfs4_stateowner *so, *tmp; > + struct nfs4_lockowner *lo; > + struct nfs4_ol_stateid *stp; > + struct nfs4_file *nf; > + struct inode *ino; > + struct file_lock_context *ctx; > + struct file_lock *fl; > + > + for (i = 0; i < OWNER_HASH_SIZE; i++) { > + /* scan each lock owner */ > + list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], > + so_strhash) { > + if (so->so_is_open_owner) > + continue; Isn't cl_lock needed to protect the cl_ownerstr_hashtbl lists? > + > + /* scan lock states of this lock owner */ > + lo = lockowner(so); > + list_for_each_entry(stp, &lo->lo_owner.so_stateids, > + st_perstateowner) { > + nf = stp->st_stid.sc_file; > + ino = nf->fi_inode; > + ctx = ino->i_flctx; > + if (!ctx) > + continue; > + /* check each lock belongs to this lock state */ > + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { > + if (fl->fl_owner != lo) > + continue; > + if (!list_empty(&fl->fl_blocked_requests)) > + return true; > + } > + } > + } > + } > + return false; > +} > + > static time64_t > nfs4_laundromat(struct nfsd_net *nn) > { > @@ -5587,7 +5836,9 @@ nfs4_laundromat(struct nfsd_net *nn) > }; > struct nfs4_cpntf_state *cps; > copy_stateid_t *cps_t; > + struct nfs4_stid *stid; > int i; > + int id = 0; > > if (clients_still_reclaiming(nn)) { > lt.new_timeo = 0; > @@ -5608,8 +5859,33 @@ nfs4_laundromat(struct nfsd_net *nn) > spin_lock(&nn->client_lock); > list_for_each_safe(pos, next, &nn->client_lru) { > clp = list_entry(pos, struct nfs4_client, cl_lru); > + if (test_bit(NFSD4_DESTROY_COURTESY_CLIENT, &clp->cl_flags)) { > + clear_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags); > + goto exp_client; > + } > + if (test_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags)) { > + if (ktime_get_boottime_seconds() >= clp->courtesy_client_expiry) > + goto exp_client; > + /* > + * after umount, v4.0 client is still > + * around waiting to be expired > + */ > + if (clp->cl_minorversion) > + continue; > + } > if (!state_expired(<, clp->cl_time)) > break; > + spin_lock(&clp->cl_lock); > + stid = idr_get_next(&clp->cl_stateids, &id); > + spin_unlock(&clp->cl_lock); > + if (stid && !nfs4_anylock_conflict(clp)) { > + /* client still has states */ > + clp->courtesy_client_expiry = > + ktime_get_boottime_seconds() + courtesy_client_expiry; > + set_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags); > + continue; > + } > +exp_client: > if (mark_client_expired_locked(clp)) > continue; > list_add(&clp->cl_lru, &reaplist); > @@ -5689,9 +5965,6 @@ nfs4_laundromat(struct nfsd_net *nn) > return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); > } > > -static struct workqueue_struct *laundry_wq; > -static void laundromat_main(struct work_struct *); > - > static void > laundromat_main(struct work_struct *laundry) > { > @@ -6496,6 +6769,19 @@ nfs4_transform_lock_offset(struct file_lock *lock) > lock->fl_end = OFFSET_MAX; > } > > +/* return true if lock was expired else return false */ > +static bool > +nfsd4_fl_expire_lock(struct file_lock *fl, bool testonly) > +{ > + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; > + struct nfs4_client *clp = lo->lo_owner.so_client; > + > + if (testonly) > + return test_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags) ? > + true : false; Hm. I know test_bit() returns an integer rather than a boolean, but the ternary here is a bit unwieldy. How about just: return !!test_bit(NFSD4_COURTESY_CLIENT, &clp->cl_flags); > + return nfs4_destroy_courtesy_client(clp); > +} > + > static fl_owner_t > nfsd4_fl_get_owner(fl_owner_t owner) > { > @@ -6543,6 +6829,7 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { > .lm_notify = nfsd4_lm_notify, > .lm_get_owner = nfsd4_fl_get_owner, > .lm_put_owner = nfsd4_fl_put_owner, > + .lm_expire_lock = nfsd4_fl_expire_lock, > }; > > static inline void > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index e73bdbb1634a..93e30b101578 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -345,6 +345,8 @@ struct nfs4_client { > #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ > #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ > 1 << NFSD4_CLIENT_CB_KILL) > +#define NFSD4_COURTESY_CLIENT (6) /* be nice to expired client */ > +#define NFSD4_DESTROY_COURTESY_CLIENT (7) > unsigned long cl_flags; > const struct cred *cl_cb_cred; > struct rpc_clnt *cl_cb_client; > @@ -385,6 +387,7 @@ struct nfs4_client { > struct list_head async_copies; /* list of async copies */ > spinlock_t async_lock; /* lock for async copies */ > atomic_t cl_cb_inflight; /* Outstanding callbacks */ > + int courtesy_client_expiry; > }; > > /* struct nfs4_client_reset > -- > 2.9.5 > -- Chuck Lever