Forgetful client model: If we receive a CB_LAYOUTRECALL: - we spawn a thread to handle the recall (XXX: for now only one recall can be active at a time; otherwise we return NFS4ERR_DELAY) - we check the stateid seqid; if it does not match, we return NFS4ERR_DELAY - we check for pending I/O; if there is any, we return NFS4ERR_DELAY. Otherwise we return NFS4ERR_NOMATCHING_LAYOUT. Note that for whole-file layouts there is no need to serialize LAYOUTGETs/LAYOUTRETURNs. For bulk layouts, if there is a layout active, we return NFS4_OK and start cleaning the layouts asynchronously. At the end we send a bulk LAYOUTRETURN. Note that there is no need to prevent any new LAYOUTGETs explicitly, as the server should reject them. Signed-off-by: Alexandros Batsakis <batsakis@xxxxxxxxxx> --- fs/nfs/callback_proc.c | 131 ++++++++++++++++++++++++++++++++++-------------- fs/nfs/inode.c | 2 +- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 4 +- fs/nfs/nfs4state.c | 2 +- fs/nfs/pnfs.c | 78 +++++++++++++--------------- fs/nfs/pnfs.h | 8 ++- 7 files changed, 139 insertions(+), 87 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 24e2571..419fee7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -129,6 +129,38 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf #if defined(CONFIG_NFS_V4_1) +static int +pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo, + const nfs4_stateid stateid) +{ + int seq; + int res; + u32 oldseqid, newseqid; + + do { + seq = read_seqbegin(&lo->seqlock); + oldseqid = ntohl(lo->stateid.u.stateid.seqid); + newseqid = ntohl(stateid.u.stateid.seqid); + res = !memcmp(lo->stateid.u.stateid.other, + stateid.u.stateid.other, + NFS4_STATEID_OTHER_SIZE); + if (res) { /* comparing layout stateids */ + if (oldseqid == ~0) + res = (newseqid == 1); + else + res = (newseqid == oldseqid + 1); + } else { /* open stateid */ + res = !memcmp(lo->stateid.u.data, + &zero_stateid, + NFS4_STATEID_SIZE); + if (res) + res = (newseqid == 1); + } 
+ } while (read_seqretry(&lo->seqlock, seq)); + + return res; +} + /* * Retrieve an inode based on layout recall parameters * @@ -190,10 +222,11 @@ static int pnfs_recall_layout(void *data) { struct inode *inode, *ino; struct nfs_client *clp; + struct nfs4_pnfs_layoutreturn *lrp; struct cb_pnfs_layoutrecallargs rl; struct recall_layout_threadargs *args = (struct recall_layout_threadargs *)data; - int status; + int status = 0; daemonize("nfsv4-layoutreturn"); @@ -204,46 +237,57 @@ static int pnfs_recall_layout(void *data) clp = args->clp; inode = args->inode; rl = *args->rl; - args->result = 0; - complete(&args->started); - args = NULL; - /* Note: args must not be used after this point!!! */ - -/* FIXME: need barrier here: - pause I/O to data servers - pause layoutgets - drain all outstanding writes to storage devices - wait for any outstanding layoutreturns and layoutgets mentioned in - cb_sequence. - then return layouts, resume after layoutreturns complete - */ /* support whole file layouts only */ rl.cbl_seg.offset = 0; rl.cbl_seg.length = NFS4_MAX_UINT64; if (rl.cbl_recall_type == RETURN_FILE) { - status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid, - RETURN_FILE); + if (pnfs_is_next_layout_stateid(&NFS_I(inode)->layout, rl.cbl_stateid)) + status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid, + RETURN_FILE, 0); + else + status = htonl(NFS4ERR_DELAY); + /* forgetful client */ if (status) - dprintk("%s RETURN_FILE error: %d\n", __func__, status); + dprintk("%s RETURN_FILE : %d\n", __func__, status); + else + status = htonl(NFS4ERR_NOMATCHING_LAYOUT); + args->result = status; + complete(&args->started); goto out; } - /* FIXME: This loop is inefficient, running in O(|s_inodes|^2) */ + status = htonl(NFS4_OK); + args->result = status; + complete(&args->started); + args = NULL; + while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { /* XXX need to check status on pnfs_return_layout */ - pnfs_return_layout(ino, &rl.cbl_seg, NULL, 
RETURN_FILE); + pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, 0); iput(ino); } - /* send final layoutreturn */ - status = pnfs_return_layout(inode, &rl.cbl_seg, NULL, rl.cbl_recall_type); - if (status) - printk(KERN_INFO "%s: ignoring pnfs_return_layout status=%d\n", - __func__, status); + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (!lrp) { + dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", + __func__); + goto out; + } + + /* send final layoutreturn (bulk) */ + lrp->args.reclaim = 0; + lrp->args.layout_type = rl.cbl_layout_type; + lrp->args.return_type = rl.cbl_recall_type; + lrp->args.lseg = rl.cbl_seg; + lrp->args.inode = inode; + lrp->lo = NULL; + + pnfs4_proc_layoutreturn(lrp); out: - iput(inode); + clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); + nfs_put_client(clp); module_put_and_exit(0); dprintk("%s: exit status %d\n", __func__, 0); return 0; @@ -261,15 +305,18 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, .rl = rl, }; struct task_struct *t; - int status; - - /* should have returned NFS4ERR_NOMATCHING_LAYOUT... 
*/ - BUG_ON(inode == NULL); + int status = htonl(NFS4ERR_DELAY); dprintk("%s: -->\n", __func__); + /* XXX: do not allow two concurrent layout recalls */ + if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) + return status; + init_completion(&data.started); __module_get(THIS_MODULE); + if (!atomic_inc_not_zero(&clp->cl_count)) + goto out_put_no_client; t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); if (IS_ERR(t)) { @@ -283,6 +330,9 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, wait_for_completion(&data.started); return data.result; out_module_put: + nfs_put_client(clp); +out_put_no_client: + clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); module_put(THIS_MODULE); return status; } @@ -309,19 +359,24 @@ __be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, do { struct nfs_client *prev = clp; num_client++; - inode = nfs_layoutrecall_find_inode(clp, args); - if (inode != NULL) { - if (PNFS_LD(&NFS_I(inode)->layout)->id == - args->cbl_layout_type) { - /* Set up a helper thread to actually - * return the delegation */ + /* the callback must come from the MDS personality */ + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + goto loop; + if (args->cbl_recall_type == RETURN_FILE) { + inode = nfs_layoutrecall_find_inode(clp, args); + if (inode != NULL) { res = pnfs_async_return_layout(clp, inode, args); - if (res != 0) - res = htonl(NFS4ERR_RESOURCE); - break; + iput(inode); } + } else { + /* we need the inode to get the nfs_server struct */ + inode = nfs_layoutrecall_find_inode(clp, args); + if (!inode) + goto loop; + res = pnfs_async_return_layout(clp, inode, args); iput(inode); } +loop: clp = nfs_find_client_next(prev); nfs_put_client(prev); } while (clp != NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index adf3280..584c9b8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1321,7 +1321,7 @@ void nfs4_clear_inode(struct inode *inode) /* First call standard NFS 
clear_inode() code */ nfs_clear_inode(inode); #ifdef CONFIG_NFS_V4_1 - pnfs_return_layout(inode, NULL, NULL, RETURN_FILE); + pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, 1); #endif /* CONFIG_NFS_V4_1 */ } #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 40ebe1d..90c57b3 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -47,6 +47,7 @@ enum nfs4_client_state { NFS4CLNT_SESSION_RESET, NFS4CLNT_SESSION_DRAINING, NFS4CLNT_RECALL_SLOT, + NFS4CLNT_LAYOUT_RECALL, }; /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6739d15..14e90e1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1085,7 +1085,7 @@ static void pnfs4_layout_reclaim(struct nfs4_state *state) /* FIXME: send gratuitous layout commits and return with the reclaim * flag during grace period */ - pnfs_return_layout(state->inode, NULL, &state->open_stateid, RETURN_FILE); + pnfs_return_layout(state->inode, NULL, NULL, RETURN_FILE, 1); pnfs_set_layout_stateid(&NFS_I(state->inode)->layout, &zero_stateid); #endif /* CONFIG_NFS_V4_1 */ } @@ -2382,7 +2382,7 @@ pnfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, if (pnfs_enabled_sb(server) && has_layout(nfsi) && pnfs_ld_layoutret_on_setattr(server->pnfs_curr_ld)) - pnfs_return_layout(inode, NULL, NULL, RETURN_FILE); + pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, 1); return nfs4_proc_setattr(dentry, fattr, sattr); } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2d52300..cd2ae83 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -598,7 +598,7 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm range.offset = 0; range.length = NFS4_MAX_UINT64; pnfs_return_layout(state->inode, &range, NULL, - RETURN_FILE); + RETURN_FILE, 1); } #endif /* CONFIG_NFS_V4_1 */ nfs4_do_close(path, state, wait); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3b4dea0..2c96723 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -610,7 +610,7 @@ 
get_layout(struct inode *ino, */ static inline int should_free_lseg(struct pnfs_layout_segment *lseg, - struct nfs4_pnfs_layout_segment *range) + struct nfs4_pnfs_layout_segment *range) { return (range->iomode == IOMODE_ANY || lseg->range.iomode == range->iomode) && @@ -703,6 +703,8 @@ return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, dprintk("--> %s\n", __func__); + BUG_ON(type != RETURN_FILE); + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (lrp == NULL) { if (lo && (type == RETURN_FILE)) @@ -729,7 +731,8 @@ out: int _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, const nfs4_stateid *stateid, /* optional */ - enum pnfs_layoutreturn_type type) + enum pnfs_layoutreturn_type type, + int sendreturn) { struct pnfs_layout_type *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); @@ -739,14 +742,19 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, dprintk("--> %s type %d\n", __func__, type); if (range) - arg = *range; - else { + arg.iomode = range->iomode; + else arg.iomode = IOMODE_ANY; - arg.offset = 0; - arg.length = ~0; - } + arg.offset = 0; + arg.length = ~0; + if (type == RETURN_FILE) { if (nfsi->layout.layoutcommit_ctx) { + if (stateid && !sendreturn) { /* callback */ + dprintk("%s: layoutcommit pending\n", __func__); + status = htonl(NFS4ERR_DELAY); + goto out; + } status = pnfs_layoutcommit_inode(ino, 1); if (status) { dprintk("%s: layoutcommit failed, status=%d. 
" @@ -763,11 +771,7 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, } if (!lo) { dprintk("%s: no layout segments to return\n", __func__); - /* must send the LAYOUTRETURN in response to recall */ - if (stateid) - goto send_return; - else - goto out; + goto out; } /* unlock w/o put rebalanced by eventual call to @@ -776,13 +780,22 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, unlock_current_layout(nfsi); if (pnfs_return_layout_barrier(nfsi, &arg)) { + if (stateid) { /* callback */ + status = htonl(NFS4ERR_DELAY); + lock_current_layout(nfsi); + put_unlock_current_layout(lo); + goto out; + } dprintk("%s: waiting\n", __func__); wait_event(nfsi->lo_waitq, - !pnfs_return_layout_barrier(nfsi, &arg)); + !pnfs_return_layout_barrier(nfsi, &arg)); } + + if (sendreturn) + status = return_layout(ino, &arg, stateid, type, lo); + else + pnfs_layout_release(lo, &arg); } -send_return: - status = return_layout(ino, &arg, stateid, type, lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1080,31 +1093,12 @@ pnfs_update_layout(struct inode *ino, /* Check to see if the layout for the given range already exists */ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); if (lseg && !lseg->valid) { - unlock_current_layout(nfsi); if (take_ref) put_lseg(lseg); - for (;;) { - prepare_to_wait(&nfsi->lo_waitq, &__wait, - TASK_KILLABLE); - lock_current_layout(nfsi); - lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); - if (!lseg || lseg->valid) - break; - dprintk("%s: invalid lseg %p ref %d\n", __func__, - lseg, atomic_read(&lseg->kref.refcount)-1); - if (take_ref) - put_lseg(lseg); - if (signal_pending(current)) { - lseg = NULL; - result = -ERESTARTSYS; - break; - } - unlock_current_layout(nfsi); - schedule(); - } - finish_wait(&nfsi->lo_waitq, &__wait); - if (result) - goto out_put; + + /* someone is cleaning the layout */ + result = -EAGAIN; + goto out_put; } if (lseg) { @@ -1434,7 +1428,7 @@ 
pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, if (count > 0 && !below_threshold(inode, count, 0)) { status = pnfs_update_layout(inode, ctx, count, - loff, IOMODE_READ, NULL); + loff, IOMODE_READ, NULL); dprintk("%s *rsize %Zd virt update returned %d\n", __func__, *rsize, status); if (status != 0) @@ -1673,7 +1667,7 @@ pnfs_writeback_done(struct nfs_write_data *data) .length = data->args.count, }; dprintk("%s: retrying\n", __func__); - _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE); + _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, 1); pnfs_initiate_write(data, NFS_CLIENT(data->inode), pdata->call_ops, pdata->how); } @@ -1804,7 +1798,7 @@ pnfs_read_done(struct nfs_read_data *data) .length = data->args.count, }; dprintk("%s: retrying\n", __func__); - _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE); + _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, 1); pnfs_initiate_read(data, NFS_CLIENT(data->inode), pdata->call_ops); } @@ -2030,7 +2024,7 @@ pnfs_commit_done(struct nfs_write_data *data) .length = data->args.count, }; dprintk("%s: retrying\n", __func__); - _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE); + _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, 1); pnfs_initiate_commit(data, NFS_CLIENT(data->inode), pdata->call_ops, pdata->how); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 9e43973..794a2d3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -40,7 +40,8 @@ int pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, int _pnfs_return_layout(struct inode *, struct nfs4_pnfs_layout_segment *, const nfs4_stateid *stateid, /* optional */ - enum pnfs_layoutreturn_type); + enum pnfs_layoutreturn_type, + int sendreturn); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); void unmount_pnfs_layoutdriver(struct nfs_server *); int pnfs_use_read(struct inode *inode, ssize_t count); @@ -236,14 +237,15 @@ static inline void 
pnfs_modify_new_request(struct nfs_page *req, static inline int pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *lseg, const nfs4_stateid *stateid, /* optional */ - enum pnfs_layoutreturn_type type) + enum pnfs_layoutreturn_type type, + int sendreturn) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *nfss = NFS_SERVER(ino); if (pnfs_enabled_sb(nfss) && (type != RETURN_FILE || has_layout(nfsi))) - return _pnfs_return_layout(ino, lseg, stateid, type); + return _pnfs_return_layout(ino, lseg, stateid, type, sendreturn); return 0; } -- 1.6.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html