On Tue, Jun 8, 2010 at 12:23 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote: > On Jun. 08, 2010, 0:11 +0300, Alexandros Batsakis <batsakis@xxxxxxxxxx> wrote: >> Forgetful client model: >> >> If we receive a CB_LAYOUTRECALL >> - we spawn a thread to handle the recall >> (xxx: now only one recall can be active at a time, else NFS4ERR_DELAY) >> - we check the stateid seqid >> if it does not match we return NFS4ERR_DELAY >> - we check for pending I/O >> if there is we return NFS4ERR_DELAY >> Else we return NO_MATCHING_LAYOUT. >> Note that for whole file layouts there is no need to serialize LAYOUTGETs/LAYOUTRETURNs >> For bulk layouts, if there is a layout active, we return NFS4_OK and we start >> cleaning the layouts asynchronously. At the end we send a bulk LAYOUTRETURN. >> Note that there is no need to prevent any new LAYOUTGETs explicitly as the server should reject them. >> >> Signed-off-by: Alexandros Batsakis <batsakis@xxxxxxxxxx> >> --- >> fs/nfs/callback_proc.c | 146 ++++++++++++++++++++++++++++++++++-------------- >> fs/nfs/nfs4_fs.h | 1 + >> fs/nfs/pnfs.c | 70 ++++++++++------------- >> 3 files changed, 136 insertions(+), 81 deletions(-) >> >> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c >> index 3bae785..af7a01d 100644 >> --- a/fs/nfs/callback_proc.c >> +++ b/fs/nfs/callback_proc.c >> @@ -129,6 +129,38 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf >> >> #if defined(CONFIG_NFS_V4_1) >> >> +static bool >> +pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo, >> + const nfs4_stateid stateid) >> +{ >> + int seqlock; >> + bool res; >> + u32 oldseqid, newseqid; >> + >> + do { >> + seqlock = read_seqbegin(&lo->seqlock); >> + oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); >> + newseqid = be32_to_cpu(stateid.u.stateid.seqid); >> + res = !memcmp(lo->stateid.u.stateid.other, >> + stateid.u.stateid.other, >> + NFS4_STATEID_OTHER_SIZE); >> + if (res) { /* comparing layout stateids */ >> + if (oldseqid == ~0) >> + res = (newseqid == 1); >> + else >> + res = (newseqid == oldseqid + 1); >> + } else { /* open stateid */ >> + res = !memcmp(lo->stateid.u.data, >> + &zero_stateid, >> + NFS4_STATEID_SIZE); >> + if (res) >> + res = (newseqid == 1); >> + } >> + } while (read_seqretry(&lo->seqlock, seqlock)); >> + >> + return res; >> +} >> + >> /* >> * Retrieve an inode based on layout recall parameters >> * >> @@ -191,9 +223,10 @@ static int pnfs_recall_layout(void *data) >> struct inode *inode, *ino; >> struct nfs_client *clp; >> struct cb_pnfs_layoutrecallargs rl; >> + struct nfs4_pnfs_layoutreturn *lrp; >> struct recall_layout_threadargs *args = >> (struct recall_layout_threadargs *)data; >> - int status; >> + int status = 0; >> >> daemonize("nfsv4-layoutreturn"); >> >> @@ -204,47 +237,59 @@ static int pnfs_recall_layout(void *data) >> clp = args->clp; >> inode = args->inode; >> rl = *args->rl; >> - args->result = 0; >> - complete(&args->started); >> - args = NULL; >> - /* Note: args must not be used after this point!!! */ >> - >> -/* FIXME: need barrier here: >> - pause I/O to data servers >> - pause layoutgets >> - drain all outstanding writes to storage devices >> - wait for any outstanding layoutreturns and layoutgets mentioned in >> - cb_sequence. >> - then return layouts, resume after layoutreturns complete >> - */ >> >> /* support whole file layouts only */ >> rl.cbl_seg.offset = 0; >> rl.cbl_seg.length = NFS4_MAX_UINT64; >> >> if (rl.cbl_recall_type == RETURN_FILE) { >> - status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid, >> - RETURN_FILE, true); >> + if (pnfs_is_next_layout_stateid(&NFS_I(inode)->layout, >> + rl.cbl_stateid)) >> + status = pnfs_return_layout(inode, &rl.cbl_seg, >> + &rl.cbl_stateid, RETURN_FILE, >> + false); >> + else >> + status = cpu_to_be32(NFS4ERR_DELAY); >> if (status) >> dprintk("%s RETURN_FILE error: %d\n", __func__, status); >> + else >> + status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); >> + args->result = status; >> + complete(&args->started); >> goto out; >> } >> >> - /* FIXME: This loop is inefficient, running in O(|s_inodes|^2) */ >> + status = cpu_to_be32(NFS4_OK); >> + args->result = status; >> + complete(&args->started); >> + args = NULL; >> + >> + /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ >> while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { >> - /* XXX need to check status on pnfs_return_layout */ >> - pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, true); >> + /* FIXME: need to check status on pnfs_return_layout */ >> + pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); >> iput(ino); >> } >> >> + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); >> + if (!lrp) { >> + dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", >> + __func__); >> + goto out; >> + } >> + >> /* send final layoutreturn */ >> - status = pnfs_return_layout(inode, &rl.cbl_seg, NULL, >> - rl.cbl_recall_type, true); >> - if (status) >> - printk(KERN_INFO "%s: ignoring pnfs_return_layout status=%d\n", >> - __func__, status); >> + lrp->args.reclaim = 0; >> + lrp->args.layout_type = rl.cbl_layout_type; >> + lrp->args.return_type = rl.cbl_recall_type; >> + lrp->args.lseg = rl.cbl_seg; >> + lrp->args.inode = inode; >> + lrp->lo = NULL; >> + pnfs4_proc_layoutreturn(lrp, true); >> + >> out: >> - iput(inode); >> + clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); >> + nfs_put_client(clp); >> module_put_and_exit(0); >> dprintk("%s: exit status %d\n", __func__, 0); >> return 0; >> @@ -262,15 +307,18 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, >> .rl = rl, >> }; >> struct task_struct *t; >> - int status; >> - >> - /* should have returned NFS4ERR_NOMATCHING_LAYOUT... */ >> - BUG_ON(inode == NULL); >> + int status = -EAGAIN; >> >> dprintk("%s: -->\n", __func__); >> >> + /* FIXME: do not allow two concurrent layout recalls */ >> + if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) >> + return status; >> + >> init_completion(&data.started); >> __module_get(THIS_MODULE); >> + if (!atomic_inc_not_zero(&clp->cl_count)) >> + goto out_put_no_client; >> >> t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); >> if (IS_ERR(t)) { >> @@ -284,6 +332,9 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, >> wait_for_completion(&data.started); >> return data.result; >> out_module_put: >> + nfs_put_client(clp); >> +out_put_no_client: >> + clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); >> module_put(THIS_MODULE); >> return status; >> } >> @@ -294,35 +345,46 @@ __be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args, >> struct nfs_client *clp; >> struct inode *inode = NULL; >> __be32 res; >> + int status; >> unsigned int num_client = 0; >> >> dprintk("%s: -->\n", __func__); >> >> - res = htonl(NFS4ERR_INVAL); >> - clp = nfs_find_client(args->cbl_addr, 4); >> + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); >> + clp = nfs_find_client(args->cbl_addr, 4); >> if (clp == NULL) { >> dprintk("%s: no client for addr %u.%u.%u.%u\n", >> __func__, NIPQUAD(args->cbl_addr)); >> goto out; >> } >> >> - res = htonl(NFS4ERR_NOMATCHING_LAYOUT); >> + res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); >> do { >> struct nfs_client *prev = clp; >> num_client++; >> - inode = nfs_layoutrecall_find_inode(clp, args); >> - if (inode != NULL) { >> - if (PNFS_LD(&NFS_I(inode)->layout)->id == >> - args->cbl_layout_type) { >> - /* Set up a helper thread to actually >> - * return the delegation */ >> - res = pnfs_async_return_layout(clp, inode, args); >> - if (res != 0) >> - res = htonl(NFS4ERR_RESOURCE); >> - break; >> + /* the callback must come from the MDS personality */ >> + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) >> + goto loop; >> + if (args->cbl_recall_type == RETURN_FILE) { >> + inode = nfs_layoutrecall_find_inode(clp, args); >> + if (inode != NULL) { >> + status = pnfs_async_return_layout(clp, inode, >> + args); >> + if (status == -EAGAIN) >> + res = cpu_to_be32(NFS4ERR_DELAY); > > what about other errors? > pnfs_async_return_layout does not send any RPCs, so it's either EAGAIN or an "out of memory" error in which case I guess it's better to return NFS4ERR_RESOURCE than NFS4ERR_NO_MATCHING_LAYOUT. So you are right, I ll send a fix. >> + iput(inode); >> } >> + } else { /* _ALL or _FSID */ >> + /* we need the inode to get the nfs_server struct */ >> + inode = nfs_layoutrecall_find_inode(clp, args); >> + if (!inode) >> + goto loop; >> + status = pnfs_async_return_layout(clp, inode, args); >> + if (status == -EAGAIN) >> + res = cpu_to_be32(NFS4ERR_DELAY); > > ditto > >> iput(inode); >> } >> +loop: >> clp = nfs_find_client_next(prev); >> nfs_put_client(prev); >> } while (clp != NULL); >> diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h >> index ebc9b3b..2f7974b 100644 >> --- a/fs/nfs/nfs4_fs.h >> +++ b/fs/nfs/nfs4_fs.h >> @@ -47,6 +47,7 @@ enum nfs4_client_state { >> NFS4CLNT_SESSION_RESET, >> NFS4CLNT_SESSION_DRAINING, >> NFS4CLNT_RECALL_SLOT, >> + NFS4CLNT_LAYOUT_RECALL, >> }; >> >> /* >> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c >> index d0b45bf..2006926 100644 >> --- a/fs/nfs/pnfs.c >> +++ b/fs/nfs/pnfs.c >> @@ -709,6 +709,8 @@ return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, >> >> dprintk("--> %s\n", __func__); >> >> + BUG_ON(type != RETURN_FILE); >> + >> lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); >> if (lrp == NULL) { >> if (lo && (type == RETURN_FILE)) >> @@ -745,13 +747,11 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, >> >> dprintk("--> %s type %d\n", __func__, type); >> >> - if (range) >> - arg = *range; >> - else { >> - arg.iomode = IOMODE_ANY; >> - arg.offset = 0; >> - arg.length = NFS4_MAX_UINT64; >> - } >> + >> + arg.iomode = range ? range->iomode : IOMODE_ANY; >> + arg.offset = 0; >> + arg.length = NFS4_MAX_UINT64; >> + >> if (type == RETURN_FILE) { >> lo = get_lock_current_layout(nfsi); >> if (lo && !has_layout_to_return(lo, &arg)) { >> @@ -760,11 +760,7 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, >> } >> if (!lo) { >> dprintk("%s: no layout segments to return\n", __func__); >> - /* must send the LAYOUTRETURN in response to recall */ >> - if (stateid) >> - goto send_return; >> - else >> - goto out; >> + goto out; >> } >> >> /* unlock w/o put rebalanced by eventual call to >> @@ -773,12 +769,23 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, >> unlock_current_layout(nfsi); >> >> if (pnfs_return_layout_barrier(nfsi, &arg)) { >> + if (stateid) { /* callback */ >> + status = -EAGAIN; >> + lock_current_layout(nfsi); >> + put_unlock_current_layout(lo); >> + goto out; >> + } >> dprintk("%s: waiting\n", __func__); >> wait_event(nfsi->lo_waitq, >> - !pnfs_return_layout_barrier(nfsi, &arg)); >> + !pnfs_return_layout_barrier(nfsi, &arg)); >> } >> >> if (layoutcommit_needed(nfsi)) { >> + if (stateid && !wait) { /* callback */ >> + dprintk("%s: layoutcommit pending\n", __func__); >> + status = -EAGAIN; >> + goto out; >> + } >> status = pnfs_layoutcommit_inode(ino, wait); >> if (status) { >> dprintk("%s: layoutcommit failed, status=%d. " >> @@ -787,9 +794,13 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range, >> status = 0; >> } >> } >> + >> + if (stateid && wait) >> + status = return_layout(ino, &arg, stateid, type, >> + lo, wait); >> + else >> + pnfs_layout_release(lo, &arg); >> } >> -send_return: >> - status = return_layout(ino, &arg, stateid, type, lo, wait); >> out: >> dprintk("<-- %s status: %d\n", __func__, status); >> return status; >> @@ -1044,7 +1055,7 @@ pnfs_update_layout(struct inode *ino, >> struct nfs4_pnfs_layout_segment arg = { >> .iomode = iomode, >> .offset = 0, >> - .length = ~0 >> + .length = NFS4_MAX_UINT64, > > why do you have to ask for whole file layouts? > Isn't it enough to always return the whole layout > but potentially having more than one layout segment? > Supposedly version A will not support multiple segments. Is this what you mean ? I guarantee it by setting "minlength" equal to "length" in Layoutget. I just wanted to enforce it here too. -alexandros > Benny > >> }; >> struct nfs_inode *nfsi = NFS_I(ino); >> struct pnfs_layout_type *lo; >> @@ -1063,31 +1074,12 @@ pnfs_update_layout(struct inode *ino, >> /* Check to see if the layout for the given range already exists */ >> lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); >> if (lseg && !lseg->valid) { >> - unlock_current_layout(nfsi); >> if (take_ref) >> put_lseg(lseg); >> - for (;;) { >> - prepare_to_wait(&nfsi->lo_waitq, &__wait, >> - TASK_KILLABLE); >> - lock_current_layout(nfsi); >> - lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); >> - if (!lseg || lseg->valid) >> - break; >> - dprintk("%s: invalid lseg %p ref %d\n", __func__, >> - lseg, atomic_read(&lseg->kref.refcount)-1); >> - if (take_ref) >> - put_lseg(lseg); >> - if (signal_pending(current)) { >> - lseg = NULL; >> - result = -ERESTARTSYS; >> - break; >> - } >> - unlock_current_layout(nfsi); >> - schedule(); >> - } >> - finish_wait(&nfsi->lo_waitq, &__wait); >> - if (result) >> - goto out_put; >> + >> + /* someone is cleaning the layout */ >> + result = -EAGAIN; >> + goto out_put; >> } >> >> if (lseg) { > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html