Re: [PATCH 7/8] pnfs-submit: forgetful client (layouts)

Alexandros Batsakis <alexandros@xxxxxxx> · Tue, 8 Jun 2010 00:51:16 -0700

On Tue, Jun 8, 2010 at 12:23 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote:
> On Jun. 08, 2010, 0:11 +0300, Alexandros Batsakis <batsakis@xxxxxxxxxx> wrote:
>> Forgetful client model:
>>
>> If we receive a CB_LAYOUTRECALL
>>         - we spawn a thread to handle the recall
>>         (xxx: now only one recall can be active at a time, else NFS4ERR_DELAY)
>>         - we check the stateid seqid
>>         if it does not match we return NFS4ERR_DELAY
>>         - we check for pending I/O
>>         if there is we return NFS4ERR_DELAY
>>         Else we return NO_MATCHING_LAYOUT.
>>         Note that for whole file layouts there is no need to serialize LAYOUTGETs/LAYOUTRETURNs
>> For bulk layouts, if there is a layout active, we return NFS4_OK and we start
>> cleaning the layouts asynchronously. At the end we send a bulk LAYOUTRETURN.
>> Note that there is no need to prevent any new LAYOUTGETs explicitly as the server should reject them.
>>
>> Signed-off-by: Alexandros Batsakis <batsakis@xxxxxxxxxx>
>> ---
>>  fs/nfs/callback_proc.c |  146 ++++++++++++++++++++++++++++++++++--------------
>>  fs/nfs/nfs4_fs.h       |    1 +
>>  fs/nfs/pnfs.c          |   70 ++++++++++-------------
>>  3 files changed, 136 insertions(+), 81 deletions(-)
>>
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index 3bae785..af7a01d 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -129,6 +129,38 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>>
>>  #if defined(CONFIG_NFS_V4_1)
>>
>> +static bool
>> +pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo,
>> +                         const nfs4_stateid stateid)
>> +{
>> +     int seqlock;
>> +     bool res;
>> +     u32 oldseqid, newseqid;
>> +
>> +     do {
>> +             seqlock = read_seqbegin(&lo->seqlock);
>> +             oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid);
>> +             newseqid = be32_to_cpu(stateid.u.stateid.seqid);
>> +             res = !memcmp(lo->stateid.u.stateid.other,
>> +                           stateid.u.stateid.other,
>> +                           NFS4_STATEID_OTHER_SIZE);
>> +             if (res) { /* comparing layout stateids */
>> +                     if (oldseqid == ~0)
>> +                             res = (newseqid == 1);
>> +                     else
>> +                             res = (newseqid == oldseqid + 1);
>> +             } else { /* open stateid */
>> +                     res = !memcmp(lo->stateid.u.data,
>> +                                   &zero_stateid,
>> +                                   NFS4_STATEID_SIZE);
>> +                     if (res)
>> +                             res = (newseqid == 1);
>> +             }
>> +     } while (read_seqretry(&lo->seqlock, seqlock));
>> +
>> +     return res;
>> +}
>> +
>>  /*
>>   * Retrieve an inode based on layout recall parameters
>>   *
>> @@ -191,9 +223,10 @@ static int pnfs_recall_layout(void *data)
>>       struct inode *inode, *ino;
>>       struct nfs_client *clp;
>>       struct cb_pnfs_layoutrecallargs rl;
>> +     struct nfs4_pnfs_layoutreturn *lrp;
>>       struct recall_layout_threadargs *args =
>>               (struct recall_layout_threadargs *)data;
>> -     int status;
>> +     int status = 0;
>>
>>       daemonize("nfsv4-layoutreturn");
>>
>> @@ -204,47 +237,59 @@ static int pnfs_recall_layout(void *data)
>>       clp = args->clp;
>>       inode = args->inode;
>>       rl = *args->rl;
>> -     args->result = 0;
>> -     complete(&args->started);
>> -     args = NULL;
>> -     /* Note: args must not be used after this point!!! */
>> -
>> -/* FIXME: need barrier here:
>> -   pause I/O to data servers
>> -   pause layoutgets
>> -   drain all outstanding writes to storage devices
>> -   wait for any outstanding layoutreturns and layoutgets mentioned in
>> -   cb_sequence.
>> -   then return layouts, resume after layoutreturns complete
>> - */
>>
>>       /* support whole file layouts only */
>>       rl.cbl_seg.offset = 0;
>>       rl.cbl_seg.length = NFS4_MAX_UINT64;
>>
>>       if (rl.cbl_recall_type == RETURN_FILE) {
>> -             status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid,
>> -                                         RETURN_FILE, true);
>> +             if (pnfs_is_next_layout_stateid(&NFS_I(inode)->layout,
>> +                                             rl.cbl_stateid))
>> +                     status = pnfs_return_layout(inode, &rl.cbl_seg,
>> +                                                 &rl.cbl_stateid, RETURN_FILE,
>> +                                                 false);
>> +             else
>> +                     status = cpu_to_be32(NFS4ERR_DELAY);
>>               if (status)
>>                       dprintk("%s RETURN_FILE error: %d\n", __func__, status);
>> +             else
>> +                     status =  cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
>> +             args->result = status;
>> +             complete(&args->started);
>>               goto out;
>>       }
>>
>> -     /* FIXME: This loop is inefficient, running in O(|s_inodes|^2) */
>> +     status = cpu_to_be32(NFS4_OK);
>> +     args->result = status;
>> +     complete(&args->started);
>> +     args = NULL;
>> +
>> +     /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
>>       while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
>> -             /* XXX need to check status on pnfs_return_layout */
>> -             pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, true);
>> +             /* FIXME: need to check status on pnfs_return_layout */
>> +             pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false);
>>               iput(ino);
>>       }
>>
>> +     lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
>> +     if (!lrp) {
>> +             dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
>> +                     __func__);
>> +             goto out;
>> +     }
>> +
>>       /* send final layoutreturn */
>> -     status = pnfs_return_layout(inode, &rl.cbl_seg, NULL,
>> -                                 rl.cbl_recall_type, true);
>> -     if (status)
>> -             printk(KERN_INFO "%s: ignoring pnfs_return_layout status=%d\n",
>> -                             __func__, status);
>> +     lrp->args.reclaim = 0;
>> +     lrp->args.layout_type = rl.cbl_layout_type;
>> +     lrp->args.return_type = rl.cbl_recall_type;
>> +     lrp->args.lseg = rl.cbl_seg;
>> +     lrp->args.inode = inode;
>> +     lrp->lo = NULL;
>> +     pnfs4_proc_layoutreturn(lrp, true);
>> +
>>  out:
>> -     iput(inode);
>> +     clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>> +     nfs_put_client(clp);
>>       module_put_and_exit(0);
>>       dprintk("%s: exit status %d\n", __func__, 0);
>>       return 0;
>> @@ -262,15 +307,18 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
>>               .rl = rl,
>>       };
>>       struct task_struct *t;
>> -     int status;
>> -
>> -     /* should have returned NFS4ERR_NOMATCHING_LAYOUT... */
>> -     BUG_ON(inode == NULL);
>> +     int status = -EAGAIN;
>>
>>       dprintk("%s: -->\n", __func__);
>>
>> +     /* FIXME: do not allow two concurrent layout recalls */
>> +     if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
>> +             return status;
>> +
>>       init_completion(&data.started);
>>       __module_get(THIS_MODULE);
>> +     if (!atomic_inc_not_zero(&clp->cl_count))
>> +             goto out_put_no_client;
>>
>>       t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
>>       if (IS_ERR(t)) {
>> @@ -284,6 +332,9 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
>>       wait_for_completion(&data.started);
>>       return data.result;
>>  out_module_put:
>> +     nfs_put_client(clp);
>> +out_put_no_client:
>> +     clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>>       module_put(THIS_MODULE);
>>       return status;
>>  }
>> @@ -294,35 +345,46 @@ __be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args,
>>       struct nfs_client *clp;
>>       struct inode *inode = NULL;
>>       __be32 res;
>> +     int status;
>>       unsigned int num_client = 0;
>>
>>       dprintk("%s: -->\n", __func__);
>>
>> -     res = htonl(NFS4ERR_INVAL);
>> -     clp = nfs_find_client(args->cbl_addr, 4);
>> +     res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
>> +     clp  = nfs_find_client(args->cbl_addr, 4);
>>       if (clp == NULL) {
>>               dprintk("%s: no client for addr %u.%u.%u.%u\n",
>>                       __func__, NIPQUAD(args->cbl_addr));
>>               goto out;
>>       }
>>
>> -     res = htonl(NFS4ERR_NOMATCHING_LAYOUT);
>> +     res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
>>       do {
>>               struct nfs_client *prev = clp;
>>               num_client++;
>> -             inode = nfs_layoutrecall_find_inode(clp, args);
>> -             if (inode != NULL) {
>> -                     if (PNFS_LD(&NFS_I(inode)->layout)->id ==
>> -                         args->cbl_layout_type) {
>> -                             /* Set up a helper thread to actually
>> -                              * return the delegation */
>> -                             res = pnfs_async_return_layout(clp, inode, args);
>> -                             if (res != 0)
>> -                                     res = htonl(NFS4ERR_RESOURCE);
>> -                             break;
>> +             /* the callback must come from the MDS personality */
>> +             if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
>> +                     goto loop;
>> +             if (args->cbl_recall_type == RETURN_FILE) {
>> +                     inode = nfs_layoutrecall_find_inode(clp, args);
>> +                     if (inode != NULL) {
>> +                             status = pnfs_async_return_layout(clp, inode,
>> +                                                               args);
>> +                             if (status == -EAGAIN)
>> +                                     res = cpu_to_be32(NFS4ERR_DELAY);
>
> what about other errors?
>

pnfs_async_return_layout does not send any RPCs, so it's either EAGAIN
or an "out of memory" error in which case I guess it's better to
return NFS4ERR_RESOURCE than NFS4ERR_NO_MATCHING_LAYOUT. So you are
right, I ll send a fix.

>> +                             iput(inode);
>>                       }
>> +             } else { /* _ALL or _FSID */
>> +                     /* we need the inode to get the nfs_server struct */
>> +                     inode = nfs_layoutrecall_find_inode(clp, args);
>> +                     if (!inode)
>> +                             goto loop;
>> +                     status = pnfs_async_return_layout(clp, inode, args);
>> +                     if (status == -EAGAIN)
>> +                             res = cpu_to_be32(NFS4ERR_DELAY);
>
> ditto
>
>>                       iput(inode);
>>               }
>> +loop:
>>               clp = nfs_find_client_next(prev);
>>               nfs_put_client(prev);
>>       } while (clp != NULL);
>> diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
>> index ebc9b3b..2f7974b 100644
>> --- a/fs/nfs/nfs4_fs.h
>> +++ b/fs/nfs/nfs4_fs.h
>> @@ -47,6 +47,7 @@ enum nfs4_client_state {
>>       NFS4CLNT_SESSION_RESET,
>>       NFS4CLNT_SESSION_DRAINING,
>>       NFS4CLNT_RECALL_SLOT,
>> +     NFS4CLNT_LAYOUT_RECALL,
>>  };
>>
>>  /*
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index d0b45bf..2006926 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -709,6 +709,8 @@ return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>>
>>       dprintk("--> %s\n", __func__);
>>
>> +     BUG_ON(type != RETURN_FILE);
>> +
>>       lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
>>       if (lrp == NULL) {
>>               if (lo && (type == RETURN_FILE))
>> @@ -745,13 +747,11 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>>
>>       dprintk("--> %s type %d\n", __func__, type);
>>
>> -     if (range)
>> -             arg = *range;
>> -     else {
>> -             arg.iomode = IOMODE_ANY;
>> -             arg.offset = 0;
>> -             arg.length = NFS4_MAX_UINT64;
>> -     }
>> +
>> +     arg.iomode = range ? range->iomode : IOMODE_ANY;
>> +     arg.offset = 0;
>> +     arg.length = NFS4_MAX_UINT64;
>> +
>>       if (type == RETURN_FILE) {
>>               lo = get_lock_current_layout(nfsi);
>>               if (lo && !has_layout_to_return(lo, &arg)) {
>> @@ -760,11 +760,7 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>>               }
>>               if (!lo) {
>>                       dprintk("%s: no layout segments to return\n", __func__);
>> -                     /* must send the LAYOUTRETURN in response to recall */
>> -                     if (stateid)
>> -                             goto send_return;
>> -                     else
>> -                             goto out;
>> +                     goto out;
>>               }
>>
>>               /* unlock w/o put rebalanced by eventual call to
>> @@ -773,12 +769,23 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>>               unlock_current_layout(nfsi);
>>
>>               if (pnfs_return_layout_barrier(nfsi, &arg)) {
>> +                     if (stateid) { /* callback */
>> +                             status = -EAGAIN;
>> +                             lock_current_layout(nfsi);
>> +                             put_unlock_current_layout(lo);
>> +                             goto out;
>> +                     }
>>                       dprintk("%s: waiting\n", __func__);
>>                       wait_event(nfsi->lo_waitq,
>> -                             !pnfs_return_layout_barrier(nfsi, &arg));
>> +                                !pnfs_return_layout_barrier(nfsi, &arg));
>>               }
>>
>>               if (layoutcommit_needed(nfsi)) {
>> +                     if (stateid && !wait) { /* callback */
>> +                             dprintk("%s: layoutcommit pending\n", __func__);
>> +                             status = -EAGAIN;
>> +                             goto out;
>> +                     }
>>                       status = pnfs_layoutcommit_inode(ino, wait);
>>                       if (status) {
>>                               dprintk("%s: layoutcommit failed, status=%d. "
>> @@ -787,9 +794,13 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>>                               status = 0;
>>                       }
>>               }
>> +
>> +             if (stateid && wait)
>> +                     status = return_layout(ino, &arg, stateid, type,
>> +                                            lo, wait);
>> +             else
>> +                     pnfs_layout_release(lo, &arg);
>>       }
>> -send_return:
>> -     status = return_layout(ino, &arg, stateid, type, lo, wait);
>>  out:
>>       dprintk("<-- %s status: %d\n", __func__, status);
>>       return status;
>> @@ -1044,7 +1055,7 @@ pnfs_update_layout(struct inode *ino,
>>       struct nfs4_pnfs_layout_segment arg = {
>>               .iomode = iomode,
>>               .offset = 0,
>> -             .length = ~0
>> +             .length = NFS4_MAX_UINT64,
>
> why do you have to ask for whole file layouts?
> Isn't it enough to always return the whole layout
> but potentially having more than one layout segment?
>

Supposedly version A will not support multiple segments. Is this what
you mean ? I guarantee it by setting "minlength" equal to "length" in
Layoutget. I just wanted to enforce it here too.

-alexandros

> Benny
>
>>       };
>>       struct nfs_inode *nfsi = NFS_I(ino);
>>       struct pnfs_layout_type *lo;
>> @@ -1063,31 +1074,12 @@ pnfs_update_layout(struct inode *ino,
>>       /* Check to see if the layout for the given range already exists */
>>       lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
>>       if (lseg && !lseg->valid) {
>> -             unlock_current_layout(nfsi);
>>               if (take_ref)
>>                       put_lseg(lseg);
>> -             for (;;) {
>> -                     prepare_to_wait(&nfsi->lo_waitq, &__wait,
>> -                                     TASK_KILLABLE);
>> -                     lock_current_layout(nfsi);
>> -                     lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
>> -                     if (!lseg || lseg->valid)
>> -                             break;
>> -                     dprintk("%s: invalid lseg %p ref %d\n", __func__,
>> -                             lseg, atomic_read(&lseg->kref.refcount)-1);
>> -                     if (take_ref)
>> -                             put_lseg(lseg);
>> -                     if (signal_pending(current)) {
>> -                             lseg = NULL;
>> -                             result = -ERESTARTSYS;
>> -                             break;
>> -                     }
>> -                     unlock_current_layout(nfsi);
>> -                     schedule();
>> -             }
>> -             finish_wait(&nfsi->lo_waitq, &__wait);
>> -             if (result)
>> -                     goto out_put;
>> +
>> +             /* someone is cleaning the layout */
>> +             result = -EAGAIN;
>> +             goto out_put;
>>       }
>>
>>       if (lseg) {
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html