On Dec 23, 2010, at 2:25 PM, Trond Myklebust wrote:

> On Thu, 2010-12-23 at 12:29 -0500, Fred Isaman wrote:
>> This is the heart of the wave 2 submission.  Add the code to trigger
>> drain and forget of any affected layouts.  In addition, we set a
>> "barrier", below which any LAYOUTGET reply is ignored.  This is to
>> compensate for the fact that we do not wait for outstanding LAYOUTGETs
>> to complete as per section 12.5.5.2.1 of RFC 5661.
>>
>> Signed-off-by: Fred Isaman <iisaman@xxxxxxxxxx>
>> ---
>>  fs/nfs/callback_proc.c |  119 +++++++++++++++++++++++++++++++++++++++++++++++-
>>  fs/nfs/nfs4_fs.h       |    1 +
>>  fs/nfs/pnfs.c          |   84 ++++++++++++++++++++++++++--------
>>  fs/nfs/pnfs.h          |   12 +++++
>>  4 files changed, 196 insertions(+), 20 deletions(-)
>>
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index c1bb157..a8e8e20 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -12,6 +12,7 @@
>>  #include "callback.h"
>>  #include "delegation.h"
>>  #include "internal.h"
>> +#include "pnfs.h"
>>
>>  #ifdef NFS_DEBUG
>>  #define NFSDBG_FACILITY NFSDBG_CALLBACK
>> @@ -107,10 +108,126 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>>
>>  #if defined(CONFIG_NFS_V4_1)
>>
>> +static int initiate_layout_draining(struct nfs_client *clp,
>            ^^^ u32
>
>> +                                  struct cb_layoutrecallargs *args)
>> +{
>> +        struct pnfs_layout_hdr *lo;
>> +        int rv = NFS4ERR_NOMATCHING_LAYOUT;
>        ^^^ u32
>

OK

>> +
>> +        if (args->cbl_recall_type == RETURN_FILE) {
>> +                LIST_HEAD(free_me_list);
>> +
>> +                args->cbl_inode = NULL;
>> +                spin_lock(&clp->cl_lock);
>> +                list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
>> +                        if (nfs_compare_fh(&args->cbl_fh,
>> +                                           &NFS_I(lo->plh_inode)->fh))
>> +                                continue;
>> +                        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
>> +                                rv = NFS4ERR_DELAY;
>> +                        else {
>> +                                /* Without this, layout can be freed as soon
>> +                                 * as we release cl_lock.  Matched in
>> +                                 * do_callback_layoutrecall.
>> +                                 */
>> +                                get_layout_hdr(lo);
>> +                                args->cbl_inode = lo->plh_inode;
>> +                                rv = NFS4_OK;
>> +                        }
>> +                        break;
>> +                }
>> +                spin_unlock(&clp->cl_lock);
>> +
>> +                spin_lock(&lo->plh_inode->i_lock);
>
> What guarantees do you have that the inode still exists here? Does the
> layout segment hold a reference to it?
>

This section is why I once asked about whether the layout needs to
reference the inode.  I believe the answer is that nothing guarantees
it: the lseg/layout definitely does not hold a direct reference.  I
assume doing a grab/put of the inode here and below is acceptable?
(Rough sketch below.)

> Also, how do you know that (!test_bit(NFS_LAYOUT_BULK_RECALL,
> &lo->plh_flags)) is still valid when you get here?
>

Because it cannot be set elsewhere while the NFS4CLNT_LAYOUTRECALL bit
is set.
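
Concretely, I was thinking of something along these lines for the
RETURN_FILE lookup.  This is an untested sketch, not the final patch:
igrab() does not sleep, so it should be safe under cl_lock, and the
matching iput() would sit next to the existing put_layout_hdr() call in
do_callback_layoutrecall():

        spin_lock(&clp->cl_lock);
        list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
                if (nfs_compare_fh(&args->cbl_fh,
                                   &NFS_I(lo->plh_inode)->fh))
                        continue;
                if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                        rv = NFS4ERR_DELAY;
                else {
                        struct inode *ino = igrab(lo->plh_inode);

                        if (ino) {
                                /* Pin both the layout header and the
                                 * inode before dropping cl_lock, so
                                 * neither can go away before we take
                                 * i_lock below.  Both references are
                                 * dropped in do_callback_layoutrecall().
                                 */
                                get_layout_hdr(lo);
                                args->cbl_inode = ino;
                                rv = NFS4_OK;
                        }
                }
                break;
        }
        spin_unlock(&clp->cl_lock);

If igrab() fails the inode is already on its way out, so leaving rv as
NFS4ERR_NOMATCHING_LAYOUT seems like the right answer there.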
>> +                if (rv == NFS4_OK) {
>> +                        lo->plh_block_lgets++;
>> +                        mark_matching_lsegs_invalid(lo, &free_me_list,
>> +                                args->cbl_range.iomode);
>> +                        if (list_empty(&lo->plh_segs))
>> +                                rv = NFS4ERR_NOMATCHING_LAYOUT;
>> +                }
>> +                pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
>> +                spin_unlock(&lo->plh_inode->i_lock);
>> +                pnfs_free_lseg_list(&free_me_list);
>> +        } else {
>> +                struct pnfs_layout_hdr *tmp;
>> +                LIST_HEAD(recall_list);
>> +                LIST_HEAD(free_me_list);
>> +                struct pnfs_layout_range range = {
>> +                        .iomode = IOMODE_ANY,
>> +                        .offset = 0,
>> +                        .length = NFS4_MAX_UINT64,
>> +                };
>> +
>> +                spin_lock(&clp->cl_lock);
>> +                list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
>> +                        if ((args->cbl_recall_type == RETURN_FSID) &&
>> +                            memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
>> +                                   &args->cbl_fsid, sizeof(struct nfs_fsid)))
>> +                                continue;
>> +                        get_layout_hdr(lo);
>> +                        BUG_ON(!list_empty(&lo->plh_bulk_recall));
>> +                        list_add(&lo->plh_bulk_recall, &recall_list);
>> +                }
>> +                spin_unlock(&clp->cl_lock);
>> +                list_for_each_entry_safe(lo, tmp,
>> +                                         &recall_list, plh_bulk_recall) {
>> +                        spin_lock(&lo->plh_inode->i_lock);
>
> Ditto. What guarantees that the inode still exists here, and how do you
> avoid races with a file return request?
>
>> +                        set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
>> +                        mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode);
>> +                        list_del_init(&lo->plh_bulk_recall);
>> +                        if (!list_empty(&lo->plh_segs))
>> +                                rv = NFS4_OK;
>> +                        spin_unlock(&lo->plh_inode->i_lock);
>> +                        put_layout_hdr(lo);
>> +                }
>> +                pnfs_free_lseg_list(&free_me_list);
>> +        }
>> +        return rv;
>> +}
>
> Shouldn't the above really be 2 different functions? One for file
> return, and the other for bulk layout returns?
>

OK

>> +static u32 do_callback_layoutrecall(struct nfs_client *clp,
>> +                                    struct cb_layoutrecallargs *args)
>> +{
>> +        u32 status, res = NFS4ERR_DELAY;
>> +
>> +        dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
>> +        if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
>> +                goto out;
>> +        status = initiate_layout_draining(clp, args);
>> +        if (status)
>            ^^^^^^^^^^^
> If you are going to insist on spelling out 'rv = NFS4_OK' instead of 'rv
> = 0' in initiate_layout_draining(), then shouldn't the above test be 'if
> (status != NFS4_OK)'?
>
> Please just dump all these NFS4_OK references, and replace them with 'rv
> = 0' so that tests like the above make sense.  That's consistent with the
> rest of the NFS code.

OK

>
>> +                res = status;
>> +        else if (args->cbl_recall_type == RETURN_FILE) {
>> +                struct pnfs_layout_hdr *lo;
>> +
>> +                lo = NFS_I(args->cbl_inode)->layout;
>> +                spin_lock(&lo->plh_inode->i_lock);
>> +                lo->plh_block_lgets--;
>> +                spin_unlock(&lo->plh_inode->i_lock);
>> +                put_layout_hdr(lo);
>> +                res = NFS4ERR_DELAY;
>
> Eh? Why doesn't this belong in (the split-up version of)
> initiate_layout_draining()?

You're right.  This is an artifact from when LAYOUTRETURN was sent.
I'll split initiate_layout_draining and fold this hunk into the
file-return half (skeleton of the split at the bottom).

Fred

>
>> +        }
>> +        clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
>> +out:
>> +        dprintk("%s returning %i\n", __func__, res);
>> +        return res;
>> +
>> +}
>> +
>
> --
> Trond Myklebust
> Linux NFS client maintainer
>
> NetApp
> Trond.Myklebust@xxxxxxxxxx
> www.netapp.com
>
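
P.S.  For reference, the split I have in mind looks roughly like the
skeleton below.  It is untested and the bodies are elided;
initiate_file_draining / initiate_bulk_draining are just working names,
and each body would be the corresponding branch of the current
initiate_layout_draining() with the fixes discussed above folded in:

static u32 initiate_file_draining(struct nfs_client *clp,
                                  struct cb_layoutrecallargs *args)
{
        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;

        /* RETURN_FILE: find the layout under cl_lock (pinning the
         * inode and the layout header), invalidate matching lsegs and
         * set the stateid barrier under i_lock, and keep the
         * plh_block_lgets and put_layout_hdr() handling that currently
         * sits in do_callback_layoutrecall().  Drops the NFS4_OK
         * references in favour of 0.
         */
        return rv;
}

static u32 initiate_bulk_draining(struct nfs_client *clp,
                                  struct cb_layoutrecallargs *args)
{
        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;

        /* RETURN_FSID / RETURN_ALL: collect matching layouts onto a
         * private list under cl_lock, then invalidate each one under
         * its own i_lock, as in the current else branch.
         */
        return rv;
}

static u32 do_callback_layoutrecall(struct nfs_client *clp,
                                    struct cb_layoutrecallargs *args)
{
        u32 res = NFS4ERR_DELAY;

        dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
        if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
                goto out;
        if (args->cbl_recall_type == RETURN_FILE)
                res = initiate_file_draining(clp, args);
        else
                res = initiate_bulk_draining(clp, args);
        clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
out:
        dprintk("%s returning %i\n", __func__, res);
        return res;
}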