On 2010-09-14 14:47, Fred Isaman wrote: > On Tue, Sep 14, 2010 at 5:40 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote: >> On 2010-09-14 14:00, Fred Isaman wrote: >>> Note that this waits on any range that overlaps both an outstanding >>> LAYOUTRETURN and a current layout. Whereas what you really want is to >>> wait on any range that overlaps an outstanding LAYOUTRETURN. One way >>> to get the desired behavior would be to have LAYOUTRETURN insert an >>> "invalid" lseg covering its range. >> >> What do you mean by "current layout"? > > Consider the case where the server asks for a LAYOUTRETURN of the > range 0-1000, but the client currently only has lsegs covering 0-500. > The client should wait for the LAYOUTRETURN to complete before sending > any LAYOUTGET for the range 500-1000, but there is currently no > mechanism to do this. > Yes, this could be yet another optimization to avoid getting NFS4ERR_RECALLCONFLICT. Benny > Fred > >> The way it works now is that layout return does mark the layout segments >> it's about to return as invalid (in pnfs_return_layout_barrier). >> layout get inserts a valid layout segment only upon completion >> (in pnfs_layout_process) and it does not yet insert an invalid one >> before sending the RPC, something we may wan to consider in the future >> to prevent superfluous parallel gets. >> >> Benny >> >>> >>> Fred >>> >>> >>> On Tue, Sep 14, 2010 at 3:53 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote: >>>> Introduce lo_rpcwaitq and sleep on it while there's an lseg marked invalid, >>>> i.e. it is being returned (in the future, it could also be layoutget in progress). >>>> >>>> Signed-off-by: Benny Halevy <bhalevy@xxxxxxxxxxx> >>>> --- >>>> fs/nfs/inode.c | 1 + >>>> fs/nfs/nfs4proc.c | 25 ++++++++++++++++++++++--- >>>> fs/nfs/pnfs.c | 37 ++++++++++++++++++------------------- >>>> fs/nfs/pnfs.h | 3 +++ >>>> include/linux/nfs_fs.h | 1 + >>>> 5 files changed, 45 insertions(+), 22 deletions(-) >>>> >>>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c >>>> index 7621cfc..059bdf7 100644 >>>> --- a/fs/nfs/inode.c >>>> +++ b/fs/nfs/inode.c >>>> @@ -1462,6 +1462,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) >>>> init_rwsem(&nfsi->rwsem); >>>> #ifdef CONFIG_NFS_V4_1 >>>> init_waitqueue_head(&nfsi->lo_waitq); >>>> + rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout"); >>>> nfsi->pnfs_layout_suspend = 0; >>>> nfsi->layout = NULL; >>>> #endif /* CONFIG_NFS_V4_1 */ >>>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c >>>> index 48a763e..f12568d 100644 >>>> --- a/fs/nfs/nfs4proc.c >>>> +++ b/fs/nfs/nfs4proc.c >>>> @@ -5442,13 +5442,32 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) >>>> { >>>> struct nfs4_layoutget *lgp = calldata; >>>> struct inode *ino = lgp->args.inode; >>>> + struct nfs_inode *nfsi = NFS_I(ino); >>>> struct nfs_server *server = NFS_SERVER(ino); >>>> + struct pnfs_layout_segment *lseg; >>>> >>>> dprintk("--> %s\n", __func__); >>>> - if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, >>>> - &lgp->res.seq_res, 0, task)) >>>> + spin_lock(&ino->i_lock); >>>> + lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range); >>>> + if (likely(!lseg)) { >>>> + spin_unlock(&ino->i_lock); >>>> + dprintk("%s: no lseg found, proceeding\n", __func__); >>>> + if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args, >>>> + &lgp->res.seq_res, 0, task)) >>>> + rpc_call_start(task); >>>> return; >>>> - rpc_call_start(task); >>>> + } >>>> + if (!lseg->valid) { >>>> + put_lseg_locked(lseg); >>>> + spin_unlock(&ino->i_lock); >>>> + dprintk("%s: invalid lseg found, waiting\n", __func__); >>>> + rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL); >>>> + return; >>>> + } >>>> + *lgp->lsegpp = lseg; >>>> + spin_unlock(&ino->i_lock); >>>> + dprintk("%s: valid lseg found, no rpc required\n", __func__); >>>> + rpc_exit(task, NFS4_OK); >>>> } >>>> >>>> static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) >>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c >>>> index bc4f0c1..2450383 100644 >>>> --- a/fs/nfs/pnfs.c >>>> +++ b/fs/nfs/pnfs.c >>>> @@ -430,7 +430,7 @@ destroy_lseg(struct kref *kref) >>>> PNFS_LD_IO_OPS(lseg->layout)->free_lseg(lseg); >>>> } >>>> >>>> -static void >>>> +void >>>> put_lseg_locked(struct pnfs_layout_segment *lseg) >>>> { >>>> bool do_wake_up; >>>> @@ -444,8 +444,10 @@ put_lseg_locked(struct pnfs_layout_segment *lseg) >>>> do_wake_up = !lseg->valid; >>>> nfsi = PNFS_NFS_INODE(lseg->layout); >>>> kref_put(&lseg->kref, destroy_lseg); >>>> - if (do_wake_up) >>>> + if (do_wake_up) { >>>> wake_up(&nfsi->lo_waitq); >>>> + rpc_wake_up(&nfsi->lo_rpcwaitq); >>>> + } >>>> } >>>> >>>> void >>>> @@ -999,7 +1001,7 @@ has_matching_lseg(struct pnfs_layout_segment *lseg, >>>> /* >>>> * lookup range in layout >>>> */ >>>> -static struct pnfs_layout_segment * >>>> +struct pnfs_layout_segment * >>>> pnfs_has_layout(struct pnfs_layout_hdr *lo, >>>> struct pnfs_layout_range *range) >>>> { >>>> @@ -1057,22 +1059,20 @@ _pnfs_update_layout(struct inode *ino, >>>> >>>> /* Check to see if the layout for the given range already exists */ >>>> lseg = pnfs_has_layout(lo, &arg); >>>> - if (lseg && !lseg->valid) { >>>> - put_lseg_locked(lseg); >>>> - /* someone is cleaning the layout */ >>>> - lseg = NULL; >>>> - goto out_unlock; >>>> - } >>>> - >>>> if (lseg) { >>>> - dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", >>>> - __func__, >>>> - lseg, >>>> - arg.length, >>>> - arg.offset, >>>> - arg.iomode); >>>> + if (lseg->valid) { >>>> + dprintk("%s: Using cached lseg %p for %llu@%llu " >>>> + "iomode %d)\n", >>>> + __func__, >>>> + lseg, >>>> + arg.length, >>>> + arg.offset, >>>> + arg.iomode); >>>> >>>> - goto out_unlock; >>>> + goto out_unlock; >>>> + } >>>> + /* someone is cleaning the layout */ >>>> + put_lseg_locked(lseg); >>>> } >>>> >>>> /* if get layout already failed once goto out */ >>>> @@ -1097,8 +1097,7 @@ out: >>>> nfsi->layout->state, lseg); >>>> return; >>>> out_unlock: >>>> - if (lsegpp) >>>> - *lsegpp = lseg; >>>> + *lsegpp = lseg; >>>> spin_unlock(&ino->i_lock); >>>> goto out; >>>> } >>>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h >>>> index ea70385..f7f567e 100644 >>>> --- a/fs/nfs/pnfs.h >>>> +++ b/fs/nfs/pnfs.h >>>> @@ -34,6 +34,9 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); >>>> /* pnfs.c */ >>>> extern const nfs4_stateid zero_stateid; >>>> >>>> +void put_lseg_locked(struct pnfs_layout_segment *); >>>> +struct pnfs_layout_segment *pnfs_has_layout(struct pnfs_layout_hdr *, >>>> + struct pnfs_layout_range *); >>>> void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, >>>> loff_t pos, u64 count, enum pnfs_iomode access_type, >>>> struct pnfs_layout_segment **lsegpp); >>>> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h >>>> index 5bafa95..196c0c6 100644 >>>> --- a/include/linux/nfs_fs.h >>>> +++ b/include/linux/nfs_fs.h >>>> @@ -213,6 +213,7 @@ struct nfs_inode { >>>> /* pNFS layout information */ >>>> #if defined(CONFIG_NFS_V4_1) >>>> wait_queue_head_t lo_waitq; >>>> + struct rpc_wait_queue lo_rpcwaitq; >>>> struct pnfs_layout_hdr *layout; >>>> time_t pnfs_layout_suspend; >>>> #endif /* CONFIG_NFS_V4_1 */ >>>> -- >>>> 1.7.2.2 >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in >>>> the body of a message to majordomo@xxxxxxxxxxxxxxx >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>>> >> -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html