Re: [RFC PATCH 11/11] ceph: wait for async dir ops to complete before doing synchronous dir ops

Luis Henriques <lhenriques@xxxxxxxx> · Wed, 10 Apr 2019 14:50:13 +0100

Jeff Layton <jlayton@xxxxxxxxxxxxxxx> writes:

> On Wed, Apr 10, 2019 at 8:16 AM Jeff Layton <jlayton@xxxxxxxxxxxxxxx> wrote:
>>
>> On Wed, Apr 10, 2019 at 7:05 AM Luis Henriques <lhenriques@xxxxxxxx> wrote:
>> >
>> > Jeff Layton <jlayton@xxxxxxxxxx> writes:
>> >
>> > > Ensure that we wait on replies from any pending directory operations
>> > > involving children before we allow synchronous operations involving
>> > > that directory to proceed.
>> > >
>> > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
>> > > ---
>> > >  fs/ceph/dir.c   | 65 ++++++++++++++++++++++++++++++++++++++++++++++---
>> > >  fs/ceph/file.c  |  4 +++
>> > >  fs/ceph/super.h |  1 +
>> > >  3 files changed, 66 insertions(+), 4 deletions(-)
>> > >
>> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>> > > index 386c9439a020..0b8cee46e07c 100644
>> > > --- a/fs/ceph/dir.c
>> > > +++ b/fs/ceph/dir.c
>> > > @@ -998,11 +998,16 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>> > >       struct ceph_mds_request *req;
>> > >       int err;
>> > >
>> > > +     dout("link in dir %p old_dentry %p dentry %p\n", dir,
>> > > +          old_dentry, dentry);
>> > > +
>> > >       if (ceph_snap(dir) != CEPH_NOSNAP)
>> > >               return -EROFS;
>> > >
>> > > -     dout("link in dir %p old_dentry %p dentry %p\n", dir,
>> > > -          old_dentry, dentry);
>> > > +     err = ceph_async_dirop_request_wait(dir);
>> > > +     if (err)
>> > > +             return err;
>> > > +
>> > >       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
>> > >       if (IS_ERR(req)) {
>> > >               d_drop(dentry);
>> > > @@ -1041,6 +1046,43 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>> > >       iput(req->r_old_inode);
>> > >  }
>> > >
>> > > +int ceph_async_dirop_request_wait(struct inode *inode)
>> > > +{
>> > > +     struct ceph_inode_info *ci = ceph_inode(inode);
>> > > +     struct ceph_mds_request *req = NULL;
>> > > +     int ret = 0;
>> > > +
>> > > +     /* Only applicable for directories */
>> > > +     if (S_ISDIR(inode->i_mode))
>> > > +             return 0;
>> > > +
>> > > +     spin_lock(&ci->i_unsafe_lock);
>> > > +     if (!list_empty(&ci->i_unsafe_dirops)) {
>> > > +             struct ceph_mds_request *last;
>> > > +             last = list_last_entry(&ci->i_unsafe_dirops,
>> > > +                                    struct ceph_mds_request,
>> > > +                                    r_unsafe_dir_item);
>> > > +             /*
>> > > +              * If last request hasn't gotten a reply, then wait
>> > > +              * for it.
>> > > +              */
>> > > +             if (!test_bit(CEPH_MDS_R_GOT_UNSAFE, &last->r_req_flags) &&
>> > > +                 !test_bit(CEPH_MDS_R_GOT_SAFE, &last->r_req_flags)) {
>> > > +                     req = last;
>> > > +                     ceph_mdsc_get_request(req);
>> > > +             }
>> > > +     }
>> > > +     spin_unlock(&ci->i_unsafe_lock);
>> > > +
>> > > +     if (req) {
>> > > +             dout("%s %p wait on tid %llu\n", __func__, inode,
>> > > +                  req ? req->r_tid : 0ULL);
>> > > +             ret = wait_for_completion_killable(&req->r_completion);
>> > > +             ceph_mdsc_put_request(req);
>> > > +     }
>> > > +     return ret;
>> > > +}
>> > > +
>> > >  /*
>> > >   * rmdir and unlink are differ only by the metadata op code
>> > >   */
>> > > @@ -1064,6 +1106,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>> > >                       CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
>> > >       } else
>> > >               goto out;
>> > > +
>> > > +     /* Wait for any requests involving children to get a reply */
>> > > +     err = ceph_async_dirop_request_wait(dir);
>> > > +     if (err)
>> > > +             goto out;
>> > > +
>> >
>> > In this case, couldn't we move this check into the 'else' branch added
>> > in the previous patch?  IOW, couldn't we have two (or more) asynchronous
>> > unlink operations at the same time?
>> >
>> > Cheers,
>>
>> For this set, it won't matter. We're only doing async unlinks on
>> regular files, and rmdir is still done synchronously. So, if this is a
>> candidate for an async unlink it can't have any child dirops anyway.
>>
>> It's a minor thing, but this has us blocking before the
>> ceph_mdsc_create_request call, which means we won't do any allocation
>> until we're ready to fire off the request, which I like marginally
>> better.
>>
>> Longer term, I'd like to expand this so that can do async rmdirs as
>> well, but that require a different set of caps (FxLx on the parent).
>> Once I get there, I'll probably split off a separate ceph_rmdir
>> inode_operation.
>>
>
> Sorry -- now that I look closer, I think this is a bug. We should be doing:
>
>     err = ceph_async_dirop_request_wait(inode);
>
> Basically, we want to wait for any requests involving children to
> finish before we issue an rmdir. I'll fix that up and also test to see
> whether this improves performance. Good catch!

Heh, obviously I did *not* found that bug -- I probably just got
confused when I saw 'dir' and immediately assumed an rmdir.  So, I
accidentally made you see a bug.  Nice :-)

Cheers,
-- 
Luis

>
> Thanks for the review so far!
>
>> >
>> > >       req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if
>> > > (IS_ERR(req)) { err = PTR_ERR(req); @@ -1115,6 +1163,9 @@ static int
>> > > ceph_rename(struct inode *old_dir, struct dentry *old_dentry, int op =
>> > > CEPH_MDS_OP_RENAME; int err;
>> > >
>> > > +     dout("rename dir %p dentry %p to dir %p dentry %p\n",
>> > > +          old_dir, old_dentry, new_dir, new_dentry);
>> > > +
>> > >       if (flags)
>> > >               return -EINVAL;
>> > >
>> > > @@ -1131,8 +1182,14 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
>> > >           (!ceph_quota_is_same_realm(old_dir, new_dir)))
>> > >               return -EXDEV;
>> > >
>> > > -     dout("rename dir %p dentry %p to dir %p dentry %p\n",
>> > > -          old_dir, old_dentry, new_dir, new_dentry);
>> > > +     err = ceph_async_dirop_request_wait(old_dir);
>> > > +     if (err)
>> > > +             return err;
>> > > +
>> > > +     err = ceph_async_dirop_request_wait(new_dir);
>> > > +     if (err)
>> > > +             return err;
>> > > +
>> > >       req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
>> > >       if (IS_ERR(req))
>> > >               return PTR_ERR(req);
>> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> > > index f24d18f46715..f7e49907514e 100644
>> > > --- a/fs/ceph/file.c
>> > > +++ b/fs/ceph/file.c
>> > > @@ -444,6 +444,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>> > >            dir, dentry, dentry,
>> > >            d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
>> > >
>> > > +     err = ceph_async_dirop_request_wait(dir);
>> > > +     if (err)
>> > > +             return err;
>> > > +
>> > >       if (dentry->d_name.len > NAME_MAX)
>> > >               return -ENAMETOOLONG;
>> > >
>> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> > > index 5c361dc1f47f..e97a6ce31a4e 100644
>> > > --- a/fs/ceph/super.h
>> > > +++ b/fs/ceph/super.h
>> > > @@ -1070,6 +1070,7 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req,
>> > >                              struct dentry *dentry, int err);
>> > >  extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
>> > >                                        struct dentry *dentry, int err);
>> > > +extern int ceph_async_dirop_request_wait(struct inode *inode);
>> > >
>> > >  extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
>> > >  extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
>>
>>
>>
>> --
>> Jeff Layton <jlayton@xxxxxxxxxxxxxxx>