Re: [RFC PATCH 10/11] ceph: perform asynchronous unlink if we have sufficient caps

"Yan, Zheng" <ukernel@xxxxxxxxx> · Wed, 10 Apr 2019 21:06:17 +0800

On Wed, Apr 10, 2019 at 3:44 AM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>
> When performing an unlink, if we have Fx on the parent directory and
> Lx on the inode of the dentry being unlinked then we don't need to
> wait on the reply before returning.
>
> In that situation, just fix up the dcache and link count and return
> immediately after issuing the call to the MDS. This does mean that we
> need to hold an extra reference to the inode being unlinked, and extra
> references to the caps to avoid races. Put those references in the
> r_callback routine.
>
> If the operation ends up failing, then set a writeback error on the
> directory inode that can be fetched later by an fsync on the dir.
>
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
>  fs/ceph/caps.c  | 22 ++++++++++++++++++++++
>  fs/ceph/dir.c   | 38 +++++++++++++++++++++++++++++++++++---
>  fs/ceph/super.h |  1 +
>  3 files changed, 58 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index d022e15c8378..8fbb09c761a7 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2682,6 +2682,28 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
>         return ret;
>  }
>
> +bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry)
> +{
> +       int err, got;
> +       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
> +
> +       /* Ensure we have Lx on the inode being unlinked */
> +       err = try_get_cap_refs(ci, 0, CEPH_CAP_LINK_EXCL, 0, true, &got);
> +       dout("Lx on %p err=%d got=%d\n", dentry, err, got);
> +       if (err != 1 || !(got & CEPH_CAP_LINK_EXCL))
> +               return false;
> +
> +       /* Do we have Fx on the dir ? */
> +       err = try_get_cap_refs(ceph_inode(dir), 0, CEPH_CAP_FILE_EXCL, 0,
> +                               true, &got);
> +       dout("Fx on %p err=%d got=%d\n", dir, err, got);
> +       if (err != 1 || !(got & CEPH_CAP_FILE_EXCL)) {
> +               ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> +               return false;
> +       }
> +       return true;
> +}

holding caps for request may cause deadlock.  For example

- client hold Fx caps and send unlink request
- mds process request from other client, it change filelock's state to
EXCL_FOO and revoke Fx caps
- mds receives the unlink request, it can't process it because it
can't acquire wrlock on filelock

filelock state stays in EXCL_FOO because client does not release Fx caps.

I checked MDS code, supporting async operation is more complex than I
thought.  For the unlink case, mds needs to acquire locks other than
filelock and linklock.

- Request A from client1 has acquired lock X, waiting for wrlock of
filelock.
- Client2 sends async unlink  request 'B' to mds
- MDS processes request 'B'. It needs to lock X and filelock.  But it
can't get lock X because request A holds it. Request A can get
filelock because Client2 hold Fx caps.

I think a way to handle this is delegating locks hold by unsafe
request to client. For example:

- mds processes unlink request from client A,  sends unsafe reply and
delegates acquired locks to client
- client got the delegation, it can do async unlink on the same
directory as  long as it hold the delegation

Regards
Yan, Zheng

> +
>  /*
>   * Check the offset we are writing up to against our current
>   * max_size.  If necessary, tell the MDS we want to write to
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 3eb9bc226b77..386c9439a020 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -1029,6 +1029,18 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>         return err;
>  }
>
> +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> +                                struct ceph_mds_request *req)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(req->r_old_inode);
> +
> +       /* If op failed, set error on parent directory */
> +       mapping_set_error(req->r_parent->i_mapping, req->r_err);
> +       ceph_put_cap_refs(ci, CEPH_CAP_LINK_EXCL);
> +       ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_FILE_EXCL);
> +       iput(req->r_old_inode);
> +}
> +
>  /*
>   * rmdir and unlink are differ only by the metadata op code
>   */
> @@ -1065,9 +1077,29 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>         req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
>         req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
>         req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> -       err = ceph_mdsc_do_request(mdsc, dir, req);
> -       if (!err && !req->r_reply_info.head->is_dentry)
> -               d_delete(dentry);
> +
> +       if (op == CEPH_MDS_OP_UNLINK &&
> +           ceph_get_caps_for_unlink(dir, dentry)) {
> +               /* Keep LINK caps to ensure continuity over async call */
> +               req->r_inode_drop &= ~(CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL);
> +               req->r_callback = ceph_async_unlink_cb;
> +               req->r_old_inode = d_inode(dentry);
> +               ihold(req->r_old_inode);
> +               err = ceph_mdsc_submit_request(mdsc, dir, req);
> +               if (!err) {
> +                       /*
> +                        * We have enough caps, so we assume that the unlink
> +                        * will succeed. Fix up the target inode and dcache.
> +                        */
> +                       drop_nlink(inode);
> +                       d_delete(dentry);
> +               }
> +       } else {
> +               err = ceph_mdsc_do_request(mdsc, dir, req);
> +               if (!err && !req->r_reply_info.head->is_dentry)
> +                       d_delete(dentry);
> +       }
> +
>         ceph_mdsc_put_request(req);
>  out:
>         return err;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 3c5608f2108a..5c361dc1f47f 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1033,6 +1033,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
>                          loff_t endoff, int *got, struct page **pinned_page);
>  extern int ceph_try_get_caps(struct ceph_inode_info *ci,
>                              int need, int want, bool nonblock, int *got);
> +extern bool ceph_get_caps_for_unlink(struct inode *dir, struct dentry *dentry);
>
>  /* for counting open files by mode */
>  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> --
> 2.20.1
>