On Thu, 2020-02-20 at 14:44 +0800, Yan, Zheng wrote: > On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > The MDS is getting a new lock-caching facility that will allow it > > to cache the necessary locks to allow asynchronous directory operations. > > Since the CEPH_CAP_FILE_* caps are currently unused on directories, > > we can repurpose those bits for this purpose. > > > > When performing an unlink, if we have Fx on the parent directory, > > and CEPH_CAP_DIR_UNLINK (aka Fr), and we know that the dentry being > > removed is the primary link, then we can fire off an unlink > > request immediately and don't need to wait on reply before returning. > > > > In that situation, just fix up the dcache and link count and return > > immediately after issuing the call to the MDS. This does mean that we > > need to hold an extra reference to the inode being unlinked, and extra > > references to the caps to avoid races. Those references are put and > > error handling is done in the r_callback routine. > > > > If the operation ends up failing, then set a writeback error on the > > directory inode, and the inode itself that can be fetched later by > > an fsync on the dir. > > > > The behavior of dir caps is slightly different from caps on normal > > files. Because these are just considered an optimization, if the > > session is reconnected, we will not automatically reclaim them. They > > are instead considered lost until we do another synchronous op in the > > parent directory. > > > > Async dirops are enabled via the "nowsync" mount option, which is > > patterned after the xfs "wsync" mount option. For now, the default > > is "wsync", but eventually we may flip that. 
> > > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> > > --- > > fs/ceph/dir.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++-- > > fs/ceph/super.c | 20 ++++++++++ > > fs/ceph/super.h | 5 ++- > > 3 files changed, 123 insertions(+), 5 deletions(-) > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c > > index 5b83bda57056..37ab09d223fc 100644 > > --- a/fs/ceph/dir.c > > +++ b/fs/ceph/dir.c > > @@ -1036,6 +1036,73 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, > > return err; > > } > > > > +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, > > + struct ceph_mds_request *req) > > +{ > > + int result = req->r_err ? req->r_err : > > + le32_to_cpu(req->r_reply_info.head->result); > > + > > + /* If op failed, mark everyone involved for errors */ > > + if (result) { > > I think this function will get called for -EJUKEBOX case. > Good catch. I'll have another look at how to handle this better. > > > + int pathlen; > > + u64 base; > > + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, > > + &base, 0); > > + > > + /* mark error on parent + clear complete */ > > + mapping_set_error(req->r_parent->i_mapping, result); > > + ceph_dir_clear_complete(req->r_parent); > > + > > + /* drop the dentry -- we don't know its status */ > > + if (!d_unhashed(req->r_dentry)) > > + d_drop(req->r_dentry); > > + > > + /* mark inode itself for an error (since metadata is bogus) */ > > + mapping_set_error(req->r_old_inode->i_mapping, result); > > + > > + pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", > > + base, IS_ERR(path) ? 
"<<bad>>" : path, result); > > + ceph_mdsc_free_path(path, pathlen); > > + } > > + iput(req->r_old_inode); > > +} > > + > > +static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry) > > +{ > > + struct ceph_inode_info *ci = ceph_inode(dir); > > + struct ceph_dentry_info *di; > > + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK; > > + > > + spin_lock(&ci->i_ceph_lock); > > + if ((__ceph_caps_issued(ci, NULL) & want) == want) { > > + ceph_take_cap_refs(ci, want, false); > > + got = want; > > + } > > + spin_unlock(&ci->i_ceph_lock); > > + > > + /* If we didn't get anything, return 0 */ > > + if (!got) > > + return 0; > > + > > + spin_lock(&dentry->d_lock); > > + di = ceph_dentry(dentry); > > + /* > > + * - We are holding Fx, which implies Fs caps. > > + * - Only support async unlink for primary linkage > > + */ > > + if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen || > > + !(di->flags & CEPH_DENTRY_PRIMARY_LINK)) > > + want = 0; > > + spin_unlock(&dentry->d_lock); > > + > > + /* Do we still want what we've got? 
*/ > > + if (want == got) > > + return got; > > + > > + ceph_put_cap_refs(ci, got); > > + return 0; > > +} > > + > > /* > > * rmdir and unlink are differ only by the metadata op code > > */ > > @@ -1045,6 +1112,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) > > struct ceph_mds_client *mdsc = fsc->mdsc; > > struct inode *inode = d_inode(dentry); > > struct ceph_mds_request *req; > > + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); > > int err = -EROFS; > > int op; > > > > @@ -1059,6 +1127,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) > > CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; > > } else > > goto out; > > +retry: > > req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); > > if (IS_ERR(req)) { > > err = PTR_ERR(req); > > @@ -1067,13 +1136,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) > > req->r_dentry = dget(dentry); > > req->r_num_caps = 2; > > req->r_parent = dir; > > - set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); > > req->r_dentry_drop = CEPH_CAP_FILE_SHARED; > > req->r_dentry_unless = CEPH_CAP_FILE_EXCL; > > req->r_inode_drop = ceph_drop_caps_for_unlink(inode); > > - err = ceph_mdsc_do_request(mdsc, dir, req); > > - if (!err && !req->r_reply_info.head->is_dentry) > > - d_delete(dentry); > > + > > + if (try_async && op == CEPH_MDS_OP_UNLINK && > > + (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { > > + dout("async unlink on %lu/%.*s caps=%s", dir->i_ino, > > + dentry->d_name.len, dentry->d_name.name, > > + ceph_cap_string(req->r_dir_caps)); > > + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); > > + req->r_callback = ceph_async_unlink_cb; > > + req->r_old_inode = d_inode(dentry); > > + ihold(req->r_old_inode); > > + err = ceph_mdsc_submit_request(mdsc, dir, req); > > + if (!err) { > > + /* > > + * We have enough caps, so we assume that the unlink > > + * will succeed. Fix up the target inode and dcache. 
> > + */ > > + drop_nlink(inode); > > + d_delete(dentry); > > + } else if (err == -EJUKEBOX) { > > + try_async = false; > > + ceph_mdsc_put_request(req); > > + goto retry; > > + } > > + } else { > > + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); > > + err = ceph_mdsc_do_request(mdsc, dir, req); > > + if (!err && !req->r_reply_info.head->is_dentry) > > + d_delete(dentry); > > + } > > + > > ceph_mdsc_put_request(req); > > out: > > return err; > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > > index b1329cd5388a..c9784eb1159a 100644 > > --- a/fs/ceph/super.c > > +++ b/fs/ceph/super.c > > @@ -155,6 +155,7 @@ enum { > > Opt_acl, > > Opt_quotadf, > > Opt_copyfrom, > > + Opt_wsync, > > }; > > > > enum ceph_recover_session_mode { > > @@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { > > fsparam_string ("snapdirname", Opt_snapdirname), > > fsparam_string ("source", Opt_source), > > fsparam_u32 ("wsize", Opt_wsize), > > + fsparam_flag_no ("wsync", Opt_wsync), > > {} > > }; > > > > @@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, > > fc->sb_flags &= ~SB_POSIXACL; > > } > > break; > > + case Opt_wsync: > > + if (!result.negated) > > + fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS; > > + else > > + fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; > > + break; > > default: > > BUG(); > > } > > @@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) > > if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) > > seq_show_option(m, "recover_session", "clean"); > > > > + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) > > + seq_puts(m, ",nowsync"); > > + > > if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) > > seq_printf(m, ",wsize=%u", fsopt->wsize); > > if (fsopt->rsize != CEPH_MAX_READ_SIZE) > > @@ -1115,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc) > > > > static int ceph_reconfigure_fc(struct fs_context *fc) > > { > > + struct ceph_parse_opts_ctx *pctx = fc->fs_private; > 
> + struct ceph_mount_options *fsopt = pctx->opts; > > + struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); > > + > > + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) > > + ceph_set_mount_opt(fsc, ASYNC_DIROPS); > > + else > > + ceph_clear_mount_opt(fsc, ASYNC_DIROPS); > > + > > sync_filesystem(fc->root->d_sb); > > return 0; > > } > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > index 2393803c38de..1b4996efc111 100644 > > --- a/fs/ceph/super.h > > +++ b/fs/ceph/super.h > > @@ -43,13 +43,16 @@ > > #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ > > #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ > > #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ > > +#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ > > > > #define CEPH_MOUNT_OPT_DEFAULT \ > > (CEPH_MOUNT_OPT_DCACHE | \ > > CEPH_MOUNT_OPT_NOCOPYFROM) > > > > #define ceph_set_mount_opt(fsc, opt) \ > > - (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; > > + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt > > +#define ceph_clear_mount_opt(fsc, opt) \ > > + (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt > > #define ceph_test_mount_opt(fsc, opt) \ > > (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) > > > > -- > > 2.24.1 > > -- Jeff Layton <jlayton@xxxxxxxxxx>