On Mon, 2020-01-13 at 22:48 +0800, Yan, Zheng wrote: > On 1/13/20 9:44 PM, Jeff Layton wrote: > > On Mon, 2020-01-13 at 18:53 +0800, Yan, Zheng wrote: > > > On 1/11/20 4:56 AM, Jeff Layton wrote: > > > > With the Octopus release, the MDS will hand out directory create caps. > > > > If we have Fxc caps on the directory, and complete directory information > > > > or a known negative dentry, then we can return without waiting on the > > > > reply, allowing the open() call to return very quickly to userland. > > > > > > > > We use the normal ceph_fill_inode() routine to fill in the inode, so we > > > > have to gin up some reply inode information with what we'd expect a > > > > newly-created inode to have. The client assumes that it has a full set > > > > of caps on the new inode, and that the MDS will revoke them when there > > > > is conflicting access. > > > > > > > > This functionality is gated on the enable_async_dirops module option, > > > > along with async unlinks, and on the server supporting the Octopus > > > > CephFS feature bit. > > > > > > > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> > > > > --- > > > > fs/ceph/caps.c | 7 +- > > > > fs/ceph/file.c | 178 +++++++++++++++++++++++++++++++++-- > > > > fs/ceph/mds_client.c | 12 ++- > > > > fs/ceph/mds_client.h | 3 +- > > > > fs/ceph/super.h | 2 + > > > > include/linux/ceph/ceph_fs.h | 8 +- > > > > 6 files changed, 191 insertions(+), 19 deletions(-) > > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > > index b96fb1378479..21a8a2ddc94b 100644 > > > > --- a/fs/ceph/caps.c > > > > +++ b/fs/ceph/caps.c > > > > @@ -654,6 +654,10 @@ void ceph_add_cap(struct inode *inode, > > > > session->s_nr_caps++; > > > > spin_unlock(&session->s_cap_lock); > > > > } else { > > > > + /* Did an async create race with the reply? 
*/ > > > > + if (cap_id == CEPH_CAP_ID_TBD && cap->issued == issued) > > > > + return; > > > > + > > > > spin_lock(&session->s_cap_lock); > > > > list_move_tail(&cap->session_caps, &session->s_caps); > > > > spin_unlock(&session->s_cap_lock); > > > > @@ -672,7 +676,8 @@ void ceph_add_cap(struct inode *inode, > > > > */ > > > > if (ceph_seq_cmp(seq, cap->seq) <= 0) { > > > > WARN_ON(cap != ci->i_auth_cap); > > > > - WARN_ON(cap->cap_id != cap_id); > > > > + WARN_ON(cap_id != CEPH_CAP_ID_TBD && > > > > + cap->cap_id != cap_id); > > > > seq = cap->seq; > > > > mseq = cap->mseq; > > > > issued |= cap->issued; > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > > > index d4d7a277faf1..706abd71b731 100644 > > > > --- a/fs/ceph/file.c > > > > +++ b/fs/ceph/file.c > > > > @@ -450,6 +450,141 @@ copy_file_layout(struct inode *dst, struct inode *src) > > > > spin_unlock(&cdst->i_ceph_lock); > > > > } > > > > > > > > +static bool get_caps_for_async_create(struct inode *dir, struct dentry *dentry) > > > > +{ > > > > + struct ceph_inode_info *ci = ceph_inode(dir); > > > > + int ret, want, got; > > > > + > > > > + /* > > > > + * We can do an async create if we either have a valid negative dentry > > > > + * or the complete contents of the directory. Do a quick check without > > > > + * cap refs. > > > > + */ > > > > + if ((d_in_lookup(dentry) && !__ceph_dir_is_complete(ci)) || > > > > > > what does (d_in_lookup(dentry) && !__ceph_dir_is_complete(ci)) mean? > > > > > > > If d_in_lookup returns false, then we have a dentry that is known to be > > negative (IOW, it passed d_revalidate). If d_in_lookup returns true, > > then this is the initial lookup for the dentry. > > > > So if that returns true and the directory isn't complete then we can't > > do an async create since we don't have enough information about the > > namespace. > > > > That probably deserves a better comment. I'll try to make that clear. > > if directory is not complete and d_in_lookup() return false. 
we know the > dentry is negative in dcache, but it does not guarantee that > corresponding dentry in mds is negative. Between d_revalidate and this > function, mds may have revoked dentry's lease and issued Fsx caps. In that case, nothing prevents that from occurring while we are in this function. I guess we then need to test for an actual dentry lease here, after ensuring that we have the correct caps? > > > > > I think we can async create if dentry is negative and its > > > lease_shared_gen == ci->i_shared_gen. > > > > > > > + !ceph_file_layout_is_valid(&ci->i_layout)) > > > > + return false; > > > > + > > > > + /* Try to get caps */ > > > > + want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; > > > > + ret = ceph_try_get_caps(dir, 0, want, true, &got); > > > > + dout("Fx on %p ret=%d got=%d\n", dir, ret, got); > > > > + if (ret != 1) > > > > + return false; > > > > + if (got != want) { > > > > + ceph_put_cap_refs(ci, got); > > > > + return false; > > > > + } > > > > + > > > > + /* Check again, now that we hold cap refs */ > > > > + if ((d_in_lookup(dentry) && !__ceph_dir_is_complete(ci)) || > > > > + !ceph_file_layout_is_valid(&ci->i_layout)) { > > > > + ceph_put_cap_refs(ci, got); > > > > + return false; > > > > + } > > > > + > > > > + return true; > > > > +} > > > > + > > > > +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, > > > > + struct ceph_mds_request *req) > > > > +{ > > > > + /* If we never sent anything then nothing to clean up */ > > > > + if (req->r_err == -ECHILD) > > > > + goto out; > > > > + > > > > + mapping_set_error(req->r_parent->i_mapping, req->r_err); > > > > + > > > > + if (req->r_target_inode) { > > > > + u64 ino = ceph_vino(req->r_target_inode).ino; > > > > + > > > > + if (req->r_deleg_ino != ino) > > > > + pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%lx target=0x%llx\n", > > > > + __func__, req->r_err, req->r_deleg_ino, ino); > > > > + mapping_set_error(req->r_target_inode->i_mapping, req->r_err); > > > > + } else { > > > > + pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__, > > > > + req->r_deleg_ino); > > > > + } > > > > +out: > > > > + ceph_put_cap_refs(ceph_inode(req->r_parent), > > > > + CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE); > > > > +} > > > > + > > > > +static int ceph_finish_async_open(struct inode *dir, struct dentry *dentry, > > > > + struct file *file, umode_t mode, > > > > + struct ceph_mds_request *req, > > > > + struct ceph_acl_sec_ctx *as_ctx) > > > > +{ > > > > + int ret; > > > > + struct ceph_mds_reply_inode in = { }; > > > > + struct ceph_mds_reply_info_in iinfo = { .in = &in }; > > > > + struct ceph_inode_info *ci = ceph_inode(dir); > > > > + struct inode *inode; > > > > + struct timespec64 now; > > > > + struct ceph_vino vino = { .ino = req->r_deleg_ino, > > > > + .snap = CEPH_NOSNAP }; > > > > + > > > > + ktime_get_real_ts64(&now); > > > > + > > > > + inode = ceph_get_inode(dentry->d_sb, vino); > > > > + if (IS_ERR(inode)) > > > > + return PTR_ERR(inode); > > > > + > > > > + /* If we can't get a buffer, just carry on */ > > > > + iinfo.xattr_data = kzalloc(4, GFP_NOFS); > > > > + if (iinfo.xattr_data) > > > > + iinfo.xattr_len = 4; > > > > > > ?? > > > > > > I think we should decode req->r_pagelist into xattrs > > > > > > > > > > I'm not sure I follow what you're suggesting here. At this point, > > r_pagelist may be set to as_ctx.pagelist. It would be nice to avoid an > > allocation here though. I guess I could pass in an on-stack buffer. It > > is only 4 bytes after all. 
> > > > > > + > > > > + iinfo.inline_version = CEPH_INLINE_NONE; > > > > + iinfo.change_attr = 1; > > > > + ceph_encode_timespec64(&iinfo.btime, &now); > > > > + > > > > + in.ino = cpu_to_le64(vino.ino); > > > > + in.snapid = cpu_to_le64(CEPH_NOSNAP); > > > > + in.version = cpu_to_le64(1); // ??? > > > > + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); > > > > + in.cap.cap_id = cpu_to_le64(CEPH_CAP_ID_TBD); > > > > + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); > > > > + in.cap.flags = CEPH_CAP_FLAG_AUTH; > > > > + in.ctime = in.mtime = in.atime = iinfo.btime; > > > > + in.mode = cpu_to_le32((u32)mode); > > > > + in.truncate_seq = cpu_to_le32(1); > > > > + in.truncate_size = cpu_to_le64(ci->i_truncate_size); > > > > + in.max_size = cpu_to_le64(ci->i_max_size); > > > > + in.xattr_version = cpu_to_le64(1); > > > > + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); > > > > + in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); > > > > > > if dir has S_ISGID, new file's gid should be inherit from dir's gid > > > > > > > > > > Good catch. I'll fix that up for the next iteration. 
> > > > > > + in.nlink = cpu_to_le32(1); > > > > + > > > > + ceph_file_layout_to_legacy(&ci->i_layout, &in.layout); > > > > + > > > > + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, > > > > + req->r_fmode, NULL); > > > > + if (ret) { > > > > + dout("%s failed to fill inode: %d\n", __func__, ret); > > > > + if (inode->i_state & I_NEW) > > > > + discard_new_inode(inode); > > > > + } else { > > > > + struct dentry *dn; > > > > + > > > > + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, > > > > + vino.ino, dir->i_ino, dentry->d_name.name); > > > > + ceph_dir_clear_ordered(dir); > > > > + ceph_init_inode_acls(inode, as_ctx); > > > > + if (inode->i_state & I_NEW) > > > > + unlock_new_inode(inode); > > > > + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { > > > > + if (!d_unhashed(dentry)) > > > > + d_drop(dentry); > > > > + dn = d_splice_alias(inode, dentry); > > > > + WARN_ON_ONCE(dn && dn != dentry); > > > > + } > > > > + file->f_mode |= FMODE_CREATED; > > > > + ret = finish_open(file, dentry, ceph_open); > > > > + } > > > > + return ret; > > > > +} > > > > + > > > > /* > > > > * Do a lookup + open with a single request. If we get a non-existent > > > > * file or symlink, return 1 so the VFS can retry. 
> > > > @@ -462,6 +597,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > struct ceph_mds_request *req; > > > > struct dentry *dn; > > > > struct ceph_acl_sec_ctx as_ctx = {}; > > > > + bool try_async = enable_async_dirops; > > > > int mask; > > > > int err; > > > > > > > > @@ -486,6 +622,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > return -ENOENT; > > > > } > > > > > > > > +retry: > > > > /* do the open */ > > > > req = prepare_open_request(dir->i_sb, flags, mode); > > > > if (IS_ERR(req)) { > > > > @@ -494,6 +631,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > } > > > > req->r_dentry = dget(dentry); > > > > req->r_num_caps = 2; > > > > + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > > > > + if (ceph_security_xattr_wanted(dir)) > > > > + mask |= CEPH_CAP_XATTR_SHARED; > > > > + req->r_args.open.mask = cpu_to_le32(mask); > > > > + req->r_parent = dir; > > > > + > > > > if (flags & O_CREAT) { > > > > req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; > > > > req->r_dentry_unless = CEPH_CAP_FILE_EXCL; > > > > @@ -501,21 +644,37 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > req->r_pagelist = as_ctx.pagelist; > > > > as_ctx.pagelist = NULL; > > > > } > > > > + if (try_async && get_caps_for_async_create(dir, dentry)) { > > > > + set_bit(CEPH_MDS_R_DELEG_INO, &req->r_req_flags); > > > > + req->r_callback = ceph_async_create_cb; > > > > + err = ceph_mdsc_submit_request(mdsc, dir, req); > > > > + switch (err) { > > > > + case 0: > > > > + /* set up inode, dentry and return */ > > > > + err = ceph_finish_async_open(dir, dentry, file, > > > > + mode, req, &as_ctx); > > > > + goto out_req; > > > > + case -ECHILD: > > > > + /* do a sync create */ > > > > + try_async = false; > > > > + as_ctx.pagelist = req->r_pagelist; > > > > + req->r_pagelist = NULL; > > > > + ceph_mdsc_put_request(req); > > > > + goto retry; > > > > + default: > > > > + /* 
Hard error, give up */ > > > > + goto out_req; > > > > + } > > > > + } > > > > } > > > > > > > > - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > > > > - if (ceph_security_xattr_wanted(dir)) > > > > - mask |= CEPH_CAP_XATTR_SHARED; > > > > - req->r_args.open.mask = cpu_to_le32(mask); > > > > - > > > > - req->r_parent = dir; > > > > set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); > > > > err = ceph_mdsc_do_request(mdsc, > > > > (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, > > > > req); > > > > err = ceph_handle_snapdir(req, dentry, err); > > > > if (err) > > > > - goto out_req; > > > > + goto out_fmode; > > > > > > > > if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) > > > > err = ceph_handle_notrace_create(dir, dentry); > > > > @@ -529,7 +688,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > dn = NULL; > > > > } > > > > if (err) > > > > - goto out_req; > > > > + goto out_fmode; > > > > if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { > > > > /* make vfs retry on splice, ENOENT, or symlink */ > > > > dout("atomic_open finish_no_open on dn %p\n", dn); > > > > @@ -545,9 +704,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > > } > > > > err = finish_open(file, dentry, ceph_open); > > > > } > > > > -out_req: > > > > +out_fmode: > > > > if (!req->r_err && req->r_target_inode) > > > > ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); > > > > +out_req: > > > > ceph_mdsc_put_request(req); > > > > out_ctx: > > > > ceph_release_acl_sec_ctx(&as_ctx); > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > > index 9e7492b21b50..c76d6e7f8136 100644 > > > > --- a/fs/ceph/mds_client.c > > > > +++ b/fs/ceph/mds_client.c > > > > @@ -2620,14 +2620,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, > > > > flags |= CEPH_MDS_FLAG_REPLAY; > > > > if (req->r_parent) > > > > flags |= CEPH_MDS_FLAG_WANT_DENTRY; > > > > - rhead->flags = cpu_to_le32(flags); 
> > > > - rhead->num_fwd = req->r_num_fwd; > > > > - rhead->num_retry = req->r_attempts - 1; > > > > - if (test_bit(CEPH_MDS_R_DELEG_INO, &req->r_req_flags)) > > > > + if (test_bit(CEPH_MDS_R_DELEG_INO, &req->r_req_flags)) { > > > > rhead->ino = cpu_to_le64(req->r_deleg_ino); > > > > - else > > > > + flags |= CEPH_MDS_FLAG_ASYNC; > > > > + } else { > > > > rhead->ino = 0; > > > > + } > > > > > > > > + rhead->flags = cpu_to_le32(flags); > > > > + rhead->num_fwd = req->r_num_fwd; > > > > + rhead->num_retry = req->r_attempts - 1; > > > > dout(" r_parent = %p\n", req->r_parent); > > > > return 0; > > > > } > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > > > > index e0b36be7c44f..49e6cd5a07a2 100644 > > > > --- a/fs/ceph/mds_client.h > > > > +++ b/fs/ceph/mds_client.h > > > > @@ -39,8 +39,7 @@ enum ceph_feature_type { > > > > CEPHFS_FEATURE_REPLY_ENCODING, \ > > > > CEPHFS_FEATURE_LAZY_CAP_WANTED, \ > > > > CEPHFS_FEATURE_MULTI_RECONNECT, \ > > > > - \ > > > > - CEPHFS_FEATURE_MAX, \ > > > > + CEPHFS_FEATURE_OCTOPUS, \ > > > > } > > > > #define CEPHFS_FEATURES_CLIENT_REQUIRED {} > > > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > > > index ec4d66d7c261..33e03fbba888 100644 > > > > --- a/fs/ceph/super.h > > > > +++ b/fs/ceph/super.h > > > > @@ -136,6 +136,8 @@ struct ceph_fs_client { > > > > #endif > > > > }; > > > > > > > > +/* Special placeholder value for a cap_id during an asynchronous create. */ > > > > +#define CEPH_CAP_ID_TBD -1ULL > > > > > > > > /* > > > > * File i/o capability. 
This tracks shared state with the metadata > > > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > > > > index a099f60feb7b..b127563e21a1 100644 > > > > --- a/include/linux/ceph/ceph_fs.h > > > > +++ b/include/linux/ceph/ceph_fs.h > > > > @@ -444,8 +444,9 @@ union ceph_mds_request_args { > > > > } __attribute__ ((packed)) lookupino; > > > > } __attribute__ ((packed)); > > > > > > > > -#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ > > > > -#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ > > > > +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ > > > > +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ > > > > +#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ > > > > > > > > struct ceph_mds_request_head { > > > > __le64 oldest_client_tid; > > > > @@ -658,6 +659,9 @@ int ceph_flags_to_mode(int flags); > > > > #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ > > > > CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ > > > > CEPH_CAP_PIN) > > > > +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ > > > > + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ > > > > + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) > > > > > > > > #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ > > > > CEPH_LOCK_IXATTR) > > > > -- Jeff Layton <jlayton@xxxxxxxxxx>