On Thu, Feb 10, 2022 at 2:23 PM Xiubo Li <xiubli@xxxxxxxxxx> wrote: > > > On 2/10/22 11:21 AM, Milind Changire wrote: > > Problem: > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be > > propagated to the client as frequently to keep them updated. This > > creates vxattr availability problems. > > > > Solution: > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and > > ceph.file.layout* vxattrs. > > If the entire layout for a dir or a file is being set, then it is > > expected that the layout be set in standard JSON format. Individual > > field value retrieval is not wrapped in JSON. The JSON format also > > applies while setting the vxattr if the entire layout is being set in > > one go. > > As a temporary measure, setting a vxattr can also be done in the old > > format. The old format will be deprecated in the future. > > > > URL: https://tracker.ceph.com/issues/51062 > > Signed-off-by: Milind Changire <mchangir@xxxxxxxxxx> > > --- > > fs/ceph/inode.c | 104 +++++++++++++++++++++++++++++++++++ > > fs/ceph/mds_client.c | 32 +++++++++++ > > fs/ceph/mds_client.h | 24 +++++++- > > fs/ceph/strings.c | 1 + > > fs/ceph/super.h | 1 + > > fs/ceph/xattr.c | 15 ++++- > > include/linux/ceph/ceph_fs.h | 1 + > > 7 files changed, 175 insertions(+), 3 deletions(-) > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > > index e3322fcb2e8d..7d05863a43db 100644 > > --- a/fs/ceph/inode.c > > +++ b/fs/ceph/inode.c > > @@ -2291,6 +2291,110 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, > > return err; > > } > > > > +int ceph_vet_session(struct ceph_mds_client *mdsc, > > + struct ceph_mds_request *req, > > + bool *random) > > +{ > > + struct ceph_inode_info *ci = ceph_inode(req->r_inode); > > + struct ceph_mds_session *session; > > + int mds_with_getvxattr_support = 0; > > + int idx = MDS_RANK_UNAVAILABLE; > > + int rnd; > > + int i; > > + > > + if (req->r_op == CEPH_MDS_OP_GETVXATTR) { > > + session = ci->i_auth_cap->session; > > I think you missed to fix this, as Jeff mentioned the ci->i_auth_cap is > possibly be NULL. > > The code could be something like: > > session = ci->i_auth_cap ? ci->i_auth_cap->session : NULL; thanks for pointing this out will fix in v8 patch > > > + /* check if the auth mds supports the getvxattr feature */ > > + if (session && > > + test_bit(CEPHFS_FEATURE_GETVXATTR, &session->s_features)) { > > + for (i = 0; i < mdsc->max_sessions; i++) { > > + if (mdsc->sessions[i] && session == mdsc->sessions[i]) { > > + idx = i; > > + break; > > + } > > + } > > You could just return 'session->s_mds' here. ack > > > > + } else { > > + for (i = 0; i < mdsc->max_sessions; i++) { > > + if (mdsc->sessions[i] && > > + test_bit(CEPHFS_FEATURE_GETVXATTR, > > + &mdsc->sessions[i]->s_features)) > > + mds_with_getvxattr_support++; > > + } > > + if (!mds_with_getvxattr_support) > > + goto out; > > + > > + /* choose a random mds with getvxattr support */ > > + rnd = prandom_u32() % mds_with_getvxattr_support; > > + for (i = 0; i < mdsc->max_sessions; i++) { > > + if (mdsc->sessions[i] && > > + test_bit(CEPHFS_FEATURE_GETVXATTR, > > + &mdsc->sessions[i]->s_features)) { > > + if (rnd) > > + rnd--; > > + if (!rnd) { > > + idx = i; > > + break; > > + } > > + } > > + } > > + *random = true; > > The 'random' could be NULL here. ack > > > > + } > > + } > > +out: > > + return idx; > > +} > > + > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, > > + size_t size) > > +{ > > + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); > > + struct ceph_mds_client *mdsc = fsc->mdsc; > > + struct ceph_mds_request *req; > > + int mode = USE_AUTH_MDS; > > + int err; > > + char *xattr_value; > > + size_t xattr_value_len; > > + > > + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); > > + if (IS_ERR(req)) { > > + err = -ENOMEM; > > + goto out; > > + } > > + > > + req->r_path2 = kstrdup(name, GFP_NOFS); > > + if (!req->r_path2) { > > + err = -ENOMEM; > > + goto put; > > + } > > + > > + ihold(inode); > > + req->r_inode = inode; > > + err = ceph_mdsc_do_request(mdsc, NULL, req); > > + if (err < 0) > > + goto put; > > + > > + xattr_value = req->r_reply_info.xattr_info.xattr_value; > > + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; > > + > > + dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); > > + > > + err = (int)xattr_value_len; > > + if (size == 0) > > + goto put; > > + > > + if (xattr_value_len > size) { > > + err = -ERANGE; > > + goto put; > > + } > > + > > + memcpy(value, xattr_value, xattr_value_len); > > +put: > > + ceph_mdsc_put_request(req); > > +out: > > + dout("do_getvxattr result=%d\n", err); > > + return err; > > +} > > + > > > > /* > > * Check inode permissions. We verify we have a valid value for > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > index c30eefc0ac19..87df4ef6880b 100644 > > --- a/fs/ceph/mds_client.c > > +++ b/fs/ceph/mds_client.c > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end, > > return -EIO; > > } > > > > +static int parse_reply_info_getvxattr(void **p, void *end, > > + struct ceph_mds_reply_info_parsed *info, > > + u64 features) > > +{ > > + u8 struct_v, struct_compat; > > + u32 struct_len; > > + u32 value_len; > > + > > + ceph_decode_8_safe(p, end, struct_v, bad); > > + ceph_decode_8_safe(p, end, struct_compat, bad); > > + ceph_decode_32_safe(p, end, struct_len, bad); > > + ceph_decode_32_safe(p, end, value_len, bad); > > + > > + if (value_len == end - *p) { > > + info->xattr_info.xattr_value = *p; > > + info->xattr_info.xattr_value_len = value_len; > > + *p = end; > > + return value_len; > > + } > > +bad: > > + return -EIO; > > +} > > + > > /* > > * parse extra results > > */ > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end, > > return parse_reply_info_readdir(p, end, info, features); > > else if (op == CEPH_MDS_OP_CREATE) > > return parse_reply_info_create(p, end, info, features, s); > > + else if (op == CEPH_MDS_OP_GETVXATTR) > > + return parse_reply_info_getvxattr(p, end, info, features); > > else > > return -EIO; > > } > > @@ -1041,6 +1066,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > > if (mode == USE_RANDOM_MDS) > > goto random; > > > > + if (mode == USE_VETTED_MDS) > > Where sets this flag ? :P oops! I built the infrastructure, but forgot to tie it all together will fix in v8 patch > > > > > + return req->r_vet_session(mdsc, req, random); > > + > > inode = NULL; > > if (req->r_inode) { > > if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { > > @@ -2770,6 +2798,10 @@ static void __do_request(struct ceph_mds_client *mdsc, > > put_request_session(req); > > > > mds = __choose_mds(mdsc, req, &random); > > + if (mds == MDS_RANK_UNAVAILABLE) { > > + err = -EOPNOTSUPP; > > + goto finish; > > + } > > if (mds < 0 || > > ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { > > if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > > index 97c7f7bfa55f..94230fae9e71 100644 > > --- a/fs/ceph/mds_client.h > > +++ b/fs/ceph/mds_client.h > > @@ -29,8 +29,10 @@ enum ceph_feature_type { > > CEPHFS_FEATURE_MULTI_RECONNECT, > > CEPHFS_FEATURE_DELEG_INO, > > CEPHFS_FEATURE_METRIC_COLLECT, > > + CEPHFS_FEATURE_ALTERNATE_NAME, > > + CEPHFS_FEATURE_GETVXATTR, > > > > - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, > > + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR, > > }; > > > > /* > > @@ -45,6 +47,8 @@ enum ceph_feature_type { > > CEPHFS_FEATURE_MULTI_RECONNECT, \ > > CEPHFS_FEATURE_DELEG_INO, \ > > CEPHFS_FEATURE_METRIC_COLLECT, \ > > + CEPHFS_FEATURE_ALTERNATE_NAME, \ > > + CEPHFS_FEATURE_GETVXATTR, \ > > \ > > CEPHFS_FEATURE_MAX, \ > > } > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry { > > loff_t offset; > > }; > > > > +struct ceph_mds_reply_xattr { > > + char *xattr_value; > > + size_t xattr_value_len; > > +}; > > + > > /* > > * parsed info about an mds reply, including information about > > * either: 1) the target inode and/or its parent directory and dentry, > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed { > > char *dname; > > u32 dname_len; > > struct ceph_mds_reply_lease *dlease; > > + struct ceph_mds_reply_xattr xattr_info; > > > > /* extra */ > > union { > > @@ -222,6 +232,14 @@ enum { > > USE_ANY_MDS, > > USE_RANDOM_MDS, > > USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ > > + USE_VETTED_MDS /* eg. use an mds supporting particular feature */ > > +}; > > + > > +/* > > + * special mds rank > > + */ > > +enum { > > + MDS_RANK_UNAVAILABLE = (int)0x80000000 /* INT_MIN */ > > }; > > > > struct ceph_mds_request; > > @@ -337,6 +355,10 @@ struct ceph_mds_request { > > int r_readdir_cache_idx; > > > > struct ceph_cap_reservation r_caps_reservation; > > + /* return mds rank to send request to */ > > + int (*r_vet_session)(struct ceph_mds_client *mdsc, > > + struct ceph_mds_request *req, > > + bool *random); > > }; > > > > struct ceph_pool_perm { > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c > > index 573bb9556fb5..e36e8948e728 100644 > > --- a/fs/ceph/strings.c > > +++ b/fs/ceph/strings.c > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op) > > case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; > > case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; > > case CEPH_MDS_OP_GETATTR: return "getattr"; > > + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; > > case CEPH_MDS_OP_SETXATTR: return "setxattr"; > > case CEPH_MDS_OP_SETATTR: return "setattr"; > > case CEPH_MDS_OP_RMXATTR: return "rmxattr"; > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > index ac331aa07cfa..a627fa69668e 100644 > > --- a/fs/ceph/super.h > > +++ b/fs/ceph/super.h > > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode) > > > > /* xattr.c */ > > int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); > > ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); > > extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); > > extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c > > index fcf7dfdecf96..557749882aa2 100644 > > --- a/fs/ceph/xattr.c > > +++ b/fs/ceph/xattr.c > > @@ -923,9 +923,12 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, > > { > > struct ceph_inode_info *ci = ceph_inode(inode); > > struct ceph_inode_xattr *xattr; > > - struct ceph_vxattr *vxattr = NULL; > > + struct ceph_vxattr *vxattr; > > int req_mask; > > - ssize_t err; > > + ssize_t err = -ENODATA; > > + > > + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) > > + goto out_nounlock; > > > > /* let's see if a virtual xattr was requested */ > > vxattr = ceph_match_vxattr(inode, name); > > @@ -945,6 +948,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, > > err = -ERANGE; > > } > > return err; > > + } else { > > + err = -ENODATA; > > + spin_unlock(&ci->i_ceph_lock); > > + err = ceph_do_getvxattr(inode, name, value, size); > > + spin_lock(&ci->i_ceph_lock); > > + > > + goto out; > > } > > > > req_mask = __get_request_mask(inode); > > @@ -997,6 +1007,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, > > ci->i_ceph_flags |= CEPH_I_SEC_INITED; > > out: > > spin_unlock(&ci->i_ceph_lock); > > +out_nounlock: > > return err; > > } > > > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > > index 7ad6c3d0db7d..66db21ac5f0c 100644 > > --- a/include/linux/ceph/ceph_fs.h > > +++ b/include/linux/ceph/ceph_fs.h > > @@ -328,6 +328,7 @@ enum { > > CEPH_MDS_OP_LOOKUPPARENT = 0x00103, > > CEPH_MDS_OP_LOOKUPINO = 0x00104, > > CEPH_MDS_OP_LOOKUPNAME = 0x00105, > > + CEPH_MDS_OP_GETVXATTR = 0x00106, > > > > CEPH_MDS_OP_SETXATTR = 0x01105, > > CEPH_MDS_OP_RMXATTR = 0x01106, >