Re: [PATCH v4 1/1] ceph: add getvxattr op

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@xxxxxxxxxx> wrote:
>
>
> On 1/17/22 7:07 PM, Milind Changire wrote:
> > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> >> On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> >>> Problem:
> >>> Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> >>> propagated to the client as frequently to keep them updated. This
> >>> creates vxattr availability problems.
> >>>
> >>> Solution:
> >>> Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> >>> ceph.file.layout* vxattrs.
> >>> If the entire layout for a dir or a file is being set, then it is
> >>> expected that the layout be set in standard JSON format. Individual
> >>> field value retrieval is not wrapped in JSON. The JSON format also
> >>> applies while setting the vxattr if the entire layout is being set in
> >>> one go.
> >>> As a temporary measure, setting a vxattr can also be done in the old
> >>> format. The old format will be deprecated in the future.
> >>>
> >>> URL: https://tracker.ceph.com/issues/51062
> >>> Signed-off-by: Milind Changire <mchangir@xxxxxxxxxx>
> >>> ---
> >>>   fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> >>>   fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> >>>   fs/ceph/mds_client.h         | 12 ++++++++-
> >>>   fs/ceph/strings.c            |  1 +
> >>>   fs/ceph/super.h              |  1 +
> >>>   fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> >>>   include/linux/ceph/ceph_fs.h |  1 +
> >>>   7 files changed, 125 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> >>> index e3322fcb2e8d..efdce049b7f0 100644
> >>> --- a/fs/ceph/inode.c
> >>> +++ b/fs/ceph/inode.c
> >>> @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >>>        return err;
> >>>   }
> >>>
> >>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> >>> +                   size_t size)
> >>> +{
> >>> +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> >>> +     struct ceph_mds_client *mdsc = fsc->mdsc;
> >>> +     struct ceph_mds_request *req;
> >>> +     int mode = USE_AUTH_MDS;
> >>> +     int err;
> >>> +     char *xattr_value;
> >>> +     size_t xattr_value_len;
> >>> +
> >>> +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> >>> +     if (IS_ERR(req)) {
> >>> +             err = -ENOMEM;
> >>> +             goto out;
> >>> +     }
> >>> +
> >>> +     req->r_path2 = kstrdup(name, GFP_NOFS);
> >>> +     if (!req->r_path2) {
> >>> +             err = -ENOMEM;
> >>> +             goto put;
> >>> +     }
> >>> +
> >>> +     ihold(inode);
> >>> +     req->r_inode = inode;
> >>> +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> >>> +     if (err < 0)
> >>> +             goto put;
> >>> +
> >>> +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> >>> +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> >>> +
> >>> +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> >>> +
> >>> +     err = (int)xattr_value_len;
> >>> +     if (size == 0)
> >>> +             goto put;
> >>> +
> >>> +     if (xattr_value_len > size) {
> >>> +             err = -ERANGE;
> >>> +             goto put;
> >>> +     }
> >>> +
> >>> +     memcpy(value, xattr_value, xattr_value_len);
> >>> +put:
> >>> +     ceph_mdsc_put_request(req);
> >>> +out:
> >>> +     dout("do_getvxattr result=%d\n", err);
> >>> +     return err;
> >>> +}
> >>> +
> >>>
> >>>   /*
> >>>    * Check inode permissions.  We verify we have a valid value for
> >>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >>> index c30eefc0ac19..a5eafc71d976 100644
> >>> --- a/fs/ceph/mds_client.c
> >>> +++ b/fs/ceph/mds_client.c
> >>> @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> >>>        return -EIO;
> >>>   }
> >>>
> >>> +static int parse_reply_info_getvxattr(void **p, void *end,
> >>> +                                   struct ceph_mds_reply_info_parsed *info,
> >>> +                                   u64 features)
> >>> +{
> >>> +     u8 struct_v, struct_compat;
> >>> +     u32 struct_len;
> >>> +     u32 value_len;
> >>> +
> >>> +     ceph_decode_8_safe(p, end, struct_v, bad);
> >>> +     ceph_decode_8_safe(p, end, struct_compat, bad);
> >>> +     ceph_decode_32_safe(p, end, struct_len, bad);
> >>> +     ceph_decode_32_safe(p, end, value_len, bad);
> >>> +
> >>> +     if (value_len == end - *p) {
> >>> +       info->xattr_info.xattr_value = *p;
> >>> +       info->xattr_info.xattr_value_len = end - *p;
> >>> +       *p = end;
> >>> +       return info->xattr_info.xattr_value_len;
> >>> +     }
> >>> +bad:
> >>> +     return -EIO;
> >>> +}
> >>> +
> >>>   /*
> >>>    * parse extra results
> >>>    */
> >>> @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> >>>                return parse_reply_info_readdir(p, end, info, features);
> >>>        else if (op == CEPH_MDS_OP_CREATE)
> >>>                return parse_reply_info_create(p, end, info, features, s);
> >>> +     else if (op == CEPH_MDS_OP_GETVXATTR)
> >>> +             return parse_reply_info_getvxattr(p, end, info, features);
> >>>        else
> >>>                return -EIO;
> >>>   }
> >>> @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> >>>
> >>>        if (p != end)
> >>>                goto bad;
> >>> -     return 0;
> >>> +     return err;
> >>>
> >>>   bad:
> >>>        err = -EIO;
> >>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >>> index 97c7f7bfa55f..f2a8e5af3c2e 100644
> >>> --- a/fs/ceph/mds_client.h
> >>> +++ b/fs/ceph/mds_client.h
> >>> @@ -29,8 +29,10 @@ enum ceph_feature_type {
> >>>        CEPHFS_FEATURE_MULTI_RECONNECT,
> >>>        CEPHFS_FEATURE_DELEG_INO,
> >>>        CEPHFS_FEATURE_METRIC_COLLECT,
> >>> +     CEPHFS_FEATURE_ALTERNATE_NAME,
> >>> +     CEPHFS_FEATURE_GETVXATTR,
> >>>
> >>> -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> >>> +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> >>>   };
> >>>
> >>>   /*
> >>> @@ -45,6 +47,8 @@ enum ceph_feature_type {
> >>>        CEPHFS_FEATURE_MULTI_RECONNECT,         \
> >>>        CEPHFS_FEATURE_DELEG_INO,               \
> >>>        CEPHFS_FEATURE_METRIC_COLLECT,          \
> >>> +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> >>> +     CEPHFS_FEATURE_GETVXATTR,               \
> >>>                                                \
> >>>        CEPHFS_FEATURE_MAX,                     \
> >>>   }
> >>> @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> >>>        loff_t                        offset;
> >>>   };
> >>>
> >>> +struct ceph_mds_reply_xattr {
> >>> +     char *xattr_value;
> >>> +     size_t xattr_value_len;
> >>> +};
> >>> +
> >>>   /*
> >>>    * parsed info about an mds reply, including information about
> >>>    * either: 1) the target inode and/or its parent directory and dentry,
> >>> @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> >>>        char                          *dname;
> >>>        u32                           dname_len;
> >>>        struct ceph_mds_reply_lease   *dlease;
> >>> +     struct ceph_mds_reply_xattr   xattr_info;
> >>>
> >>>        /* extra */
> >>>        union {
> >>> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> >>> index 573bb9556fb5..e36e8948e728 100644
> >>> --- a/fs/ceph/strings.c
> >>> +++ b/fs/ceph/strings.c
> >>> @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> >>>        case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> >>>        case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> >>>        case CEPH_MDS_OP_GETATTR:  return "getattr";
> >>> +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> >>>        case CEPH_MDS_OP_SETXATTR: return "setxattr";
> >>>        case CEPH_MDS_OP_SETATTR: return "setattr";
> >>>        case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> >>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> >>> index ac331aa07cfa..a627fa69668e 100644
> >>> --- a/fs/ceph/super.h
> >>> +++ b/fs/ceph/super.h
> >>> @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> >>>
> >>>   /* xattr.c */
> >>>   int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> >>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> >>>   ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> >>>   extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> >>>   extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> >>> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> >>> index fcf7dfdecf96..dc32876a541a 100644
> >>> --- a/fs/ceph/xattr.c
> >>> +++ b/fs/ceph/xattr.c
> >>> @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> >>>        return mask;
> >>>   }
> >>>
> >>> +/* check if the entire cluster supports the given feature */
> >>> +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> >>> +{
> >>> +     int64_t i;
> >>> +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> >>> +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> >>> +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> >>> +
> >>> +     if (fsc->mdsc->stopping)
> >>> +             return false;
> >>> +
> >>> +     if (!sessions)
> >>> +             return false;
> >>> +
> >>> +     for (i = 0; i < num_sessions; i++) {
> >>> +             struct ceph_mds_session *session = sessions[i];
> >>> +             if (!session)
> >>> +                     return false;
> >>> +             if (!test_bit(feature_bit, &session->s_features))
> >>> +                     return false;
> >> What guarantee do you have that "session" will still be a valid pointer
> >> by the time you get to dereferencing it here?
> >>
> >> I think this loop needs some locking (as Xiubo pointed out in his
> >> earlier review).
> > yeah, thanks for pointing that out
> > I'm trying to wrap the entire processing of this function inside a
> > mutex_lock(&mdsc->mutex) ... mutex_unlock(&mdsc->mutex) pair, but the mount command fails
> > to mount if done so. If code is not wrapped in mutex lock...unlock
> > then the mount is successful.
> > It's a surprise that the code doesn't deadlock under the mutex
> > lock...unlock and gracefully fails with a message.
> > Any hints on what I could be missing.
> >
> >>> +     }
> >>> +     return true;
> >>> +}
> >>> +
> >>>   ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> >>>                      size_t size)
> >>>   {
> >>> @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> >>>        int req_mask;
> >>>        ssize_t err;
> >>>
> >>> +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> >>> +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> >>> +             err = ceph_do_getvxattr(inode, name, value, size);
> >>> +             /* if cluster doesn't support xattr, we try to service it
> >>> +              * locally
> >>> +              */
> >>> +             if (err >= 0)
> >>> +                     return err;
> >>> +     }
> >>> +
> >> What is this? Why not always service this locally?
> > vxattr handling is planned to be moved to the MDS side.
> > As I've pointed out to Xiubo, there's a few new things that have been done for
> > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > can't be handled locally.
> > getvxattr() currently handles:
> > 1. ceph.dir.layout*
> > 2. ceph.file.layout*
> > 3. ceph.dir.pin*
>
> The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> and 'ceph_dir_vxattrs'...
>
> And the code above will always force kclient to get all the 'ceph.XXX'
> xattrs from MDS ?

yes, the kclient will always get vxattr values from the MDS;
for an old cluster, the op will be handled locally

Jeff has a proposal to expose the JSON output variety of layout
vxattr value via a new vxattr altogether
eg. ceph.dir.layout_json and ceph.file.layout_json

>
>
>
> > If kclient is new and the cluster is old, then layout vxattr will be
> > handled the old
> > way, i.e. locally. ceph.dir.pin* will remain inaccessible.
> >
> >>>        /* let's see if a virtual xattr was requested */
> >>>        vxattr = ceph_match_vxattr(inode, name);
> >>>        if (vxattr) {
> >>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >>> index 7ad6c3d0db7d..66db21ac5f0c 100644
> >>> --- a/include/linux/ceph/ceph_fs.h
> >>> +++ b/include/linux/ceph/ceph_fs.h
> >>> @@ -328,6 +328,7 @@ enum {
> >>>        CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
> >>>        CEPH_MDS_OP_LOOKUPINO  = 0x00104,
> >>>        CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> >>> +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
> >>>
> >>>        CEPH_MDS_OP_SETXATTR   = 0x01105,
> >>>        CEPH_MDS_OP_RMXATTR    = 0x01106,
> >> --
> >> Jeff Layton <jlayton@xxxxxxxxxx>
>



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux