Re: [PATCH] ceph: make num_fwd and num_retry to __u32

Aleksandr Mikhalitsyn <aleksandr.mikhalitsyn@xxxxxxxxxxxxx> · Wed, 26 Jul 2023 12:06:12 +0200

Hi Xiubo!

On Tue, Jul 25, 2023 at 12:46 PM <xiubli@xxxxxxxxxx> wrote:
>
> From: Xiubo Li <xiubli@xxxxxxxxxx>
>
> The num_fwd in MClientRequestForward is int32_t, while the num_fwd
> in ceph_mds_request_head is __u8. This is buggy when the num_fwd
> is larger than 256 it will always be truncate to 0 again. But the
> client couldn't recoginize this.
>
> This will make them to __u32 instead. Because the old cephs will
> directly copy the raw memories when decoding the reqeust's head,
> so we need to make sure this kclient will be compatible with old
> cephs. For newer cephs they will decode the requests depending
> the version, which will be much simpler and easier to extend new
> members.
>
> URL: https://tracker.ceph.com/issues/62145
> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> ---
>  fs/ceph/mds_client.c         | 191 ++++++++++++++++++-----------------
>  fs/ceph/mds_client.h         |   4 +-
>  include/linux/ceph/ceph_fs.h |  23 ++++-
>  3 files changed, 124 insertions(+), 94 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 70987b3c198a..191bae3a4ee6 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
>         }
>  }
>
> +static struct ceph_mds_request_head_legacy *
> +find_legacy_request_head(void *p, u64 features)
> +{
> +       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> +       struct ceph_mds_request_head_old *ohead;
> +
> +       if (legacy)
> +               return (struct ceph_mds_request_head_legacy *)p;
> +       ohead = (struct ceph_mds_request_head_old *)p;
> +       return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
> +}
> +
>  /*
>   * called under mdsc->mutex
>   */
> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>         int mds = session->s_mds;
>         struct ceph_mds_client *mdsc = session->s_mdsc;
>         struct ceph_msg *msg;
> -       struct ceph_mds_request_head_old *head;
> +       struct ceph_mds_request_head_legacy *lhead;
>         const char *path1 = NULL;
>         const char *path2 = NULL;
>         u64 ino1 = 0, ino2 = 0;
> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>         void *p, *end;
>         int ret;
>         bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>
>         ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
>                               req->r_parent, req->r_path1, req->r_ino1.ino,
> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>                 goto out_free2;
>         }
>
> -       len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
> +       /*
> +        * For old cephs without supporting the 32bit retry/fwd feature
> +        * it will copy the raw memories directly when decoding the
> +        * requests. While new cephs will decode the head depending the
> +        * version member, so we need to make sure it will be compatible
> +        * with them both.
> +        */
> +       if (legacy)
> +               len = sizeof(struct ceph_mds_request_head_legacy);
> +       else if (old_version)
> +               len = sizeof(struct ceph_mds_request_head_old);
> +       else
> +               len = sizeof(struct ceph_mds_request_head);
>
>         /* filepaths */
>         len += 2 * (1 + sizeof(u32) + sizeof(u64));
> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>
>         msg->hdr.tid = cpu_to_le64(req->r_tid);
>
> +       lhead = find_legacy_request_head(msg->front.iov_base,
> +                                        session->s_con.peer_features);
> +
>         /*
> -        * The old ceph_mds_request_head didn't contain a version field, and
> +        * The ceph_mds_request_head_legacy didn't contain a version field, and
>          * one was added when we moved the message version from 3->4.
>          */
>         if (legacy) {
>                 msg->hdr.version = cpu_to_le16(3);
> -               head = msg->front.iov_base;
> -               p = msg->front.iov_base + sizeof(*head);
> +               p = msg->front.iov_base + sizeof(*lhead);
> +       } else if (old_version) {
> +               struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> +
> +               msg->hdr.version = cpu_to_le16(4);
> +               ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);

Don't we want to use the old mds request head version here:
ohead->version = cpu_to_le16(1);
?

As far as I understand the idea here is to skip new fields
ext_num_retry/ext_num_fwd in case
when the old_version is true. Would it be incorrect to set version to
the latest one (CEPH_MDS_REQUEST_HEAD_VERSION)
and at the same time skip the setting of new fields?

Kind regards,
Alex

> +               p = msg->front.iov_base + sizeof(*ohead);
>         } else {
> -               struct ceph_mds_request_head *new_head = msg->front.iov_base;
> +               struct ceph_mds_request_head *nhead = msg->front.iov_base;
>
>                 msg->hdr.version = cpu_to_le16(6);
> -               new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> -               head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> -               p = msg->front.iov_base + sizeof(*new_head);
> +               nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> +               p = msg->front.iov_base + sizeof(*nhead);
>         }
>
>         end = msg->front.iov_base + msg->front.iov_len;
>
> -       head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> -       head->op = cpu_to_le32(req->r_op);
> -       head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> -                                                req->r_cred->fsuid));
> -       head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> -                                                req->r_cred->fsgid));
> -       head->ino = cpu_to_le64(req->r_deleg_ino);
> -       head->args = req->r_args;
> +       lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> +       lhead->op = cpu_to_le32(req->r_op);
> +       lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> +                                                 req->r_cred->fsuid));
> +       lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> +                                                 req->r_cred->fsgid));
> +       lhead->ino = cpu_to_le64(req->r_deleg_ino);
> +       lhead->args = req->r_args;
>
>         ceph_encode_filepath(&p, end, ino1, path1);
>         ceph_encode_filepath(&p, end, ino2, path2);
> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>                 p = msg->front.iov_base + req->r_request_release_offset;
>         }
>
> -       head->num_releases = cpu_to_le16(releases);
> +       lhead->num_releases = cpu_to_le16(releases);
>
>         encode_mclientrequest_tail(&p, req);
>
> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
>         complete_all(&req->r_completion);
>  }
>
> -static struct ceph_mds_request_head_old *
> -find_old_request_head(void *p, u64 features)
> -{
> -       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> -       struct ceph_mds_request_head *new_head;
> -
> -       if (legacy)
> -               return (struct ceph_mds_request_head_old *)p;
> -       new_head = (struct ceph_mds_request_head *)p;
> -       return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> -}
> -
>  /*
>   * called under mdsc->mutex
>   */
> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>         int mds = session->s_mds;
>         struct ceph_mds_client *mdsc = session->s_mdsc;
>         struct ceph_client *cl = mdsc->fsc->client;
> -       struct ceph_mds_request_head_old *rhead;
> +       struct ceph_mds_request_head_legacy *lhead;
> +       struct ceph_mds_request_head *nhead;
>         struct ceph_msg *msg;
> -       int flags = 0, max_retry;
> +       int flags = 0, old_max_retry;
> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>
> -       /*
> -        * The type of 'r_attempts' in kernel 'ceph_mds_request'
> -        * is 'int', while in 'ceph_mds_request_head' the type of
> -        * 'num_retry' is '__u8'. So in case the request retries
> -        *  exceeding 256 times, the MDS will receive a incorrect
> -        *  retry seq.
> -        *
> -        * In this case it's ususally a bug in MDS and continue
> -        * retrying the request makes no sense.
> -        *
> -        * In future this could be fixed in ceph code, so avoid
> -        * using the hardcode here.
> +       /* Avoid inifinite retrying after overflow. The client will
> +        * increase the retry count and if the MDS is old version,
> +        * so we limit to retry at most 256 times.
>          */
> -       max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
> -       max_retry = 1 << (max_retry * BITS_PER_BYTE);
> -       if (req->r_attempts >= max_retry) {
> -               pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
> -                                          req->r_tid);
> -               return -EMULTIHOP;
> -       }
> +       if (req->r_attempts) {
> +               old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
> +               old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
> +               if ((old_version && req->r_attempts >= old_max_retry) ||
> +                   ((uint32_t)req->r_attempts >= U32_MAX)) {
> +                       pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
> +                                                 __func__, req->r_tid);
> +                       return -EMULTIHOP;
> +               }
> +        }
>
>         req->r_attempts++;
>         if (req->r_inode) {
> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>                  * d_move mangles the src name.
>                  */
>                 msg = req->r_request;
> -               rhead = find_old_request_head(msg->front.iov_base,
> -                                             session->s_con.peer_features);
> +               lhead = find_legacy_request_head(msg->front.iov_base,
> +                                                session->s_con.peer_features);
>
> -               flags = le32_to_cpu(rhead->flags);
> +               flags = le32_to_cpu(lhead->flags);
>                 flags |= CEPH_MDS_FLAG_REPLAY;
> -               rhead->flags = cpu_to_le32(flags);
> +               lhead->flags = cpu_to_le32(flags);
>
>                 if (req->r_target_inode)
> -                       rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
> +                       lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
>
> -               rhead->num_retry = req->r_attempts - 1;
> +               lhead->num_retry = req->r_attempts - 1;
> +               if (!old_version) {
> +                       nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> +                       nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> +               }
>
>                 /* remove cap/dentry releases from message */
> -               rhead->num_releases = 0;
> +               lhead->num_releases = 0;
>
>                 p = msg->front.iov_base + req->r_request_release_offset;
>                 encode_mclientrequest_tail(&p, req);
> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>         }
>         req->r_request = msg;
>
> -       rhead = find_old_request_head(msg->front.iov_base,
> -                                     session->s_con.peer_features);
> -       rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
> +       lhead = find_legacy_request_head(msg->front.iov_base,
> +                                        session->s_con.peer_features);
> +       lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
>         if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
>                 flags |= CEPH_MDS_FLAG_REPLAY;
>         if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
>                 flags |= CEPH_MDS_FLAG_ASYNC;
>         if (req->r_parent)
>                 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
> -       rhead->flags = cpu_to_le32(flags);
> -       rhead->num_fwd = req->r_num_fwd;
> -       rhead->num_retry = req->r_attempts - 1;
> +       lhead->flags = cpu_to_le32(flags);
> +       lhead->num_fwd = req->r_num_fwd;
> +       lhead->num_retry = req->r_attempts - 1;
> +       if (!old_version) {
> +               nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> +               nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
> +               nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> +       }
>
>         doutc(cl, " r_parent = %p\n", req->r_parent);
>         return 0;
> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc,
>         if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
>                 doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
>                 __unregister_request(mdsc, req);
> -       } else if (fwd_seq <= req->r_num_fwd) {
> -               /*
> -                * The type of 'num_fwd' in ceph 'MClientRequestForward'
> -                * is 'int32_t', while in 'ceph_mds_request_head' the
> -                * type is '__u8'. So in case the request bounces between
> -                * MDSes exceeding 256 times, the client will get stuck.
> -                *
> -                * In this case it's ususally a bug in MDS and continue
> -                * bouncing the request makes no sense.
> +       } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
> +               /* Avoid inifinite retrying after overflow.
>                  *
> -                * In future this could be fixed in ceph code, so avoid
> -                * using the hardcode here.
> +                * The MDS will increase the fwd count and in client side
> +                * if the num_fwd is less than the one saved in request
> +                * that means the MDS is an old version and overflowed of
> +                * 8 bits.
>                  */
> -               int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
> -               max = 1 << (max * BITS_PER_BYTE);
> -               if (req->r_num_fwd >= max) {
> -                       mutex_lock(&req->r_fill_mutex);
> -                       req->r_err = -EMULTIHOP;
> -                       set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> -                       mutex_unlock(&req->r_fill_mutex);
> -                       aborted = true;
> -                       pr_warn_ratelimited_client(cl,
> -                                       "forward tid %llu seq overflow\n",
> -                                       tid);
> -               } else {
> -                       doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
> -                             tid, next_mds, req->r_num_fwd, fwd_seq);
> -               }
> +               mutex_lock(&req->r_fill_mutex);
> +               req->r_err = -EMULTIHOP;
> +               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> +               mutex_unlock(&req->r_fill_mutex);
> +               aborted = true;
> +               pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
>         } else {
>                 /* resend. forward race not possible; mds would drop */
>                 doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index befbd384428e..717a7399bacb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -32,8 +32,9 @@ enum ceph_feature_type {
>         CEPHFS_FEATURE_ALTERNATE_NAME,
>         CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
>         CEPHFS_FEATURE_OP_GETVXATTR,
> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,
>
> -       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
> +       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
>  };
>
>  #define CEPHFS_FEATURES_CLIENT_SUPPORTED {     \
> @@ -47,6 +48,7 @@ enum ceph_feature_type {
>         CEPHFS_FEATURE_ALTERNATE_NAME,          \
>         CEPHFS_FEATURE_NOTIFY_SESSION_STATE,    \
>         CEPHFS_FEATURE_OP_GETVXATTR,            \
> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,        \
>  }
>
>  /*
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 45f8ce61e103..f3b3593254b9 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext {
>  #define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
>  #define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
>
> -struct ceph_mds_request_head_old {
> +struct ceph_mds_request_head_legacy {
>         __le64 oldest_client_tid;
>         __le32 mdsmap_epoch;           /* on client */
>         __le32 flags;                  /* CEPH_MDS_FLAG_* */
> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old {
>         union ceph_mds_request_args args;
>  } __attribute__ ((packed));
>
> -#define CEPH_MDS_REQUEST_HEAD_VERSION  1
> +#define CEPH_MDS_REQUEST_HEAD_VERSION  2
>
> -struct ceph_mds_request_head {
> +struct ceph_mds_request_head_old {
>         __le16 version;                /* struct version */
>         __le64 oldest_client_tid;
>         __le32 mdsmap_epoch;           /* on client */
> @@ -513,6 +513,23 @@ struct ceph_mds_request_head {
>         union ceph_mds_request_args_ext args;
>  } __attribute__ ((packed));
>
> +struct ceph_mds_request_head {
> +       __le16 version;                /* struct version */
> +       __le64 oldest_client_tid;
> +       __le32 mdsmap_epoch;           /* on client */
> +       __le32 flags;                  /* CEPH_MDS_FLAG_* */
> +       __u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
> +       __le16 num_releases;           /* # include cap/lease release records */
> +       __le32 op;                     /* mds op code */
> +       __le32 caller_uid, caller_gid;
> +       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
> +                                         etc. (if replaying) */
> +       union ceph_mds_request_args_ext args;
> +
> +       __le32 ext_num_retry;          /* new count retry attempts */
> +       __le32 ext_num_fwd;            /* new count fwd attempts */
> +} __attribute__ ((packed));
> +
>  /* cap/lease release record */
>  struct ceph_mds_request_release {
>         __le64 ino, cap_id;            /* ino and unique cap id */
> --
> 2.40.1
>