Re: [PATCH] ceph: make num_fwd and num_retry to __u32

Aleksandr Mikhalitsyn <aleksandr.mikhalitsyn@xxxxxxxxxxxxx> · Wed, 26 Jul 2023 15:19:55 +0200

On Wed, Jul 26, 2023 at 3:16 PM Xiubo Li <xiubli@xxxxxxxxxx> wrote:
>
>
> On 7/26/23 18:06, Aleksandr Mikhalitsyn wrote:
> > Hi Xiubo!
> >
> > On Tue, Jul 25, 2023 at 12:46 PM <xiubli@xxxxxxxxxx> wrote:
> >> From: Xiubo Li <xiubli@xxxxxxxxxx>
> >>
> >> The num_fwd in MClientRequestForward is int32_t, while the num_fwd
> >> in ceph_mds_request_head is __u8. This is buggy when the num_fwd
> >> is larger than 256 it will always be truncate to 0 again. But the
> >> client couldn't recoginize this.
> >>
> >> This will make them to __u32 instead. Because the old cephs will
> >> directly copy the raw memories when decoding the reqeust's head,
> >> so we need to make sure this kclient will be compatible with old
> >> cephs. For newer cephs they will decode the requests depending
> >> the version, which will be much simpler and easier to extend new
> >> members.
> >>
> >> URL: https://tracker.ceph.com/issues/62145
> >> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> >> ---
> >>   fs/ceph/mds_client.c         | 191 ++++++++++++++++++-----------------
> >>   fs/ceph/mds_client.h         |   4 +-
> >>   include/linux/ceph/ceph_fs.h |  23 ++++-
> >>   3 files changed, 124 insertions(+), 94 deletions(-)
> >>
> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> index 70987b3c198a..191bae3a4ee6 100644
> >> --- a/fs/ceph/mds_client.c
> >> +++ b/fs/ceph/mds_client.c
> >> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> >>          }
> >>   }
> >>
> >> +static struct ceph_mds_request_head_legacy *
> >> +find_legacy_request_head(void *p, u64 features)
> >> +{
> >> +       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> >> +       struct ceph_mds_request_head_old *ohead;
> >> +
> >> +       if (legacy)
> >> +               return (struct ceph_mds_request_head_legacy *)p;
> >> +       ohead = (struct ceph_mds_request_head_old *)p;
> >> +       return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
> >> +}
> >> +
> >>   /*
> >>    * called under mdsc->mutex
> >>    */
> >> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>          int mds = session->s_mds;
> >>          struct ceph_mds_client *mdsc = session->s_mdsc;
> >>          struct ceph_msg *msg;
> >> -       struct ceph_mds_request_head_old *head;
> >> +       struct ceph_mds_request_head_legacy *lhead;
> >>          const char *path1 = NULL;
> >>          const char *path2 = NULL;
> >>          u64 ino1 = 0, ino2 = 0;
> >> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>          void *p, *end;
> >>          int ret;
> >>          bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> >> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>
> >>          ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> >>                                req->r_parent, req->r_path1, req->r_ino1.ino,
> >> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>                  goto out_free2;
> >>          }
> >>
> >> -       len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
> >> +       /*
> >> +        * For old cephs without supporting the 32bit retry/fwd feature
> >> +        * it will copy the raw memories directly when decoding the
> >> +        * requests. While new cephs will decode the head depending the
> >> +        * version member, so we need to make sure it will be compatible
> >> +        * with them both.
> >> +        */
> >> +       if (legacy)
> >> +               len = sizeof(struct ceph_mds_request_head_legacy);
> >> +       else if (old_version)
> >> +               len = sizeof(struct ceph_mds_request_head_old);
> >> +       else
> >> +               len = sizeof(struct ceph_mds_request_head);
> >>
> >>          /* filepaths */
> >>          len += 2 * (1 + sizeof(u32) + sizeof(u64));
> >> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>
> >>          msg->hdr.tid = cpu_to_le64(req->r_tid);
> >>
> >> +       lhead = find_legacy_request_head(msg->front.iov_base,
> >> +                                        session->s_con.peer_features);
> >> +
> >>          /*
> >> -        * The old ceph_mds_request_head didn't contain a version field, and
> >> +        * The ceph_mds_request_head_legacy didn't contain a version field, and
> >>           * one was added when we moved the message version from 3->4.
> >>           */
> >>          if (legacy) {
> >>                  msg->hdr.version = cpu_to_le16(3);
> >> -               head = msg->front.iov_base;
> >> -               p = msg->front.iov_base + sizeof(*head);
> >> +               p = msg->front.iov_base + sizeof(*lhead);
> >> +       } else if (old_version) {
> >> +               struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> >> +
> >> +               msg->hdr.version = cpu_to_le16(4);
> >> +               ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> > Don't we want to use the old mds request head version here:
> > ohead->version = cpu_to_le16(1);
> > ?
> >
> > As far as I understand the idea here is to skip new fields
> > ext_num_retry/ext_num_fwd in case
> > when the old_version is true. Would it be incorrect to set version to
> > the latest one (CEPH_MDS_REQUEST_HEAD_VERSION)
> > and at the same time skip the setting of new fields?
>
> Hi Alex,
>
> I noticed that, but this doesn't matter, because for the old version
> cephs they will do nothing with the version field. I am just following
> the user space code.

Got it. Thanks!

Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@xxxxxxxxxxxxx>

>
> But to make the code to be more readable and easier to understand I will
> revise this. And also will raise one PR to fix the user space code.
>
> Thanks
>
> - Xiubo
>
>
> > Kind regards,
> > Alex
> >
> >> +               p = msg->front.iov_base + sizeof(*ohead);
> >>          } else {
> >> -               struct ceph_mds_request_head *new_head = msg->front.iov_base;
> >> +               struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>
> >>                  msg->hdr.version = cpu_to_le16(6);
> >> -               new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >> -               head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> >> -               p = msg->front.iov_base + sizeof(*new_head);
> >> +               nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >> +               p = msg->front.iov_base + sizeof(*nhead);
> >>          }
> >>
> >>          end = msg->front.iov_base + msg->front.iov_len;
> >>
> >> -       head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> >> -       head->op = cpu_to_le32(req->r_op);
> >> -       head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> >> -                                                req->r_cred->fsuid));
> >> -       head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> >> -                                                req->r_cred->fsgid));
> >> -       head->ino = cpu_to_le64(req->r_deleg_ino);
> >> -       head->args = req->r_args;
> >> +       lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> >> +       lhead->op = cpu_to_le32(req->r_op);
> >> +       lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> >> +                                                 req->r_cred->fsuid));
> >> +       lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> >> +                                                 req->r_cred->fsgid));
> >> +       lhead->ino = cpu_to_le64(req->r_deleg_ino);
> >> +       lhead->args = req->r_args;
> >>
> >>          ceph_encode_filepath(&p, end, ino1, path1);
> >>          ceph_encode_filepath(&p, end, ino2, path2);
> >> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>                  p = msg->front.iov_base + req->r_request_release_offset;
> >>          }
> >>
> >> -       head->num_releases = cpu_to_le16(releases);
> >> +       lhead->num_releases = cpu_to_le16(releases);
> >>
> >>          encode_mclientrequest_tail(&p, req);
> >>
> >> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
> >>          complete_all(&req->r_completion);
> >>   }
> >>
> >> -static struct ceph_mds_request_head_old *
> >> -find_old_request_head(void *p, u64 features)
> >> -{
> >> -       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> >> -       struct ceph_mds_request_head *new_head;
> >> -
> >> -       if (legacy)
> >> -               return (struct ceph_mds_request_head_old *)p;
> >> -       new_head = (struct ceph_mds_request_head *)p;
> >> -       return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> >> -}
> >> -
> >>   /*
> >>    * called under mdsc->mutex
> >>    */
> >> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session,
> >>          int mds = session->s_mds;
> >>          struct ceph_mds_client *mdsc = session->s_mdsc;
> >>          struct ceph_client *cl = mdsc->fsc->client;
> >> -       struct ceph_mds_request_head_old *rhead;
> >> +       struct ceph_mds_request_head_legacy *lhead;
> >> +       struct ceph_mds_request_head *nhead;
> >>          struct ceph_msg *msg;
> >> -       int flags = 0, max_retry;
> >> +       int flags = 0, old_max_retry;
> >> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>
> >> -       /*
> >> -        * The type of 'r_attempts' in kernel 'ceph_mds_request'
> >> -        * is 'int', while in 'ceph_mds_request_head' the type of
> >> -        * 'num_retry' is '__u8'. So in case the request retries
> >> -        *  exceeding 256 times, the MDS will receive a incorrect
> >> -        *  retry seq.
> >> -        *
> >> -        * In this case it's ususally a bug in MDS and continue
> >> -        * retrying the request makes no sense.
> >> -        *
> >> -        * In future this could be fixed in ceph code, so avoid
> >> -        * using the hardcode here.
> >> +       /* Avoid inifinite retrying after overflow. The client will
> >> +        * increase the retry count and if the MDS is old version,
> >> +        * so we limit to retry at most 256 times.
> >>           */
> >> -       max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
> >> -       max_retry = 1 << (max_retry * BITS_PER_BYTE);
> >> -       if (req->r_attempts >= max_retry) {
> >> -               pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
> >> -                                          req->r_tid);
> >> -               return -EMULTIHOP;
> >> -       }
> >> +       if (req->r_attempts) {
> >> +               old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
> >> +               old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
> >> +               if ((old_version && req->r_attempts >= old_max_retry) ||
> >> +                   ((uint32_t)req->r_attempts >= U32_MAX)) {
> >> +                       pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
> >> +                                                 __func__, req->r_tid);
> >> +                       return -EMULTIHOP;
> >> +               }
> >> +        }
> >>
> >>          req->r_attempts++;
> >>          if (req->r_inode) {
> >> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
> >>                   * d_move mangles the src name.
> >>                   */
> >>                  msg = req->r_request;
> >> -               rhead = find_old_request_head(msg->front.iov_base,
> >> -                                             session->s_con.peer_features);
> >> +               lhead = find_legacy_request_head(msg->front.iov_base,
> >> +                                                session->s_con.peer_features);
> >>
> >> -               flags = le32_to_cpu(rhead->flags);
> >> +               flags = le32_to_cpu(lhead->flags);
> >>                  flags |= CEPH_MDS_FLAG_REPLAY;
> >> -               rhead->flags = cpu_to_le32(flags);
> >> +               lhead->flags = cpu_to_le32(flags);
> >>
> >>                  if (req->r_target_inode)
> >> -                       rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
> >> +                       lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
> >>
> >> -               rhead->num_retry = req->r_attempts - 1;
> >> +               lhead->num_retry = req->r_attempts - 1;
> >> +               if (!old_version) {
> >> +                       nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> >> +                       nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> >> +               }
> >>
> >>                  /* remove cap/dentry releases from message */
> >> -               rhead->num_releases = 0;
> >> +               lhead->num_releases = 0;
> >>
> >>                  p = msg->front.iov_base + req->r_request_release_offset;
> >>                  encode_mclientrequest_tail(&p, req);
> >> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
> >>          }
> >>          req->r_request = msg;
> >>
> >> -       rhead = find_old_request_head(msg->front.iov_base,
> >> -                                     session->s_con.peer_features);
> >> -       rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
> >> +       lhead = find_legacy_request_head(msg->front.iov_base,
> >> +                                        session->s_con.peer_features);
> >> +       lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
> >>          if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
> >>                  flags |= CEPH_MDS_FLAG_REPLAY;
> >>          if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
> >>                  flags |= CEPH_MDS_FLAG_ASYNC;
> >>          if (req->r_parent)
> >>                  flags |= CEPH_MDS_FLAG_WANT_DENTRY;
> >> -       rhead->flags = cpu_to_le32(flags);
> >> -       rhead->num_fwd = req->r_num_fwd;
> >> -       rhead->num_retry = req->r_attempts - 1;
> >> +       lhead->flags = cpu_to_le32(flags);
> >> +       lhead->num_fwd = req->r_num_fwd;
> >> +       lhead->num_retry = req->r_attempts - 1;
> >> +       if (!old_version) {
> >> +               nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> >> +               nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
> >> +               nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> >> +       }
> >>
> >>          doutc(cl, " r_parent = %p\n", req->r_parent);
> >>          return 0;
> >> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc,
> >>          if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
> >>                  doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
> >>                  __unregister_request(mdsc, req);
> >> -       } else if (fwd_seq <= req->r_num_fwd) {
> >> -               /*
> >> -                * The type of 'num_fwd' in ceph 'MClientRequestForward'
> >> -                * is 'int32_t', while in 'ceph_mds_request_head' the
> >> -                * type is '__u8'. So in case the request bounces between
> >> -                * MDSes exceeding 256 times, the client will get stuck.
> >> -                *
> >> -                * In this case it's ususally a bug in MDS and continue
> >> -                * bouncing the request makes no sense.
> >> +       } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
> >> +               /* Avoid inifinite retrying after overflow.
> >>                   *
> >> -                * In future this could be fixed in ceph code, so avoid
> >> -                * using the hardcode here.
> >> +                * The MDS will increase the fwd count and in client side
> >> +                * if the num_fwd is less than the one saved in request
> >> +                * that means the MDS is an old version and overflowed of
> >> +                * 8 bits.
> >>                   */
> >> -               int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
> >> -               max = 1 << (max * BITS_PER_BYTE);
> >> -               if (req->r_num_fwd >= max) {
> >> -                       mutex_lock(&req->r_fill_mutex);
> >> -                       req->r_err = -EMULTIHOP;
> >> -                       set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> >> -                       mutex_unlock(&req->r_fill_mutex);
> >> -                       aborted = true;
> >> -                       pr_warn_ratelimited_client(cl,
> >> -                                       "forward tid %llu seq overflow\n",
> >> -                                       tid);
> >> -               } else {
> >> -                       doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
> >> -                             tid, next_mds, req->r_num_fwd, fwd_seq);
> >> -               }
> >> +               mutex_lock(&req->r_fill_mutex);
> >> +               req->r_err = -EMULTIHOP;
> >> +               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> >> +               mutex_unlock(&req->r_fill_mutex);
> >> +               aborted = true;
> >> +               pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
> >>          } else {
> >>                  /* resend. forward race not possible; mds would drop */
> >>                  doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
> >> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >> index befbd384428e..717a7399bacb 100644
> >> --- a/fs/ceph/mds_client.h
> >> +++ b/fs/ceph/mds_client.h
> >> @@ -32,8 +32,9 @@ enum ceph_feature_type {
> >>          CEPHFS_FEATURE_ALTERNATE_NAME,
> >>          CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> >>          CEPHFS_FEATURE_OP_GETVXATTR,
> >> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>
> >> -       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
> >> +       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>   };
> >>
> >>   #define CEPHFS_FEATURES_CLIENT_SUPPORTED {     \
> >> @@ -47,6 +48,7 @@ enum ceph_feature_type {
> >>          CEPHFS_FEATURE_ALTERNATE_NAME,          \
> >>          CEPHFS_FEATURE_NOTIFY_SESSION_STATE,    \
> >>          CEPHFS_FEATURE_OP_GETVXATTR,            \
> >> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,        \
> >>   }
> >>
> >>   /*
> >> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >> index 45f8ce61e103..f3b3593254b9 100644
> >> --- a/include/linux/ceph/ceph_fs.h
> >> +++ b/include/linux/ceph/ceph_fs.h
> >> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext {
> >>   #define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
> >>   #define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
> >>
> >> -struct ceph_mds_request_head_old {
> >> +struct ceph_mds_request_head_legacy {
> >>          __le64 oldest_client_tid;
> >>          __le32 mdsmap_epoch;           /* on client */
> >>          __le32 flags;                  /* CEPH_MDS_FLAG_* */
> >> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old {
> >>          union ceph_mds_request_args args;
> >>   } __attribute__ ((packed));
> >>
> >> -#define CEPH_MDS_REQUEST_HEAD_VERSION  1
> >> +#define CEPH_MDS_REQUEST_HEAD_VERSION  2
> >>
> >> -struct ceph_mds_request_head {
> >> +struct ceph_mds_request_head_old {
> >>          __le16 version;                /* struct version */
> >>          __le64 oldest_client_tid;
> >>          __le32 mdsmap_epoch;           /* on client */
> >> @@ -513,6 +513,23 @@ struct ceph_mds_request_head {
> >>          union ceph_mds_request_args_ext args;
> >>   } __attribute__ ((packed));
> >>
> >> +struct ceph_mds_request_head {
> >> +       __le16 version;                /* struct version */
> >> +       __le64 oldest_client_tid;
> >> +       __le32 mdsmap_epoch;           /* on client */
> >> +       __le32 flags;                  /* CEPH_MDS_FLAG_* */
> >> +       __u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
> >> +       __le16 num_releases;           /* # include cap/lease release records */
> >> +       __le32 op;                     /* mds op code */
> >> +       __le32 caller_uid, caller_gid;
> >> +       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
> >> +                                         etc. (if replaying) */
> >> +       union ceph_mds_request_args_ext args;
> >> +
> >> +       __le32 ext_num_retry;          /* new count retry attempts */
> >> +       __le32 ext_num_fwd;            /* new count fwd attempts */
> >> +} __attribute__ ((packed));
> >> +
> >>   /* cap/lease release record */
> >>   struct ceph_mds_request_release {
> >>          __le64 ino, cap_id;            /* ino and unique cap id */
> >> --
> >> 2.40.1
> >>
>