Hi Xiubo! On Tue, Jul 25, 2023 at 12:46 PM <xiubli@xxxxxxxxxx> wrote: > > From: Xiubo Li <xiubli@xxxxxxxxxx> > > The num_fwd in MClientRequestForward is int32_t, while the num_fwd > in ceph_mds_request_head is __u8. This is buggy when the num_fwd > is larger than 256 it will always be truncate to 0 again. But the > client couldn't recoginize this. > > This will make them to __u32 instead. Because the old cephs will > directly copy the raw memories when decoding the reqeust's head, > so we need to make sure this kclient will be compatible with old > cephs. For newer cephs they will decode the requests depending > the version, which will be much simpler and easier to extend new > members. > > URL: https://tracker.ceph.com/issues/62145 > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/mds_client.c | 191 ++++++++++++++++++----------------- > fs/ceph/mds_client.h | 4 +- > include/linux/ceph/ceph_fs.h | 23 ++++- > 3 files changed, 124 insertions(+), 94 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 70987b3c198a..191bae3a4ee6 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request * > } > } > > +static struct ceph_mds_request_head_legacy * > +find_legacy_request_head(void *p, u64 features) > +{ > + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); > + struct ceph_mds_request_head_old *ohead; > + > + if (legacy) > + return (struct ceph_mds_request_head_legacy *)p; > + ohead = (struct ceph_mds_request_head_old *)p; > + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; > +} > + > /* > * called under mdsc->mutex > */ > @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > int mds = session->s_mds; > struct ceph_mds_client *mdsc = session->s_mdsc; > struct ceph_msg *msg; > - struct ceph_mds_request_head_old *head; > + struct ceph_mds_request_head_legacy *lhead; > const char *path1 = NULL; > const char *path2 = NULL; > u64 ino1 = 0, ino2 = 0; > @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > void *p, *end; > int ret; > bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); > + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); > > ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, > req->r_parent, req->r_path1, req->r_ino1.ino, > @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > goto out_free2; > } > > - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); > + /* > + * For old cephs without supporting the 32bit retry/fwd feature > + * it will copy the raw memories directly when decoding the > + * requests. While new cephs will decode the head depending the > + * version member, so we need to make sure it will be compatible > + * with them both. > + */ > + if (legacy) > + len = sizeof(struct ceph_mds_request_head_legacy); > + else if (old_version) > + len = sizeof(struct ceph_mds_request_head_old); > + else > + len = sizeof(struct ceph_mds_request_head); > > /* filepaths */ > len += 2 * (1 + sizeof(u32) + sizeof(u64)); > @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > > msg->hdr.tid = cpu_to_le64(req->r_tid); > > + lhead = find_legacy_request_head(msg->front.iov_base, > + session->s_con.peer_features); > + > /* > - * The old ceph_mds_request_head didn't contain a version field, and > + * The ceph_mds_request_head_legacy didn't contain a version field, and > * one was added when we moved the message version from 3->4. > */ > if (legacy) { > msg->hdr.version = cpu_to_le16(3); > - head = msg->front.iov_base; > - p = msg->front.iov_base + sizeof(*head); > + p = msg->front.iov_base + sizeof(*lhead); > + } else if (old_version) { > + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; > + > + msg->hdr.version = cpu_to_le16(4); > + ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); Don't we want to use the old mds request head version here: ohead->version = cpu_to_le16(1); ? As far as I understand the idea here is to skip new fields ext_num_retry/ext_num_fwd in case when the old_version is true. Would it be incorrect to set version to the latest one (CEPH_MDS_REQUEST_HEAD_VERSION) and at the same time skip the setting of new fields? Kind regards, Alex > + p = msg->front.iov_base + sizeof(*ohead); > } else { > - struct ceph_mds_request_head *new_head = msg->front.iov_base; > + struct ceph_mds_request_head *nhead = msg->front.iov_base; > > msg->hdr.version = cpu_to_le16(6); > - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; > - p = msg->front.iov_base + sizeof(*new_head); > + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); > + p = msg->front.iov_base + sizeof(*nhead); > } > > end = msg->front.iov_base + msg->front.iov_len; > > - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); > - head->op = cpu_to_le32(req->r_op); > - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, > - req->r_cred->fsuid)); > - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, > - req->r_cred->fsgid)); > - head->ino = cpu_to_le64(req->r_deleg_ino); > - head->args = req->r_args; > + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); > + lhead->op = cpu_to_le32(req->r_op); > + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, > + req->r_cred->fsuid)); > + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, > + req->r_cred->fsgid)); > + lhead->ino = cpu_to_le64(req->r_deleg_ino); > + lhead->args = req->r_args; > > ceph_encode_filepath(&p, end, ino1, path1); > ceph_encode_filepath(&p, end, ino2, path2); > @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, > p = msg->front.iov_base + req->r_request_release_offset; > } > > - head->num_releases = cpu_to_le16(releases); > + lhead->num_releases = cpu_to_le16(releases); > > encode_mclientrequest_tail(&p, req); > > @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc, > complete_all(&req->r_completion); > } > > -static struct ceph_mds_request_head_old * > -find_old_request_head(void *p, u64 features) > -{ > - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); > - struct ceph_mds_request_head *new_head; > - > - if (legacy) > - return (struct ceph_mds_request_head_old *)p; > - new_head = (struct ceph_mds_request_head *)p; > - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; > -} > - > /* > * called under mdsc->mutex > */ > @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session, > int mds = session->s_mds; > struct ceph_mds_client *mdsc = session->s_mdsc; > struct ceph_client *cl = mdsc->fsc->client; > - struct ceph_mds_request_head_old *rhead; > + struct ceph_mds_request_head_legacy *lhead; > + struct ceph_mds_request_head *nhead; > struct ceph_msg *msg; > - int flags = 0, max_retry; > + int flags = 0, old_max_retry; > + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); > > - /* > - * The type of 'r_attempts' in kernel 'ceph_mds_request' > - * is 'int', while in 'ceph_mds_request_head' the type of > - * 'num_retry' is '__u8'. So in case the request retries > - * exceeding 256 times, the MDS will receive a incorrect > - * retry seq. > - * > - * In this case it's ususally a bug in MDS and continue > - * retrying the request makes no sense. > - * > - * In future this could be fixed in ceph code, so avoid > - * using the hardcode here. > + /* Avoid inifinite retrying after overflow. The client will > + * increase the retry count and if the MDS is old version, > + * so we limit to retry at most 256 times. > */ > - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); > - max_retry = 1 << (max_retry * BITS_PER_BYTE); > - if (req->r_attempts >= max_retry) { > - pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", > - req->r_tid); > - return -EMULTIHOP; > - } > + if (req->r_attempts) { > + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry); > + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); > + if ((old_version && req->r_attempts >= old_max_retry) || > + ((uint32_t)req->r_attempts >= U32_MAX)) { > + pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n", > + __func__, req->r_tid); > + return -EMULTIHOP; > + } > + } > > req->r_attempts++; > if (req->r_inode) { > @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session, > * d_move mangles the src name. > */ > msg = req->r_request; > - rhead = find_old_request_head(msg->front.iov_base, > - session->s_con.peer_features); > + lhead = find_legacy_request_head(msg->front.iov_base, > + session->s_con.peer_features); > > - flags = le32_to_cpu(rhead->flags); > + flags = le32_to_cpu(lhead->flags); > flags |= CEPH_MDS_FLAG_REPLAY; > - rhead->flags = cpu_to_le32(flags); > + lhead->flags = cpu_to_le32(flags); > > if (req->r_target_inode) > - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); > + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); > > - rhead->num_retry = req->r_attempts - 1; > + lhead->num_retry = req->r_attempts - 1; > + if (!old_version) { > + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; > + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); > + } > > /* remove cap/dentry releases from message */ > - rhead->num_releases = 0; > + lhead->num_releases = 0; > > p = msg->front.iov_base + req->r_request_release_offset; > encode_mclientrequest_tail(&p, req); > @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, > } > req->r_request = msg; > > - rhead = find_old_request_head(msg->front.iov_base, > - session->s_con.peer_features); > - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); > + lhead = find_legacy_request_head(msg->front.iov_base, > + session->s_con.peer_features); > + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); > if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) > flags |= CEPH_MDS_FLAG_REPLAY; > if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) > flags |= CEPH_MDS_FLAG_ASYNC; > if (req->r_parent) > flags |= CEPH_MDS_FLAG_WANT_DENTRY; > - rhead->flags = cpu_to_le32(flags); > - rhead->num_fwd = req->r_num_fwd; > - rhead->num_retry = req->r_attempts - 1; > + lhead->flags = cpu_to_le32(flags); > + lhead->num_fwd = req->r_num_fwd; > + lhead->num_retry = req->r_attempts - 1; > + if (!old_version) { > + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; > + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); > + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); > + } > > doutc(cl, " r_parent = %p\n", req->r_parent); > return 0; > @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc, > if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { > doutc(cl, "forward tid %llu aborted, unregistering\n", tid); > __unregister_request(mdsc, req); > - } else if (fwd_seq <= req->r_num_fwd) { > - /* > - * The type of 'num_fwd' in ceph 'MClientRequestForward' > - * is 'int32_t', while in 'ceph_mds_request_head' the > - * type is '__u8'. So in case the request bounces between > - * MDSes exceeding 256 times, the client will get stuck. > - * > - * In this case it's ususally a bug in MDS and continue > - * bouncing the request makes no sense. > + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { > + /* Avoid inifinite retrying after overflow. > * > - * In future this could be fixed in ceph code, so avoid > - * using the hardcode here. > + * The MDS will increase the fwd count and in client side > + * if the num_fwd is less than the one saved in request > + * that means the MDS is an old version and overflowed of > + * 8 bits. > */ > - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); > - max = 1 << (max * BITS_PER_BYTE); > - if (req->r_num_fwd >= max) { > - mutex_lock(&req->r_fill_mutex); > - req->r_err = -EMULTIHOP; > - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); > - mutex_unlock(&req->r_fill_mutex); > - aborted = true; > - pr_warn_ratelimited_client(cl, > - "forward tid %llu seq overflow\n", > - tid); > - } else { > - doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n", > - tid, next_mds, req->r_num_fwd, fwd_seq); > - } > + mutex_lock(&req->r_fill_mutex); > + req->r_err = -EMULTIHOP; > + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); > + mutex_unlock(&req->r_fill_mutex); > + aborted = true; > + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); > } else { > /* resend. forward race not possible; mds would drop */ > doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index befbd384428e..717a7399bacb 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -32,8 +32,9 @@ enum ceph_feature_type { > CEPHFS_FEATURE_ALTERNATE_NAME, > CEPHFS_FEATURE_NOTIFY_SESSION_STATE, > CEPHFS_FEATURE_OP_GETVXATTR, > + CEPHFS_FEATURE_32BITS_RETRY_FWD, > > - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, > + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, > }; > > #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ > @@ -47,6 +48,7 @@ enum ceph_feature_type { > CEPHFS_FEATURE_ALTERNATE_NAME, \ > CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ > CEPHFS_FEATURE_OP_GETVXATTR, \ > + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ > } > > /* > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > index 45f8ce61e103..f3b3593254b9 100644 > --- a/include/linux/ceph/ceph_fs.h > +++ b/include/linux/ceph/ceph_fs.h > @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext { > #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ > #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ > > -struct ceph_mds_request_head_old { > +struct ceph_mds_request_head_legacy { > __le64 oldest_client_tid; > __le32 mdsmap_epoch; /* on client */ > __le32 flags; /* CEPH_MDS_FLAG_* */ > @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old { > union ceph_mds_request_args args; > } __attribute__ ((packed)); > > -#define CEPH_MDS_REQUEST_HEAD_VERSION 1 > +#define CEPH_MDS_REQUEST_HEAD_VERSION 2 > > -struct ceph_mds_request_head { > +struct ceph_mds_request_head_old { > __le16 version; /* struct version */ > __le64 oldest_client_tid; > __le32 mdsmap_epoch; /* on client */ > @@ -513,6 +513,23 @@ struct ceph_mds_request_head { > union ceph_mds_request_args_ext args; > } __attribute__ ((packed)); > > +struct ceph_mds_request_head { > + __le16 version; /* struct version */ > + __le64 oldest_client_tid; > + __le32 mdsmap_epoch; /* on client */ > + __le32 flags; /* CEPH_MDS_FLAG_* */ > + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ > + __le16 num_releases; /* # include cap/lease release records */ > + __le32 op; /* mds op code */ > + __le32 caller_uid, caller_gid; > + __le64 ino; /* use this ino for openc, mkdir, mknod, > + etc. (if replaying) */ > + union ceph_mds_request_args_ext args; > + > + __le32 ext_num_retry; /* new count retry attempts */ > + __le32 ext_num_fwd; /* new count fwd attempts */ > +} __attribute__ ((packed)); > + > /* cap/lease release record */ > struct ceph_mds_request_release { > __le64 ino, cap_id; /* ino and unique cap id */ > -- > 2.40.1 >