+ p = msg->front.iov_base + sizeof(*ohead);
} else {
- struct ceph_mds_request_head *new_head = msg->front.iov_base;
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(6);
- new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
- head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
- p = msg->front.iov_base + sizeof(*new_head);
+ nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ p = msg->front.iov_base + sizeof(*nhead);
}
end = msg->front.iov_base + msg->front.iov_len;
- head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
- head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
- req->r_cred->fsuid));
- head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
- req->r_cred->fsgid));
- head->ino = cpu_to_le64(req->r_deleg_ino);
- head->args = req->r_args;
+ lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+ lhead->op = cpu_to_le32(req->r_op);
+ lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+ req->r_cred->fsuid));
+ lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+ req->r_cred->fsgid));
+ lhead->ino = cpu_to_le64(req->r_deleg_ino);
+ lhead->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
ceph_encode_filepath(&p, end, ino2, path2);
@@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
p = msg->front.iov_base + req->r_request_release_offset;
}
- head->num_releases = cpu_to_le16(releases);
+ lhead->num_releases = cpu_to_le16(releases);
encode_mclientrequest_tail(&p, req);
@@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
complete_all(&req->r_completion);
}
-static struct ceph_mds_request_head_old *
-find_old_request_head(void *p, u64 features)
-{
- bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
- struct ceph_mds_request_head *new_head;
-
- if (legacy)
- return (struct ceph_mds_request_head_old *)p;
- new_head = (struct ceph_mds_request_head *)p;
- return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-}
-
/*
* called under mdsc->mutex
*/
@@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session,
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_client *cl = mdsc->fsc->client;
- struct ceph_mds_request_head_old *rhead;
+ struct ceph_mds_request_head_legacy *lhead;
+ struct ceph_mds_request_head *nhead;
struct ceph_msg *msg;
- int flags = 0, max_retry;
+ int flags = 0, old_max_retry;
+ bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
- /*
- * The type of 'r_attempts' in kernel 'ceph_mds_request'
- * is 'int', while in 'ceph_mds_request_head' the type of
- * 'num_retry' is '__u8'. So in case the request retries
- * exceeding 256 times, the MDS will receive a incorrect
- * retry seq.
- *
- * In this case it's ususally a bug in MDS and continue
- * retrying the request makes no sense.
- *
- * In future this could be fixed in ceph code, so avoid
- * using the hardcode here.
+ /* Avoid inifinite retrying after overflow. The client will
+ * increase the retry count and if the MDS is old version,
+ * so we limit to retry at most 256 times.
*/
- max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
- max_retry = 1 << (max_retry * BITS_PER_BYTE);
- if (req->r_attempts >= max_retry) {
- pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
- req->r_tid);
- return -EMULTIHOP;
- }
+ if (req->r_attempts) {
+ old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
+ old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
+ if ((old_version && req->r_attempts >= old_max_retry) ||
+ ((uint32_t)req->r_attempts >= U32_MAX)) {
+ pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
+ __func__, req->r_tid);
+ return -EMULTIHOP;
+ }
+ }
req->r_attempts++;
if (req->r_inode) {
@@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
* d_move mangles the src name.
*/
msg = req->r_request;
- rhead = find_old_request_head(msg->front.iov_base,
- session->s_con.peer_features);
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
- flags = le32_to_cpu(rhead->flags);
+ flags = le32_to_cpu(lhead->flags);
flags |= CEPH_MDS_FLAG_REPLAY;
- rhead->flags = cpu_to_le32(flags);
+ lhead->flags = cpu_to_le32(flags);
if (req->r_target_inode)
- rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+ lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
- rhead->num_retry = req->r_attempts - 1;
+ lhead->num_retry = req->r_attempts - 1;
+ if (!old_version) {
+ nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+ nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+ }
/* remove cap/dentry releases from message */
- rhead->num_releases = 0;
+ lhead->num_releases = 0;
p = msg->front.iov_base + req->r_request_release_offset;
encode_mclientrequest_tail(&p, req);
@@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
}
req->r_request = msg;
- rhead = find_old_request_head(msg->front.iov_base,
- session->s_con.peer_features);
- rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
+ lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
- rhead->flags = cpu_to_le32(flags);
- rhead->num_fwd = req->r_num_fwd;
- rhead->num_retry = req->r_attempts - 1;
+ lhead->flags = cpu_to_le32(flags);
+ lhead->num_fwd = req->r_num_fwd;
+ lhead->num_retry = req->r_attempts - 1;
+ if (!old_version) {
+ nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+ nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
+ nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+ }
doutc(cl, " r_parent = %p\n", req->r_parent);
return 0;
@@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc,
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
- } else if (fwd_seq <= req->r_num_fwd) {
- /*
- * The type of 'num_fwd' in ceph 'MClientRequestForward'
- * is 'int32_t', while in 'ceph_mds_request_head' the
- * type is '__u8'. So in case the request bounces between
- * MDSes exceeding 256 times, the client will get stuck.
- *
- * In this case it's ususally a bug in MDS and continue
- * bouncing the request makes no sense.
+ } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
+ /* Avoid inifinite retrying after overflow.
*
- * In future this could be fixed in ceph code, so avoid
- * using the hardcode here.
+ * The MDS will increase the fwd count and in client side
+ * if the num_fwd is less than the one saved in request
+ * that means the MDS is an old version and overflowed of
+ * 8 bits.
*/
- int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
- max = 1 << (max * BITS_PER_BYTE);
- if (req->r_num_fwd >= max) {
- mutex_lock(&req->r_fill_mutex);
- req->r_err = -EMULTIHOP;
- set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
- mutex_unlock(&req->r_fill_mutex);
- aborted = true;
- pr_warn_ratelimited_client(cl,
- "forward tid %llu seq overflow\n",
- tid);
- } else {
- doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
- }
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
} else {
/* resend. forward race not possible; mds would drop */
doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index befbd384428e..717a7399bacb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -32,8 +32,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_ALTERNATE_NAME,
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
CEPHFS_FEATURE_OP_GETVXATTR,
+ CEPHFS_FEATURE_32BITS_RETRY_FWD,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -47,6 +48,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_ALTERNATE_NAME, \
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
+ CEPHFS_FEATURE_32BITS_RETRY_FWD, \
}
/*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 45f8ce61e103..f3b3593254b9 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -484,7 +484,7 @@ union ceph_mds_request_args_ext {
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
-struct ceph_mds_request_head_old {
+struct ceph_mds_request_head_legacy {
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
__le32 flags; /* CEPH_MDS_FLAG_* */
@@ -497,9 +497,9 @@ struct ceph_mds_request_head_old {
union ceph_mds_request_args args;
} __attribute__ ((packed));
-#define CEPH_MDS_REQUEST_HEAD_VERSION 1
+#define CEPH_MDS_REQUEST_HEAD_VERSION 2
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
__le16 version; /* struct version */
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
@@ -513,6 +513,23 @@ struct ceph_mds_request_head {
union ceph_mds_request_args_ext args;
} __attribute__ ((packed));
+struct ceph_mds_request_head {
+ __le16 version; /* struct version */
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_ext args;
+
+ __le32 ext_num_retry; /* new count retry attempts */
+ __le32 ext_num_fwd; /* new count fwd attempts */
+} __attribute__ ((packed));
+
/* cap/lease release record */
struct ceph_mds_request_release {
__le64 ino, cap_id; /* ino and unique cap id */
--
2.40.1