This can be seen on almost every mount:
<7>[13220.815822] ceph: mount start 00000000e3b1070a
<6>[13220.817349] libceph: mon2 (1)192.168.195.165:40566 session established
<7>[13220.818639] ceph: handle_map epoch 58 len 889
<7>[13220.818644] ceph: mdsmap_decode 1/3 4216 mds0.5 (1)192.168.195.165:6813 up:active ==> max_mds == 3
<7>[13220.818646] ceph: mdsmap_decode 2/3 4339 mds1.55 (1)192.168.195.165:6815 up:active
<7>[13220.818646] ceph: mdsmap_decode 3/3 4340 mds2.57 (1)192.168.195.165:6814 up:active
<7>[13220.818648] ceph: mdsmap_decode m_enabled: 1, m_damaged: 0, m_num_laggy: 0
<7>[13220.818648] ceph: mdsmap_decode success epoch 58
<7>[13220.818649] ceph: check_new_map new 58 old 0
<6>[13220.820577] libceph: client5819 fsid 6c39c42b-5888-4c6a-ba32-89acfa5a2353
<7>[13220.821827] ceph: mount opening path \t
<7>[13220.821887] ceph: open_root_inode opening ''
<7>[13220.821892] ceph: do_request on 000000005666e7bf
<7>[13220.821893] ceph: submit_request on 000000005666e7bf for inode 0000000057da6992
<7>[13220.821896] ceph: __register_request 000000005666e7bf tid 1
<7>[13220.821898] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0
<7>[13220.821899] ceph: choose_mds chose random mds0 ====================> random mds0
<7>[13220.821901] ceph: register_session: realloc to 1
<7>[13220.821902] ceph: register_session: mds0
<7>[13220.821905] ceph: mdsc get_session 000000001649fdbd 2 -> 3
<7>[13220.821905] ceph: mdsc con_get 000000001649fdbd ok (3)
<7>[13220.821910] ceph: mdsc get_session 000000001649fdbd 3 -> 4
<7>[13220.821912] ceph: do_request mds0 session 000000001649fdbd state new
<7>[13220.821913] ceph: open_session to mds0 (up:active)
[...]
<7>[13220.822167] ceph: do_request waiting
[...]
<7>[13220.833112] ceph: handle_session mds0 open 000000001649fdbd state opening seq 0
<7>[13220.833113] ceph: renewed_caps mds0 ttl now 4307945924, was fresh, now stale
<7>[13220.833114] ceph: wake request 000000005666e7bf tid 1
<7>[13220.833116] ceph: mdsc put_session 000000001649fdbd 4 -> 3
<7>[13220.833117] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0
<7>[13220.833118] ceph: choose_mds chose random mds1 ========================> random mds1
<7>[13220.833119] ceph: register_session: realloc to 2
<7>[13220.833121] ceph: register_session: mds1
<7>[13220.833122] ceph: mdsc get_session 00000000534bd3ef 2 -> 3
<7>[13220.833123] ceph: mdsc con_get 00000000534bd3ef ok (3)
<7>[13220.833124] ceph: mdsc get_session 00000000534bd3ef 3 -> 4
<7>[13220.833124] ceph: do_request mds1 session 00000000534bd3ef state new
<7>[13220.833126] ceph: open_session to mds1 (up:active)
[...]
<7>[13220.845883] ceph: handle_session mds1 open 00000000534bd3ef state opening seq 0
<7>[13220.845884] ceph: renewed_caps mds1 ttl now 4307945935, was fresh, now stale
<7>[13220.845885] ceph: wake request 000000005666e7bf tid 1
<7>[13220.845887] ceph: mdsc put_session 00000000534bd3ef 4 -> 3
<7>[13220.845888] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0
<7>[13220.845889] ceph: choose_mds chose random mds2 ================> random mds2
<7>[13220.845890] ceph: register_session: realloc to 4
<7>[13220.845891] ceph: register_session: mds2
<7>[13220.845892] ceph: mdsc get_session 0000000076259af8 2 -> 3
<7>[13220.845892] ceph: mdsc con_get 0000000076259af8 ok (3)
<7>[13220.845893] ceph: mdsc get_session 0000000076259af8 3 -> 4
<7>[13220.845893] ceph: do_request mds2 session 0000000076259af8 state new
<7>[13220.845894] ceph: open_session to mds2 (up:active)
[...]
<7>[13220.852986] ceph: handle_session mds2 open 0000000076259af8 state opening seq 0
<7>[13220.852987] ceph: renewed_caps mds2 ttl now 4307945948, was fresh, now stale
<7>[13220.852988] ceph: wake request 000000005666e7bf tid 1
<7>[13220.852990] ceph: mdsc put_session 0000000076259af8 4 -> 3
<7>[13220.852991] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0
<7>[13220.852992] ceph: choose_mds chose random mds0 ===============> choose random mds0 again
<7>[13220.852993] ceph: mdsc get_session 0000000076259af8 3 -> 4
<7>[13220.852994] ceph: mdsc get_session 0000000076259af8 4 -> 5
<7>[13220.852994] ceph: do_request mds0 session 0000000076259af8 state open
<7>[13220.852996] ceph: prepare_send_request 000000005666e7bf tid 1 getattr (attempt 1)
<7>[13220.852997] ceph: path
<7>[13220.852998] ceph: r_parent = 0000000057da6992
[...]
<7>[13220.853414] ceph: handle_reply 000000005666e7bf
<7>[13220.853415] ceph: __unregister_request 000000005666e7bf tid 1
<7>[13220.853416] ceph: got safe reply 1, mds0, last request: Yes
<7>[13220.853417] ceph: handle_reply tid 1 result 0
<7>[13220.853421] ceph: fill_trace 000000005666e7bf is_dentry 0 is_target 1
<7>[13220.853424] ceph: alloc_inode 00000000a823bfed
<7>[13220.853427] ceph: get_inode created new inode 00000000a823bfed 1.fffffffffffffffe ino 1
<7>[13220.853428] ceph: get_inode on 1=1.fffffffffffffffe got 00000000a823bfed
<7>[13220.853429] ceph: fill_inode 00000000a823bfed ino 1.fffffffffffffffe v 81286 had 0
<7>[13220.853431] ceph: 00000000a823bfed mode 040755 uid.gid 0.0
<7>[13220.853433] ceph: truncate_size 0 -> 18446744073709551615
<7>[13220.853435] ceph: fill_trace done err=0
<7>[13220.853441] ceph: mdsc con_put 0000000076259af8 (4)
<7>[13220.853442] ceph: mdsc put_session 0000000076259af8 5 -> 4
<7>[13220.853446] ceph: do_request waited, got 0
<7>[13220.853447] ceph: do_request 000000005666e7bf done, result 0
<7>[13220.853447] ceph: open_root_inode success
<7>[13220.853452] ceph: open_root_inode success, root dentry is 0000000037517481
[...]
<7>[13220.853457] ceph: mount success
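
To make the pattern above easier to follow, here is a tiny userspace model of it (not kernel code; MAX_MDS, session_open[] and the loop are all invented for illustration): every retry rolls a fresh random rank, so in the worst case a session to every active mds gets opened before the request is finally sent, which is what the mount log shows.

/*
 * Toy userspace model of the current behaviour: each retry of the
 * request picks a new random mds rank, and any rank whose session is
 * still closed triggers another open_session + retry cycle.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX_MDS 3

int main(void)
{
	bool session_open[MAX_MDS] = { false };
	int opened = 0, attempts = 0;

	srand((unsigned)time(NULL));

	for (;;) {
		/* like __choose_mds falling back to a random rank */
		int mds = rand() % MAX_MDS;

		attempts++;
		if (!session_open[mds]) {
			/* session not open yet: open it and retry later */
			printf("attempt %d: chose mds%d, session closed -> open_session\n",
			       attempts, mds);
			session_open[mds] = true;
			opened++;
			continue;
		}

		/* session already open: the request can finally be sent */
		printf("attempt %d: chose mds%d, session open -> send request\n",
		       attempts, mds);
		break;
	}

	printf("opened %d of %d sessions before the request was sent\n",
	       opened, MAX_MDS);
	return 0;
}
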
On 2019/12/9 20:47, xiubli@xxxxxxxxxx wrote:
From: Xiubo Li <xiubli@xxxxxxxxxx>
With max_mds > 1, if a request chooses a random mds rank and the
corresponding session is not open yet, the request will wait for the
session to be opened and then be resent. But every time the request
goes through __do_request it first releases req->r_session and then
chooses a random rank again, so it may pick a different mds rank each
time. In the worst case it will open sessions to all the mds ranks,
one by one, before the request can finally be sent out.
Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
---
fs/ceph/mds_client.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 68f3b5ed6ac8..d747e9baf9c9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
  * Called under mdsc->mutex.
  */
 static int __choose_mds(struct ceph_mds_client *mdsc,
-			struct ceph_mds_request *req)
+			struct ceph_mds_request *req,
+			bool *random)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	u32 hash = req->r_direct_hash;
 	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
 
+	if (random)
+		*random = false;
+
 	/*
 	 * is there a specific mds we should try? ignore hint if we have
 	 * no session and the mds is not up (active or recovering).
@@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	return mds;
 
 random:
+	if (random)
+		*random = true;
+
 	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
 	dout("choose_mds chose random mds%d\n", mds);
 	return mds;
@@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
 	struct ceph_mds_session *session = NULL;
 	int mds = -1;
 	int err = 0;
+	bool random;
 
 	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
 		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
@@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
 
 	put_request_session(req);
 
-	mds = __choose_mds(mdsc, req);
+	mds = __choose_mds(mdsc, req, &random);
 	if (mds < 0 ||
 	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
 		dout("do_request no mds or not active, waiting for map\n");
@@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
 			goto out_session;
 		}
 		if (session->s_state == CEPH_MDS_SESSION_NEW ||
-		    session->s_state == CEPH_MDS_SESSION_CLOSING)
+		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
 			__open_session(mdsc, session);
+			/* retry the same mds later */
+			if (random)
+				req->r_resend_mds = mds;
+		}
 		list_add(&req->r_wait, &session->s_waiting);
 		goto out_session;
 	}
@@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 			mutex_unlock(&mdsc->mutex);
 			goto out;
 		} else {
-			int mds = __choose_mds(mdsc, req);
+			int mds = __choose_mds(mdsc, req, NULL);
 			if (mds >= 0 && mds != req->r_session->s_mds) {
 				dout("but auth changed, so resending\n");
 				__do_request(mdsc, req);
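
And the same toy model with this patch's idea applied, i.e. the behaviour the changelog above describes (again only an illustration, not the kernel implementation; pinned_mds is an invented stand-in for req->r_resend_mds): once the randomly chosen rank's session is being opened, the retry sticks to that rank, so at most one extra session gets opened before the request goes out.

/*
 * Toy model of the fixed behaviour: when the randomly chosen rank's
 * session has to be opened, remember that rank (like req->r_resend_mds)
 * and retry the same rank instead of rolling the dice again.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX_MDS 3

int main(void)
{
	bool session_open[MAX_MDS] = { false };
	int pinned_mds = -1;	/* models req->r_resend_mds */
	int opened = 0, attempts = 0;

	srand((unsigned)time(NULL));

	for (;;) {
		/* honour the pinned rank first, only roll randomly when unset */
		int mds = pinned_mds >= 0 ? pinned_mds : rand() % MAX_MDS;

		attempts++;
		if (!session_open[mds]) {
			printf("attempt %d: chose mds%d, session closed -> open_session, pin rank\n",
			       attempts, mds);
			session_open[mds] = true;
			opened++;
			pinned_mds = mds;	/* retry the same mds later */
			continue;
		}

		printf("attempt %d: chose mds%d, session open -> send request\n",
		       attempts, mds);
		break;
	}

	printf("opened %d session(s) before the request was sent\n", opened);
	return 0;
}
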