On Mon, 2019-12-09 at 07:47 -0500, xiubli@xxxxxxxxxx wrote: > From: Xiubo Li <xiubli@xxxxxxxxxx> > > With max_mds > 1 and for a request which are choosing a random > mds rank and if the relating session is not opened yet, the request > will wait the session been opened and resend again. While every > time the request is beening __do_request, it will release the > req->session first and choose a random one again, so this time it > may choose another random mds rank. The worst case is that it will > open all the mds sessions one by one just before it can be > successfully sent out out. > > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/mds_client.c | 20 ++++++++++++++++---- > 1 file changed, 16 insertions(+), 4 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 68f3b5ed6ac8..d747e9baf9c9 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) > * Called under mdsc->mutex. > */ > static int __choose_mds(struct ceph_mds_client *mdsc, > - struct ceph_mds_request *req) > + struct ceph_mds_request *req, > + bool *random) > { > struct inode *inode; > struct ceph_inode_info *ci; > @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > u32 hash = req->r_direct_hash; > bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); > > + if (random) > + *random = false; > + > /* > * is there a specific mds we should try? ignore hint if we have > * no session and the mds is not up (active or recovering). > @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > return mds; > > random: > + if (random) > + *random = true; > + > mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); > dout("choose_mds chose random mds%d\n", mds); > return mds; > @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc, > struct ceph_mds_session *session = NULL; > int mds = -1; > int err = 0; > + bool random; > > if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { > if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) > @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc, > > put_request_session(req); > > - mds = __choose_mds(mdsc, req); > + mds = __choose_mds(mdsc, req, &random); > if (mds < 0 || > ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { > dout("do_request no mds or not active, waiting for map\n"); > @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc, > goto out_session; > } > if (session->s_state == CEPH_MDS_SESSION_NEW || > - session->s_state == CEPH_MDS_SESSION_CLOSING) > + session->s_state == CEPH_MDS_SESSION_CLOSING) { > __open_session(mdsc, session); > + /* retry the same mds later */ > + if (random) > + req->r_resend_mds = mds; > + } > list_add(&req->r_wait, &session->s_waiting); > goto out_session; > } > @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) > mutex_unlock(&mdsc->mutex); > goto out; > } else { > - int mds = __choose_mds(mdsc, req); > + int mds = __choose_mds(mdsc, req, NULL); > if (mds >= 0 && mds != req->r_session->s_mds) { > dout("but auth changed, so resending\n"); > __do_request(mdsc, req); Is there a tracker bug for this? This does seem like the behavior we'd want. I wish it were a little less Rube Goldberg to do this, but this code is already pretty messy so this doesn't really make it too much worse. In any case, merged with a reworded changelog. -- Jeff Layton <jlayton@xxxxxxxxxx>