On Wed, 2019-12-04 at 06:57 -0500, xiubli@xxxxxxxxxx wrote: > From: Xiubo Li <xiubli@xxxxxxxxxx> > > The m_num_mds here is actually the number for MDSs which are in > up:active status, and it will be duplicated to m_num_active_mds, > so remove it. > > Add possible_max_rank to the mdsmap struct and this will be > the correctly possible largest rank boundary. > > Remove the special case for one mds in __mdsmap_get_random_mds(), > because the validate mds rank may not always be 0. > > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/debugfs.c | 2 +- > fs/ceph/mds_client.c | 10 ++++---- > fs/ceph/mdsmap.c | 49 +++++++++++++++---------------------- > include/linux/ceph/mdsmap.h | 10 ++++---- > 4 files changed, 31 insertions(+), 40 deletions(-) > > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c > index facb387c2735..0b1591e76077 100644 > --- a/fs/ceph/debugfs.c > +++ b/fs/ceph/debugfs.c > @@ -33,7 +33,7 @@ static int mdsmap_show(struct seq_file *s, void *p) > seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); > seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); > seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); > - for (i = 0; i < mdsmap->m_num_mds; i++) { > + for (i = 0; i < mdsmap->possible_max_rank; i++) { > struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; > int state = mdsmap->m_info[i].state; > seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 39f4d8501df5..036d388ddb10 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -597,7 +597,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, > { > struct ceph_mds_session *s; > > - if (mds >= mdsc->mdsmap->m_num_mds) > + if (mds >= mdsc->mdsmap->possible_max_rank) > return ERR_PTR(-EINVAL); > > s = kzalloc(sizeof(*s), GFP_NOFS); > @@ -1222,7 +1222,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, > struct ceph_mds_session *ts; > int i, mds = session->s_mds; > > - if (mds >= mdsc->mdsmap->m_num_mds) > + if (mds >= mdsc->mdsmap->possible_max_rank) > return; > > mi = &mdsc->mdsmap->m_info[mds]; > @@ -3762,7 +3762,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, > dout("check_new_map new %u old %u\n", > newmap->m_epoch, oldmap->m_epoch); > > - for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { > + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { > if (!mdsc->sessions[i]) > continue; > s = mdsc->sessions[i]; > @@ -3776,7 +3776,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, > ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", > ceph_session_state_name(s->s_state)); > > - if (i >= newmap->m_num_mds) { > + if (i >= newmap->possible_max_rank) { > /* force close session for stopped mds */ > get_session(s); > __unregister_session(mdsc, s); > @@ -3833,7 +3833,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, > } > } > > - for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { > + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { > s = mdsc->sessions[i]; > if (!s) > continue; > diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c > index a77e0ecb9a6b..889627817e52 100644 > --- a/fs/ceph/mdsmap.c > +++ b/fs/ceph/mdsmap.c > @@ -14,22 +14,15 @@ > #include "super.h" > > #define CEPH_MDS_IS_READY(i, ignore_laggy) \ > - (m->m_info[i].state > 0 && (ignore_laggy ? true : !m->m_info[i].laggy)) > + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) > > static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) > { > int n = 0; > int i, j; > > - /* > - * special case for one mds, no matter it is laggy or > - * not we have no choice > - */ > - if (1 == m->m_num_mds && m->m_info[0].state > 0) > - return 0; > - > /* count */ > - for (i = 0; i < m->m_num_mds; i++) > + for (i = 0; i < m->possible_max_rank; i++) > if (CEPH_MDS_IS_READY(i, ignore_laggy)) > n++; > if (n == 0) > @@ -37,7 +30,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) > > /* pick */ > n = prandom_u32() % n; > - for (j = 0, i = 0; i < m->m_num_mds; i++) { > + for (j = 0, i = 0; i < m->possible_max_rank; i++) { > if (CEPH_MDS_IS_READY(i, ignore_laggy)) > j++; > if (j > n) > @@ -55,10 +48,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) > int mds; > > mds = __mdsmap_get_random_mds(m, false); > - if (mds == m->m_num_mds || mds == -1) > + if (mds == m->possible_max_rank || mds == -1) > mds = __mdsmap_get_random_mds(m, true); > > - return mds == m->m_num_mds ? -1 : mds; > + return mds == m->possible_max_rank ? -1 : mds; > } > > #define __decode_and_drop_type(p, end, type, bad) \ > @@ -129,7 +122,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > int err; > u8 mdsmap_v, mdsmap_cv; > u16 mdsmap_ev; > - u32 possible_max_rank; > > m = kzalloc(sizeof(*m), GFP_NOFS); > if (!m) > @@ -157,24 +149,23 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > m->m_max_mds = ceph_decode_32(p); > > /* > - * pick out the active nodes as the m_num_mds, the m_num_mds > - * maybe larger than m_max_mds when decreasing the max_mds in > - * cluster side, in other case it should less than or equal > - * to m_max_mds. > + * pick out the active nodes as the m_num_active_mds, the > + * m_num_active_mds maybe larger than m_max_mds when decreasing > + * the max_mds in cluster side, in other case it should less > + * than or equal to m_max_mds. > */ > - m->m_num_mds = n = ceph_decode_32(p); > - m->m_num_active_mds = m->m_num_mds; > + m->m_num_active_mds = n = ceph_decode_32(p); > > /* > - * the possible max rank, it maybe larger than the m->m_num_mds, > + * the possible max rank, it maybe larger than the m_num_active_mds, > * for example if the mds_max == 2 in the cluster, when the MDS(0) > * was laggy and being replaced by a new MDS, we will temporarily > * receive a new mds map with n_num_mds == 1 and the active MDS(1), > - * and the mds rank >= m->m_num_mds. > + * and the mds rank >= m_num_active_mds. > */ > - possible_max_rank = max((u32)m->m_num_mds, m->m_max_mds); > + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); > > - m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); > + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); > if (!m->m_info) > goto nomem; > > @@ -248,7 +239,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > ceph_mds_state_name(state), > laggy ? "(laggy)" : ""); > > - if (mds < 0 || mds >= possible_max_rank) { > + if (mds < 0 || mds >= m->possible_max_rank) { > pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); > continue; > } > @@ -318,14 +309,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > > for (i = 0; i < n; i++) { > s32 mds = ceph_decode_32(p); > - if (mds >= 0 && mds < m->m_num_mds) { > + if (mds >= 0 && mds < m->possible_max_rank) { > if (m->m_info[mds].laggy) > num_laggy++; > } > } > m->m_num_laggy = num_laggy; > > - if (n > m->m_num_mds) { > + if (n > m->possible_max_rank) { > void *new_m_info = krealloc(m->m_info, > n * sizeof(*m->m_info), > GFP_NOFS | __GFP_ZERO); > @@ -333,7 +324,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > goto nomem; > m->m_info = new_m_info; > } > - m->m_num_mds = n; > + m->possible_max_rank = n; > } > > /* inc */ > @@ -404,7 +395,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) > { > int i; > > - for (i = 0; i < m->m_num_mds; i++) > + for (i = 0; i < m->possible_max_rank; i++) > kfree(m->m_info[i].export_targets); > kfree(m->m_info); > kfree(m->m_data_pg_pools); > @@ -420,7 +411,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) > return false; > if (m->m_num_laggy == m->m_num_active_mds) > return false; > - for (i = 0; i < m->m_num_mds; i++) { > + for (i = 0; i < m->possible_max_rank; i++) { > if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) > nr_active++; > } > diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h > index 3a66f4f926ce..35d385296fbb 100644 > --- a/include/linux/ceph/mdsmap.h > +++ b/include/linux/ceph/mdsmap.h > @@ -26,8 +26,8 @@ struct ceph_mdsmap { > u32 m_session_autoclose; /* seconds */ > u64 m_max_file_size; > u32 m_max_mds; /* expected up:active mds number */ > - int m_num_active_mds; /* actual up:active mds number */ > - int m_num_mds; /* size of m_info array */ > + u32 m_num_active_mds; /* actual up:active mds number */ > + u32 possible_max_rank; /* possible max rank index */ > struct ceph_mds_info *m_info; > > /* which object pools file data can be stored in */ > @@ -43,7 +43,7 @@ struct ceph_mdsmap { > static inline struct ceph_entity_addr * > ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) > { > - if (w >= m->m_num_mds) > + if (w >= m->possible_max_rank) > return NULL; > return &m->m_info[w].addr; > } > @@ -51,14 +51,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) > static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) > { > BUG_ON(w < 0); > - if (w >= m->m_num_mds) > + if (w >= m->possible_max_rank) > return CEPH_MDS_STATE_DNE; > return m->m_info[w].state; > } > > static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) > { > - if (w >= 0 && w < m->m_num_mds) > + if (w >= 0 && w < m->possible_max_rank) > return m->m_info[w].laggy; > return false; > } Looks reasonable -- merged. -- Jeff Layton <jlayton@xxxxxxxxxx>