On Mon, 2019-11-25 at 06:08 -0500, xiubli@xxxxxxxxxx wrote: > From: Xiubo Li <xiubli@xxxxxxxxxx> > > In case the max_mds > 1 in MDS cluster and there isn't any standby > MDS and all the max_mds MDSs are in up:active state, if one of the > up:active MDSs is dead, the m->m_num_laggy in kclient will be 1. > Then the mount will fail without considering other healthy MDSs. > > There may be some MDSs still "in" the cluster but not in up:active > state; we will ignore them. Only when all the up:active MDSs in > the cluster are laggy will we treat the cluster as unavailable. > > In case of decreasing the max_mds, the cluster will not stop the extra > up:active MDSs immediately and there will be a latency. During that time > the up:active MDS number will be larger than the max_mds, so later > the m_info memories will 100% be reallocated. > > Here we pick out the up:active MDSs as the m_num_mds and allocate > the needed memories once. > > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/mdsmap.c | 32 ++++++++++---------------------- > include/linux/ceph/mdsmap.h | 5 +++-- > 2 files changed, 13 insertions(+), 24 deletions(-) > > diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c > index 471bac335fae..cc9ec959fe46 100644 > --- a/fs/ceph/mdsmap.c > +++ b/fs/ceph/mdsmap.c > @@ -138,14 +138,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > m->m_session_autoclose = ceph_decode_32(p); > m->m_max_file_size = ceph_decode_64(p); > m->m_max_mds = ceph_decode_32(p); > - m->m_num_mds = m->m_max_mds; > + > + /* > + * pick out the active nodes as the m_num_mds, the m_num_mds > + * maybe larger than m_max_mds when decreasing the max_mds in > + * cluster side, in other case it should less than or equal > + * to m_max_mds. 
> + */ > + m->m_num_mds = n = ceph_decode_32(p); > + m->m_num_active_mds = m->m_num_mds; > > m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); > if (!m->m_info) > goto nomem; > > /* pick out active nodes from mds_info (state > 0) */ > - n = ceph_decode_32(p); > for (i = 0; i < n; i++) { > u64 global_id; > u32 namelen; > @@ -218,17 +225,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > if (mds < 0 || state <= 0) > continue; > > - if (mds >= m->m_num_mds) { > - int new_num = max(mds + 1, m->m_num_mds * 2); > - void *new_m_info = krealloc(m->m_info, > - new_num * sizeof(*m->m_info), > - GFP_NOFS | __GFP_ZERO); > - if (!new_m_info) > - goto nomem; > - m->m_info = new_m_info; > - m->m_num_mds = new_num; > - } > - I don't think we want to get rid of this bit. What happens if the number of MDS' increases after the mount occurs? > info = &m->m_info[mds]; > info->global_id = global_id; > info->state = state; > @@ -247,14 +243,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) > info->export_targets = NULL; > } > } > - if (m->m_num_mds > m->m_max_mds) { > - /* find max up mds */ > - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { > - if (i == 0 || m->m_info[i-1].state > 0) > - break; > - } > - m->m_num_mds = i; > - } > > /* pg_pools */ > ceph_decode_32_safe(p, end, n, bad); > @@ -396,7 +384,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) > return false; > if (m->m_damaged) > return false; > - if (m->m_num_laggy > 0) > + if (m->m_num_laggy == m->m_num_active_mds) > return false; > for (i = 0; i < m->m_num_mds; i++) { > if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) > diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h > index 0067d767c9ae..3a66f4f926ce 100644 > --- a/include/linux/ceph/mdsmap.h > +++ b/include/linux/ceph/mdsmap.h > @@ -25,8 +25,9 @@ struct ceph_mdsmap { > u32 m_session_timeout; /* seconds */ > u32 m_session_autoclose; /* seconds */ > u64 m_max_file_size; > - u32 m_max_mds; 
/* size of m_addr, m_state arrays */ > - int m_num_mds; > + u32 m_max_mds; /* expected up:active mds number */ > + int m_num_active_mds; /* actual up:active mds number */ > + int m_num_mds; /* size of m_info array */ > struct ceph_mds_info *m_info; > > /* which object pools file data can be stored in */ -- Jeff Layton <jlayton@xxxxxxxxxx>