On 2019/11/25 21:27, Jeff Layton wrote:
On Mon, 2019-11-25 at 06:08 -0500, xiubli@xxxxxxxxxx wrote:
From: Xiubo Li <xiubli@xxxxxxxxxx>
In case the max_mds > 1 in MDS cluster and there is no any standby
MDS and all the max_mds MDSs are in up:active state, if one of the
up:active MDSs is dead, the m->m_num_laggy in kclient will be 1.
Then the mount will fail without considering other healthy MDSs.
There manybe some MDSs still "in" the cluster but not in up:active
state, we will ignore them. Only when all the up:active MDSs in
the cluster are laggy will treat the cluster as not be available.
In case decreasing the max_mds, the cluster will not stop the extra
up:active MDSs immediately and there will be a latency. During it
the up:active MDS number will be larger than the max_mds, so later
the m_info memories will 100% be reallocated.
Here will pick out the up:active MDSs as the m_num_mds and allocate
the needed memories once.
Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
---
fs/ceph/mdsmap.c | 32 ++++++++++----------------------
include/linux/ceph/mdsmap.h | 5 +++--
2 files changed, 13 insertions(+), 24 deletions(-)
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 471bac335fae..cc9ec959fe46 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -138,14 +138,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
- m->m_num_mds = m->m_max_mds;
+
+ /*
+ * pick out the active nodes as the m_num_mds, the m_num_mds
+ * maybe larger than m_max_mds when decreasing the max_mds in
+ * cluster side, in other case it should less than or equal
+ * to m_max_mds.
+ */
+ m->m_num_mds = n = ceph_decode_32(p);
+ m->m_num_active_mds = m->m_num_mds;
m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
if (!m->m_info)
goto nomem;
/* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
for (i = 0; i < n; i++) {
u64 global_id;
u32 namelen;
@@ -218,17 +225,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
if (mds < 0 || state <= 0)
continue;
- if (mds >= m->m_num_mds) {
- int new_num = max(mds + 1, m->m_num_mds * 2);
- void *new_m_info = krealloc(m->m_info,
- new_num * sizeof(*m->m_info),
- GFP_NOFS | __GFP_ZERO);
- if (!new_m_info)
- goto nomem;
- m->m_info = new_m_info;
- m->m_num_mds = new_num;
- }
-
I don't think we want to get rid of this bit. What happens if the number
of MDS' increases after the mount occurs?
Every time when we receive a new version of mdsmap, the old whole
mdsc->mdsmap memory will be reallocated and replaced, no matter the MDS'
increases or decreases.
The active nodes from mds_info above will help record the actual MDS
number and then we decode it into "m_num_active_mds". If we are
decreasing the max_mds in the cluster side, the "m_num_active_mds" will
very probably be larger than the expected "m_num_mds", then we
definitely will need to reallocate the memory for m->m_info here. Why
not allocate enough memory beforehand ?
BTW, from my investigation that the mds number decoded from the mds_info
won't be larger than the "m_num_active_mds". If I am right then this
code is useless here, or we need it.
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
@@ -247,14 +243,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL;
}
}
- if (m->m_num_mds > m->m_max_mds) {
- /* find max up mds */
- for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
- if (i == 0 || m->m_info[i-1].state > 0)
- break;
- }
- m->m_num_mds = i;
- }
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
@@ -396,7 +384,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false;
if (m->m_damaged)
return false;
- if (m->m_num_laggy > 0)
+ if (m->m_num_laggy == m->m_num_active_mds)
return false;
for (i = 0; i < m->m_num_mds; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 0067d767c9ae..3a66f4f926ce 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -25,8 +25,9 @@ struct ceph_mdsmap {
u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */
u64 m_max_file_size;
- u32 m_max_mds; /* size of m_addr, m_state arrays */
- int m_num_mds;
+ u32 m_max_mds; /* expected up:active mds number */
+ int m_num_active_mds; /* actual up:active mds number */
+ int m_num_mds; /* size of m_info array */
struct ceph_mds_info *m_info;
/* which object pools file data can be stored in */