From: Xiubo Li <xiubli@xxxxxxxxxx>
This will send the caps/read/write/metadata metrics to any available
MDS once per second by default, the same as the userland client, or
once every metric_send_interval seconds, where metric_send_interval
is a module parameter.
Skip any MDS session that doesn't support metric collection;
otherwise the MDS will close the socket connection as soon as it
gets a message of an unknown type.
URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
---
fs/ceph/mds_client.c | 3 +
fs/ceph/mds_client.h | 4 +-
fs/ceph/metric.c | 142 +++++++++++++++++++++++++++++++++++++++++++
fs/ceph/metric.h | 78 ++++++++++++++++++++++++
fs/ceph/super.c | 42 +++++++++++++
fs/ceph/super.h | 2 +
include/linux/ceph/ceph_fs.h | 1 +
7 files changed, 271 insertions(+), 1 deletion(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 608fb5c..f996363 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4625,6 +4625,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
dout("stopped\n");
}
@@ -4667,6 +4668,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
if (mdsc->mdsmap)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
@@ -4824,6 +4826,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
schedule_delayed(mdsc);
+ metric_schedule_delayed(&mdsc->metric);
return;
bad_unlock:
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bcb3892..3c65ac1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -28,8 +28,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_METRIC_COLLECT,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
};
/*
@@ -43,6 +44,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
+ CEPHFS_FEATURE_METRIC_COLLECT, \
\
CEPHFS_FEATURE_MAX, \
}
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 9217f35..4267b46 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -1,10 +1,150 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
#include <linux/types.h>
#include <linux/percpu_counter.h>
#include <linux/math64.h>
#include "metric.h"
+#include "mds_client.h"
+
+/*
+ * Build a CEPH_MSG_CLIENT_METRICS message carrying the cap, read latency,
+ * write latency and metadata latency metrics and send it on session @s.
+ *
+ * @nr_caps is reported as the "total" field of the cap metric.
+ *
+ * Returns true once the message has been handed to ceph_con_send(), or
+ * false if the message could not be allocated.
+ */
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *s,
+				   u64 nr_caps)
+{
+	struct ceph_metric_head *head;
+	struct ceph_metric_cap *cap;
+	struct ceph_metric_read_latency *read;
+	struct ceph_metric_write_latency *write;
+	struct ceph_metric_metadata_latency *meta;
+	struct ceph_client_metric *m = &mdsc->metric;
+	struct ceph_msg *msg;
+	struct timespec64 ts;
+	s64 sum;
+	s32 items = 0;
+	s32 len;
+
+	/* The payload is the head followed by the four metric records. */
+	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+	      + sizeof(*meta);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+	if (!msg) {
+		pr_err("send metrics to mds%d, failed to allocate message\n",
+		       s->s_mds);
+		return false;
+	}
+
+	head = msg->front.iov_base;
+
+	/*
+	 * NOTE(review): the "- 10" in each data_len below presumably excludes
+	 * the type/ver/compat/data_len fields that lead every record --
+	 * confirm against the struct definitions in metric.h.
+	 */
+
+	/* encode the cap metric */
+	cap = (struct ceph_metric_cap *)(head + 1);
+	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+	cap->ver = 1;
+	cap->compat = 1;
+	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+	cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit));
+	cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis));
+	cap->total = cpu_to_le64(nr_caps);
+	items++;
+
+	/* encode the read latency metric */
+	read = (struct ceph_metric_read_latency *)(cap + 1);
+	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+	read->ver = 1;
+	read->compat = 1;
+	read->data_len = cpu_to_le32(sizeof(*read) - 10);
+	sum = m->read_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	read->sec = cpu_to_le32(ts.tv_sec);
+	read->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	/* encode the write latency metric */
+	write = (struct ceph_metric_write_latency *)(read + 1);
+	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+	write->ver = 1;
+	write->compat = 1;
+	write->data_len = cpu_to_le32(sizeof(*write) - 10);
+	sum = m->write_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	write->sec = cpu_to_le32(ts.tv_sec);
+	write->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	/* encode the metadata latency metric */
+	meta = (struct ceph_metric_metadata_latency *)(write + 1);
+	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+	meta->ver = 1;
+	meta->compat = 1;
+	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+	sum = m->metadata_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	meta->sec = cpu_to_le32(ts.tv_sec);
+	meta->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	put_unaligned_le32(items, &head->num);
+	/*
+	 * front.iov_len is a CPU-endian length; only the on-wire header
+	 * field hdr.front_len is little-endian.
+	 */
+	msg->front.iov_len = len;
+	msg->hdr.version = cpu_to_le16(1);
+	msg->hdr.compat_version = cpu_to_le16(1);
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	dout("client%llu send metrics to mds%d\n",
+	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
+	ceph_con_send(&s->s_con, msg);
+
+	return true;
+}
+
+/*
+ * Delayed-work handler that reports the client metrics to an MDS.
+ *
+ * Under mdsc->mutex it first sums s_nr_caps over all sessions, then walks
+ * the sessions again and sends the metrics on the first one that passes
+ * check_session_state() and advertises CEPHFS_FEATURE_METRIC_COLLECT.
+ * The work is re-armed via metric_schedule_delayed() before returning.
+ */
+static void metric_delayed_work(struct work_struct *work)
+{
+	struct ceph_client_metric *m =
+		container_of(work, struct ceph_client_metric, delayed_work.work);
+	struct ceph_mds_client *mdsc =
+		container_of(m, struct ceph_mds_client, metric);
+	struct ceph_mds_session *s;
+	u64 nr_caps = 0;
+	bool ret;
+	int i;
+
+	mutex_lock(&mdsc->mutex);
+
+	/* Total the caps held across every session. */
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		nr_caps += s->s_nr_caps;
+		ceph_put_mds_session(s);
+	}
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		if (!check_session_state(mdsc, s)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		/*
+		 * Skip it if the MDS doesn't support metric collection,
+		 * otherwise the MDS will close the session's socket
+		 * connection as soon as it gets this message.  The
+		 * reference taken by the lookup must still be dropped.
+		 */
+		if (!test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		/* Only send the metrics once, on any available session */
+		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
+		ceph_put_mds_session(s);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	metric_schedule_delayed(&mdsc->metric);
+}