Re: [PATCH v6 6/9] ceph: periodically send perf metrics to ceph

On 2020/2/12 1:42, Ilya Dryomov wrote:
On Tue, Feb 11, 2020 at 2:30 AM Xiubo Li <xiubli@xxxxxxxxxx> wrote:
On 2020/2/10 23:34, Ilya Dryomov wrote:
On Mon, Feb 10, 2020 at 6:34 AM <xiubli@xxxxxxxxxx> wrote:
From: Xiubo Li <xiubli@xxxxxxxxxx>

Add metric_send_interval module parameter support. The default value
is 0, which means disabled. If non-zero, it enables sending the
metrics to the ceph cluster periodically, every metric_send_interval
seconds.

This will send the caps, dentry lease and read/write/metadata perf
metrics to one available MDS, only once per metric_send_interval
seconds.

URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
---
   fs/ceph/mds_client.c         | 235 +++++++++++++++++++++++++++++++----
   fs/ceph/mds_client.h         |   2 +
   fs/ceph/metric.h             |  76 +++++++++++
   fs/ceph/super.c              |   4 +
   fs/ceph/super.h              |   1 +
   include/linux/ceph/ceph_fs.h |   1 +
   6 files changed, 294 insertions(+), 25 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d414eded6810..f9a6f95c7941 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
          ceph_force_reconnect(fsc->sb);
   }

-/*
- * delayed work -- periodically trim expired leases, renew caps with mds
- */
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+                                  struct ceph_mds_session *s,
+                                  u64 nr_caps)
+{
+       struct ceph_metric_head *head;
+       struct ceph_metric_cap *cap;
+       struct ceph_metric_dentry_lease *lease;
+       struct ceph_metric_read_latency *read;
+       struct ceph_metric_write_latency *write;
+       struct ceph_metric_metadata_latency *meta;
+       struct ceph_msg *msg;
+       struct timespec64 ts;
+       s64 sum, total;
+       s32 items = 0;
+       s32 len;
+
+       if (!mdsc || !s)
+               return false;
+
+       len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read)
+             + sizeof(*write) + sizeof(*meta);
+
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+       if (!msg) {
+               pr_err("send metrics to mds%d, failed to allocate message\n",
+                      s->s_mds);
+               return false;
+       }
+
+       head = msg->front.iov_base;
+
+       /* encode the cap metric */
+       cap = (struct ceph_metric_cap *)(head + 1);
+       cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+       cap->ver = 1;
+       cap->compat = 1;
+       cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+       cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
+       cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+       cap->total = cpu_to_le64(nr_caps);
+       items++;
+
+       dout("cap metric hit %lld, mis %lld, total caps %lld",
+            le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
+            le64_to_cpu(cap->total));
+
+       /* encode the read latency metric */
+       read = (struct ceph_metric_read_latency *)(cap + 1);
+       read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+       read->ver = 1;
+       read->compat = 1;
+       read->data_len = cpu_to_le32(sizeof(*read) - 10);
+       total = percpu_counter_sum(&mdsc->metric.total_reads);
+       sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
+       jiffies_to_timespec64(sum, &ts);
+       read->sec = cpu_to_le32(ts.tv_sec);
+       read->nsec = cpu_to_le32(ts.tv_nsec);
+       items++;
+       dout("read latency metric total %lld, sum lat %lld", total, sum);
+
+       /* encode the write latency metric */
+       write = (struct ceph_metric_write_latency *)(read + 1);
+       write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+       write->ver = 1;
+       write->compat = 1;
+       write->data_len = cpu_to_le32(sizeof(*write) - 10);
+       total = percpu_counter_sum(&mdsc->metric.total_writes);
+       sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
+       jiffies_to_timespec64(sum, &ts);
+       write->sec = cpu_to_le32(ts.tv_sec);
+       write->nsec = cpu_to_le32(ts.tv_nsec);
+       items++;
+       dout("write latency metric total %lld, sum lat %lld", total, sum);
+
+       /* encode the metadata latency metric */
+       meta = (struct ceph_metric_metadata_latency *)(write + 1);
+       meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+       meta->ver = 1;
+       meta->compat = 1;
+       meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+       total = percpu_counter_sum(&mdsc->metric.total_metadatas);
+       sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
+       jiffies_to_timespec64(sum, &ts);
+       meta->sec = cpu_to_le32(ts.tv_sec);
+       meta->nsec = cpu_to_le32(ts.tv_nsec);
+       items++;
+       dout("metadata latency metric total %lld, sum lat %lld", total, sum);
+
+       /* encode the dentry lease metric */
+       lease = (struct ceph_metric_dentry_lease *)(meta + 1);
+       lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+       lease->ver = 1;
+       lease->compat = 1;
+       lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
+       lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
+       lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
+       lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
+       items++;
+       dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
+            le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
+            le64_to_cpu(lease->total));
+
+       put_unaligned_le32(items, &head->num);
+       msg->front.iov_len = len;
+       msg->hdr.version = cpu_to_le16(1);
+       msg->hdr.compat_version = cpu_to_le16(1);
+       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+       dout("send metrics to mds%d %p\n", s->s_mds, msg);
+       ceph_con_send(&s->s_con, msg);
+
+       return true;
+}
+
+#define CEPH_WORK_DELAY_DEF 5
+static void __schedule_delayed(struct delayed_work *work, int delay)
+{
+       unsigned long hz = round_jiffies_relative(HZ * delay);
+
+       schedule_delayed_work(work, hz);
+}
+
   static void schedule_delayed(struct ceph_mds_client *mdsc)
   {
-       int delay = 5;
-       unsigned hz = round_jiffies_relative(HZ * delay);
-       schedule_delayed_work(&mdsc->delayed_work, hz);
+       __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF);
+}
+
+static void metric_schedule_delayed(struct ceph_mds_client *mdsc)
+{
+       /* delay CEPH_WORK_DELAY_DEF seconds when idle */
+       int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF;
+
+       __schedule_delayed(&mdsc->metric_delayed_work, delay);
+}
+
+static bool check_session_state(struct ceph_mds_client *mdsc,
+                               struct ceph_mds_session *s)
+{
+       if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+               dout("resending session close request for mds%d\n",
+                               s->s_mds);
+               request_close_session(mdsc, s);
+               return false;
+       }
+       if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+               if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+                       s->s_state = CEPH_MDS_SESSION_HUNG;
+                       pr_info("mds%d hung\n", s->s_mds);
+               }
+       }
+       if (s->s_state == CEPH_MDS_SESSION_NEW ||
+           s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+           s->s_state == CEPH_MDS_SESSION_REJECTED)
+               /* this mds is failed or recovering, just wait */
+               return false;
+
+       return true;
   }

+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
   static void delayed_work(struct work_struct *work)
   {
          int i;
@@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work)
                  struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
                  if (!s)
                          continue;
-               if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
-                       dout("resending session close request for mds%d\n",
-                            s->s_mds);
-                       request_close_session(mdsc, s);
-                       ceph_put_mds_session(s);
-                       continue;
-               }
-               if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
-                       if (s->s_state == CEPH_MDS_SESSION_OPEN) {
-                               s->s_state = CEPH_MDS_SESSION_HUNG;
-                               pr_info("mds%d hung\n", s->s_mds);
-                       }
-               }
-               if (s->s_state == CEPH_MDS_SESSION_NEW ||
-                   s->s_state == CEPH_MDS_SESSION_RESTARTING ||
-                   s->s_state == CEPH_MDS_SESSION_REJECTED) {
-                       /* this mds is failed or recovering, just wait */
+
+               if (!check_session_state(mdsc, s)) {
                          ceph_put_mds_session(s);
                          continue;
                  }
@@ -4164,8 +4300,53 @@ static void delayed_work(struct work_struct *work)
          schedule_delayed(mdsc);
   }

-static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
+static void metric_delayed_work(struct work_struct *work)
+{
+       struct ceph_mds_client *mdsc =
+               container_of(work, struct ceph_mds_client, metric_delayed_work.work);
+       struct ceph_mds_session *s;
+       u64 nr_caps = 0;
+       bool ret;
+       int i;
+
+       if (!metric_send_interval)
+               goto idle;
+
+       dout("mdsc metric_delayed_work\n");
+
+       mutex_lock(&mdsc->mutex);
+       for (i = 0; i < mdsc->max_sessions; i++) {
+               s = __ceph_lookup_mds_session(mdsc, i);
+               if (!s)
+                       continue;
+               nr_caps += s->s_nr_caps;
+               ceph_put_mds_session(s);
+       }
+
+       for (i = 0; i < mdsc->max_sessions; i++) {
+               s = __ceph_lookup_mds_session(mdsc, i);
+               if (!s)
+                       continue;
+               if (!check_session_state(mdsc, s)) {
+                       ceph_put_mds_session(s);
+                       continue;
+               }
+
+               /* Only send the metric once in any available session */
+               ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
+               ceph_put_mds_session(s);
+               if (ret)
+                       break;
+       }
+       mutex_unlock(&mdsc->mutex);
+
+idle:
+       metric_schedule_delayed(mdsc);

Looks like this will schedule metric_delayed_work() every 5 seconds
even if metric_send_interval = 0 (i.e. sending is disabled).  What is
the reason for that?

Hi Ilya,

In earlier versions I folded metric_delayed_work() into delayed_work().
But for this version, since the interval is settable, it is hard to
calculate the next schedule delay there.

When it is idle it just loops every 5 seconds. I thought that although
this is not a very graceful approach, it won't introduce much overhead.
If we don't like this, let's switch it to a completion.

Take a look at the module_param_cb() macro.  I think you can provide
a setter and schedule the first work / modify the delay from there.
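
(For illustration, a minimal sketch of that approach. This is
hypothetical, not part of the patch: metric_send_interval is the int
the patch adds in super.c, but the setter body, the validation, and
how it would reach each client's metric_delayed_work are assumptions.)

static int param_set_metric_interval(const char *val,
                                     const struct kernel_param *kp)
{
        int interval, ret;

        ret = kstrtoint(val, 0, &interval);
        if (ret)
                return ret;
        if (interval < 0)
                return -EINVAL;

        *(int *)kp->arg = interval;

        /*
         * Hypothetical hook: walk the mounted clients and
         * mod_delayed_work(system_wq, &mdsc->metric_delayed_work,
         *                  round_jiffies_relative(interval * HZ))
         * so the new value takes effect immediately instead of
         * polling every 5 seconds while disabled.
         */
        return 0;
}

static const struct kernel_param_ops param_ops_metric_interval = {
        .set = param_set_metric_interval,
        .get = param_get_int,
};

module_param_cb(metric_send_interval, &param_ops_metric_interval,
                &metric_send_interval, 0644);
MODULE_PARM_DESC(metric_send_interval,
                 "Seconds between sending perf metrics to an MDS (0 = off)");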

Hi Ilya,

Yeah, this is what I was trying to switch to.

That said, I'm not sure making the interval configurable is a good
idea.  I'm not saying you need to change anything -- just that if it
was me, I would send these metrics once per tick (i.e. delayed_work)
with an on/off switch and no other tunables.
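
(A hypothetical sketch of that variant, for comparison; the parameter
name and its placement inside delayed_work() are illustrative, reusing
ceph_mdsc_send_metrics() from this patch:)

static bool metric_send_enabled;  /* the single on/off switch */
module_param(metric_send_enabled, bool, 0644);
MODULE_PARM_DESC(metric_send_enabled,
                 "send perf metrics to an MDS on every delayed_work tick");

static void delayed_work(struct work_struct *work)
{
        ...
        bool sent = false;
        u64 nr_caps = 0;        /* accumulated as in metric_delayed_work() */

        for (i = 0; i < mdsc->max_sessions; i++) {
                ...
                /* send once per tick, to the first healthy session */
                if (metric_send_enabled && !sent)
                        sent = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
                ...
        }
        ...
}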

Currently I still can't be sure whether sending this once per tick
would introduce any potential overhead in some cases, but for now I
can't foresee that it will.

Thanks,

Xiubo


Thanks,

                 Ilya
