On Mon, Feb 10, 2020 at 6:34 AM <xiubli@xxxxxxxxxx> wrote: > > From: Xiubo Li <xiubli@xxxxxxxxxx> > > Add metric_send_interval module parameter support, the default value > is 0, means disabled. If non-zero it will enable the transmission of > the metrics to the ceph cluster periodically per metric_send_interval > seconds. > > This will send the caps, dentry lease and read/write/metadata perf > metrics to any available MDS only once per metric_send_interval > seconds. > > URL: https://tracker.ceph.com/issues/43215 > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/mds_client.c | 235 +++++++++++++++++++++++++++++++---- > fs/ceph/mds_client.h | 2 + > fs/ceph/metric.h | 76 +++++++++++ > fs/ceph/super.c | 4 + > fs/ceph/super.h | 1 + > include/linux/ceph/ceph_fs.h | 1 + > 6 files changed, 294 insertions(+), 25 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index d414eded6810..f9a6f95c7941 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) > ceph_force_reconnect(fsc->sb); > } > > -/* > - * delayed work -- periodically trim expired leases, renew caps with mds > - */ > +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, > + struct ceph_mds_session *s, > + u64 nr_caps) > +{ > + struct ceph_metric_head *head; > + struct ceph_metric_cap *cap; > + struct ceph_metric_dentry_lease *lease; > + struct ceph_metric_read_latency *read; > + struct ceph_metric_write_latency *write; > + struct ceph_metric_metadata_latency *meta; > + struct ceph_msg *msg; > + struct timespec64 ts; > + s64 sum, total; > + s32 items = 0; > + s32 len; > + > + if (!mdsc || !s) > + return false; > + > + len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read) > + + sizeof(*write) + sizeof(*meta); > + > + msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); > + if (!msg) { > + pr_err("send metrics to mds%d, failed to allocate 
message\n", > + s->s_mds); > + return false; > + } > + > + head = msg->front.iov_base; > + > + /* encode the cap metric */ > + cap = (struct ceph_metric_cap *)(head + 1); > + cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); > + cap->ver = 1; > + cap->compat = 1; > + cap->data_len = cpu_to_le32(sizeof(*cap) - 10); > + cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); > + cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); > + cap->total = cpu_to_le64(nr_caps); > + items++; > + > + dout("cap metric hit %lld, mis %lld, total caps %lld", > + le64_to_cpu(cap->hit), le64_to_cpu(cap->mis), > + le64_to_cpu(cap->total)); > + > + /* encode the read latency metric */ > + read = (struct ceph_metric_read_latency *)(cap + 1); > + read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); > + read->ver = 1; > + read->compat = 1; > + read->data_len = cpu_to_le32(sizeof(*read) - 10); > + total = percpu_counter_sum(&mdsc->metric.total_reads), > + sum = percpu_counter_sum(&mdsc->metric.read_latency_sum); > + jiffies_to_timespec64(sum, &ts); > + read->sec = cpu_to_le32(ts.tv_sec); > + read->nsec = cpu_to_le32(ts.tv_nsec); > + items++; > + dout("read latency metric total %lld, sum lat %lld", total, sum); > + > + /* encode the write latency metric */ > + write = (struct ceph_metric_write_latency *)(read + 1); > + write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); > + write->ver = 1; > + write->compat = 1; > + write->data_len = cpu_to_le32(sizeof(*write) - 10); > + total = percpu_counter_sum(&mdsc->metric.total_writes), > + sum = percpu_counter_sum(&mdsc->metric.write_latency_sum); > + jiffies_to_timespec64(sum, &ts); > + write->sec = cpu_to_le32(ts.tv_sec); > + write->nsec = cpu_to_le32(ts.tv_nsec); > + items++; > + dout("write latency metric total %lld, sum lat %lld", total, sum); > + > + /* encode the metadata latency metric */ > + meta = (struct ceph_metric_metadata_latency *)(write + 1); > + meta->type = 
cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); > + meta->ver = 1; > + meta->compat = 1; > + meta->data_len = cpu_to_le32(sizeof(*meta) - 10); > + total = percpu_counter_sum(&mdsc->metric.total_metadatas), > + sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum); > + jiffies_to_timespec64(sum, &ts); > + meta->sec = cpu_to_le32(ts.tv_sec); > + meta->nsec = cpu_to_le32(ts.tv_nsec); > + items++; > + dout("metadata latency metric total %lld, sum lat %lld", total, sum); > + > + /* encode the dentry lease metric */ > + lease = (struct ceph_metric_dentry_lease *)(meta + 1); > + lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); > + lease->ver = 1; > + lease->compat = 1; > + lease->data_len = cpu_to_le32(sizeof(*lease) - 10); > + lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit)); > + lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis)); > + lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries)); > + items++; > + dout("dentry lease metric hit %lld, mis %lld, total dentries %lld", > + le64_to_cpu(lease->hit), le64_to_cpu(lease->mis), > + le64_to_cpu(lease->total)); > + > + put_unaligned_le32(items, &head->num); > + msg->front.iov_len = cpu_to_le32(len); > + msg->hdr.version = cpu_to_le16(1); > + msg->hdr.compat_version = cpu_to_le16(1); > + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); > + dout("send metrics to mds%d %p\n", s->s_mds, msg); > + ceph_con_send(&s->s_con, msg); > + > + return true; > +} > + > +#define CEPH_WORK_DELAY_DEF 5 > +static void __schedule_delayed(struct delayed_work *work, int delay) > +{ > + unsigned int hz = round_jiffies_relative(HZ * delay); > + > + schedule_delayed_work(work, hz); > +} > + > static void schedule_delayed(struct ceph_mds_client *mdsc) > { > - int delay = 5; > - unsigned hz = round_jiffies_relative(HZ * delay); > - schedule_delayed_work(&mdsc->delayed_work, hz); > + __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF); > +} > + > +static void 
metric_schedule_delayed(struct ceph_mds_client *mdsc) > +{ > + /* delay CEPH_WORK_DELAY_DEF seconds when idle */ > + int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF; > + > + __schedule_delayed(&mdsc->metric_delayed_work, delay); > +} > + > +static bool check_session_state(struct ceph_mds_client *mdsc, > + struct ceph_mds_session *s) > +{ > + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { > + dout("resending session close request for mds%d\n", > + s->s_mds); > + request_close_session(mdsc, s); > + return false; > + } > + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { > + if (s->s_state == CEPH_MDS_SESSION_OPEN) { > + s->s_state = CEPH_MDS_SESSION_HUNG; > + pr_info("mds%d hung\n", s->s_mds); > + } > + } > + if (s->s_state == CEPH_MDS_SESSION_NEW || > + s->s_state == CEPH_MDS_SESSION_RESTARTING || > + s->s_state == CEPH_MDS_SESSION_REJECTED) > + /* this mds is failed or recovering, just wait */ > + return false; > + > + return true; > } > > +/* > + * delayed work -- periodically trim expired leases, renew caps with mds > + */ > static void delayed_work(struct work_struct *work) > { > int i; > @@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work) > struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); > if (!s) > continue; > - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { > - dout("resending session close request for mds%d\n", > - s->s_mds); > - request_close_session(mdsc, s); > - ceph_put_mds_session(s); > - continue; > - } > - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { > - if (s->s_state == CEPH_MDS_SESSION_OPEN) { > - s->s_state = CEPH_MDS_SESSION_HUNG; > - pr_info("mds%d hung\n", s->s_mds); > - } > - } > - if (s->s_state == CEPH_MDS_SESSION_NEW || > - s->s_state == CEPH_MDS_SESSION_RESTARTING || > - s->s_state == CEPH_MDS_SESSION_REJECTED) { > - /* this mds is failed or recovering, just wait */ > + > + if (!check_session_state(mdsc, s)) { > ceph_put_mds_session(s); > continue; > } > @@ -4164,8 +4300,53 @@ static void 
delayed_work(struct work_struct *work) > schedule_delayed(mdsc); > } > > -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric) > +static void metric_delayed_work(struct work_struct *work) > +{ > + struct ceph_mds_client *mdsc = > + container_of(work, struct ceph_mds_client, metric_delayed_work.work); > + struct ceph_mds_session *s; > + u64 nr_caps = 0; > + bool ret; > + int i; > + > + if (!metric_send_interval) > + goto idle; > + > + dout("mdsc metric_delayed_work\n"); > + > + mutex_lock(&mdsc->mutex); > + for (i = 0; i < mdsc->max_sessions; i++) { > + s = __ceph_lookup_mds_session(mdsc, i); > + if (!s) > + continue; > + nr_caps += s->s_nr_caps; > + ceph_put_mds_session(s); > + } > + > + for (i = 0; i < mdsc->max_sessions; i++) { > + s = __ceph_lookup_mds_session(mdsc, i); > + if (!s) > + continue; > + if (!check_session_state(mdsc, s)) { > + ceph_put_mds_session(s); > + continue; > + } > + > + /* Only send the metric once in any available session */ > + ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps); > + ceph_put_mds_session(s); > + if (ret) > + break; > + } > + mutex_unlock(&mdsc->mutex); > + > +idle: > + metric_schedule_delayed(mdsc); Looks like this will schedule metric_delayed_work() every 5 seconds even if metric_send_interval = 0 (i.e. sending is disabled). What is the reason for that? Thanks, Ilya