On Tue, Feb 11, 2020 at 2:30 AM Xiubo Li <xiubli@xxxxxxxxxx> wrote: > > On 2020/2/10 23:34, Ilya Dryomov wrote: > > On Mon, Feb 10, 2020 at 6:34 AM <xiubli@xxxxxxxxxx> wrote: > >> From: Xiubo Li <xiubli@xxxxxxxxxx> > >> > >> Add metric_send_interval module parameter support, the default value > >> is 0, which means disabled. If non-zero it will enable the transmission of > >> the metrics to the ceph cluster periodically per metric_send_interval > >> seconds. > >> > >> This will send the caps, dentry lease and read/write/metadata perf > >> metrics to any available MDS only once per metric_send_interval > >> seconds. > >> > >> URL: https://tracker.ceph.com/issues/43215 > >> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > >> --- > >> fs/ceph/mds_client.c | 235 +++++++++++++++++++++++++++++++---- > >> fs/ceph/mds_client.h | 2 + > >> fs/ceph/metric.h | 76 +++++++++++ > >> fs/ceph/super.c | 4 + > >> fs/ceph/super.h | 1 + > >> include/linux/ceph/ceph_fs.h | 1 + > >> 6 files changed, 294 insertions(+), 25 deletions(-) > >> > >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > >> index d414eded6810..f9a6f95c7941 100644 > >> --- a/fs/ceph/mds_client.c > >> +++ b/fs/ceph/mds_client.c > >> @@ -4085,16 +4085,167 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) > >> ceph_force_reconnect(fsc->sb); > >> } > >> > >> -/* > >> - * delayed work -- periodically trim expired leases, renew caps with mds > >> - */ > >> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, > >> + struct ceph_mds_session *s, > >> + u64 nr_caps) > >> +{ > >> + struct ceph_metric_head *head; > >> + struct ceph_metric_cap *cap; > >> + struct ceph_metric_dentry_lease *lease; > >> + struct ceph_metric_read_latency *read; > >> + struct ceph_metric_write_latency *write; > >> + struct ceph_metric_metadata_latency *meta; > >> + struct ceph_msg *msg; > >> + struct timespec64 ts; > >> + s64 sum, total; > >> + s32 items = 0; > >> + s32 len; > >> + > >> + if (!mdsc || !s) > >> + return 
false; > >> + > >> + len = sizeof(*head) + sizeof(*cap) + sizeof(*lease) + sizeof(*read) > >> + + sizeof(*write) + sizeof(*meta); > >> + > >> + msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); > >> + if (!msg) { > >> + pr_err("send metrics to mds%d, failed to allocate message\n", > >> + s->s_mds); > >> + return false; > >> + } > >> + > >> + head = msg->front.iov_base; > >> + > >> + /* encode the cap metric */ > >> + cap = (struct ceph_metric_cap *)(head + 1); > >> + cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); > >> + cap->ver = 1; > >> + cap->compat = 1; > >> + cap->data_len = cpu_to_le32(sizeof(*cap) - 10); > >> + cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); > >> + cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); > >> + cap->total = cpu_to_le64(nr_caps); > >> + items++; > >> + > >> + dout("cap metric hit %lld, mis %lld, total caps %lld", > >> + le64_to_cpu(cap->hit), le64_to_cpu(cap->mis), > >> + le64_to_cpu(cap->total)); > >> + > >> + /* encode the read latency metric */ > >> + read = (struct ceph_metric_read_latency *)(cap + 1); > >> + read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); > >> + read->ver = 1; > >> + read->compat = 1; > >> + read->data_len = cpu_to_le32(sizeof(*read) - 10); > >> + total = percpu_counter_sum(&mdsc->metric.total_reads), > >> + sum = percpu_counter_sum(&mdsc->metric.read_latency_sum); > >> + jiffies_to_timespec64(sum, &ts); > >> + read->sec = cpu_to_le32(ts.tv_sec); > >> + read->nsec = cpu_to_le32(ts.tv_nsec); > >> + items++; > >> + dout("read latency metric total %lld, sum lat %lld", total, sum); > >> + > >> + /* encode the write latency metric */ > >> + write = (struct ceph_metric_write_latency *)(read + 1); > >> + write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); > >> + write->ver = 1; > >> + write->compat = 1; > >> + write->data_len = cpu_to_le32(sizeof(*write) - 10); > >> + total = percpu_counter_sum(&mdsc->metric.total_writes), > >> + sum 
= percpu_counter_sum(&mdsc->metric.write_latency_sum); > >> + jiffies_to_timespec64(sum, &ts); > >> + write->sec = cpu_to_le32(ts.tv_sec); > >> + write->nsec = cpu_to_le32(ts.tv_nsec); > >> + items++; > >> + dout("write latency metric total %lld, sum lat %lld", total, sum); > >> + > >> + /* encode the metadata latency metric */ > >> + meta = (struct ceph_metric_metadata_latency *)(write + 1); > >> + meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); > >> + meta->ver = 1; > >> + meta->compat = 1; > >> + meta->data_len = cpu_to_le32(sizeof(*meta) - 10); > >> + total = percpu_counter_sum(&mdsc->metric.total_metadatas), > >> + sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum); > >> + jiffies_to_timespec64(sum, &ts); > >> + meta->sec = cpu_to_le32(ts.tv_sec); > >> + meta->nsec = cpu_to_le32(ts.tv_nsec); > >> + items++; > >> + dout("metadata latency metric total %lld, sum lat %lld", total, sum); > >> + > >> + /* encode the dentry lease metric */ > >> + lease = (struct ceph_metric_dentry_lease *)(meta + 1); > >> + lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); > >> + lease->ver = 1; > >> + lease->compat = 1; > >> + lease->data_len = cpu_to_le32(sizeof(*lease) - 10); > >> + lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit)); > >> + lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis)); > >> + lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries)); > >> + items++; > >> + dout("dentry lease metric hit %lld, mis %lld, total dentries %lld", > >> + le64_to_cpu(lease->hit), le64_to_cpu(lease->mis), > >> + le64_to_cpu(lease->total)); > >> + > >> + put_unaligned_le32(items, &head->num); > >> + msg->front.iov_len = cpu_to_le32(len); > >> + msg->hdr.version = cpu_to_le16(1); > >> + msg->hdr.compat_version = cpu_to_le16(1); > >> + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); > >> + dout("send metrics to mds%d %p\n", s->s_mds, msg); > >> + ceph_con_send(&s->s_con, msg); > >> + > >> + 
return true; > >> +} > >> + > >> +#define CEPH_WORK_DELAY_DEF 5 > >> +static void __schedule_delayed(struct delayed_work *work, int delay) > >> +{ > >> + unsigned int hz = round_jiffies_relative(HZ * delay); > >> + > >> + schedule_delayed_work(work, hz); > >> +} > >> + > >> static void schedule_delayed(struct ceph_mds_client *mdsc) > >> { > >> - int delay = 5; > >> - unsigned hz = round_jiffies_relative(HZ * delay); > >> - schedule_delayed_work(&mdsc->delayed_work, hz); > >> + __schedule_delayed(&mdsc->delayed_work, CEPH_WORK_DELAY_DEF); > >> +} > >> + > >> +static void metric_schedule_delayed(struct ceph_mds_client *mdsc) > >> +{ > >> + /* delay CEPH_WORK_DELAY_DEF seconds when idle */ > >> + int delay = metric_send_interval ? : CEPH_WORK_DELAY_DEF; > >> + > >> + __schedule_delayed(&mdsc->metric_delayed_work, delay); > >> +} > >> + > >> +static bool check_session_state(struct ceph_mds_client *mdsc, > >> + struct ceph_mds_session *s) > >> +{ > >> + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { > >> + dout("resending session close request for mds%d\n", > >> + s->s_mds); > >> + request_close_session(mdsc, s); > >> + return false; > >> + } > >> + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { > >> + if (s->s_state == CEPH_MDS_SESSION_OPEN) { > >> + s->s_state = CEPH_MDS_SESSION_HUNG; > >> + pr_info("mds%d hung\n", s->s_mds); > >> + } > >> + } > >> + if (s->s_state == CEPH_MDS_SESSION_NEW || > >> + s->s_state == CEPH_MDS_SESSION_RESTARTING || > >> + s->s_state == CEPH_MDS_SESSION_REJECTED) > >> + /* this mds is failed or recovering, just wait */ > >> + return false; > >> + > >> + return true; > >> } > >> > >> +/* > >> + * delayed work -- periodically trim expired leases, renew caps with mds > >> + */ > >> static void delayed_work(struct work_struct *work) > >> { > >> int i; > >> @@ -4116,23 +4267,8 @@ static void delayed_work(struct work_struct *work) > >> struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); > >> if (!s) > >> continue; > >> - if 
(s->s_state == CEPH_MDS_SESSION_CLOSING) { > >> - dout("resending session close request for mds%d\n", > >> - s->s_mds); > >> - request_close_session(mdsc, s); > >> - ceph_put_mds_session(s); > >> - continue; > >> - } > >> - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { > >> - if (s->s_state == CEPH_MDS_SESSION_OPEN) { > >> - s->s_state = CEPH_MDS_SESSION_HUNG; > >> - pr_info("mds%d hung\n", s->s_mds); > >> - } > >> - } > >> - if (s->s_state == CEPH_MDS_SESSION_NEW || > >> - s->s_state == CEPH_MDS_SESSION_RESTARTING || > >> - s->s_state == CEPH_MDS_SESSION_REJECTED) { > >> - /* this mds is failed or recovering, just wait */ > >> + > >> + if (!check_session_state(mdsc, s)) { > >> ceph_put_mds_session(s); > >> continue; > >> } > >> @@ -4164,8 +4300,53 @@ static void delayed_work(struct work_struct *work) > >> schedule_delayed(mdsc); > >> } > >> > >> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric) > >> +static void metric_delayed_work(struct work_struct *work) > >> +{ > >> + struct ceph_mds_client *mdsc = > >> + container_of(work, struct ceph_mds_client, metric_delayed_work.work); > >> + struct ceph_mds_session *s; > >> + u64 nr_caps = 0; > >> + bool ret; > >> + int i; > >> + > >> + if (!metric_send_interval) > >> + goto idle; > >> + > >> + dout("mdsc metric_delayed_work\n"); > >> + > >> + mutex_lock(&mdsc->mutex); > >> + for (i = 0; i < mdsc->max_sessions; i++) { > >> + s = __ceph_lookup_mds_session(mdsc, i); > >> + if (!s) > >> + continue; > >> + nr_caps += s->s_nr_caps; > >> + ceph_put_mds_session(s); > >> + } > >> + > >> + for (i = 0; i < mdsc->max_sessions; i++) { > >> + s = __ceph_lookup_mds_session(mdsc, i); > >> + if (!s) > >> + continue; > >> + if (!check_session_state(mdsc, s)) { > >> + ceph_put_mds_session(s); > >> + continue; > >> + } > >> + > >> + /* Only send the metric once in any available session */ > >> + ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps); > >> + ceph_put_mds_session(s); > >> + if (ret) > >> + break; > >> + } > >> 
+ mutex_unlock(&mdsc->mutex); > >> + > >> +idle: > >> + metric_schedule_delayed(mdsc); > > Looks like this will schedule metric_delayed_work() every 5 seconds > > even if metric_send_interval = 0 (i.e. sending is disabled). What is > > the reason for that? > > Hi Ilya, > > Previously I folded metric_delayed_work() into delayed_work(). But in > this version, since the interval is settable, it is hard to calculate > the next schedule delay for that. > > When it is idle it just loops every 5 seconds. I thought that, although this is > not a very graceful approach, it won't introduce too much overhead. If we > do not like this, let's switch it to a completion. Take a look at the module_param_cb macro. I think you can provide a setter and schedule the first work / modify the delay from there. That said, I'm not sure making the interval configurable is a good idea. I'm not saying you need to change anything -- just that if it were me, I would send these metrics once per tick (i.e. delayed_work) with an on/off switch and no other tunables. Thanks, Ilya