Re: [PATCH resend v5 08/11] ceph: periodically send perf metrics to MDS

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 2020-01-29 at 03:27 -0500, xiubli@xxxxxxxxxx wrote:
> From: Xiubo Li <xiubli@xxxxxxxxxx>
> 
> Add enable/disable sending metrics to MDS debugfs and disabled as
> default, if it's enabled the kclient will send metrics every
> second.
> 
> This will send global dentry lease hit/miss and read/write/metadata
> latency metrics and each session's caps hit/miss metric to MDS.
> 
> Every time only sends the global metrics once via any availible
> session.
> 
> URL: https://tracker.ceph.com/issues/43215
> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> ---
>  fs/ceph/debugfs.c            |  44 +++++++-
>  fs/ceph/mds_client.c         | 201 ++++++++++++++++++++++++++++++++---
>  fs/ceph/mds_client.h         |   3 +
>  fs/ceph/metric.h             |  76 +++++++++++++
>  fs/ceph/super.h              |   1 +
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 307 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 7fd031c18309..8aae7ecea54a 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -124,6 +124,40 @@ static int mdsc_show(struct seq_file *s, void *p)
>  	return 0;
>  }
>  
> +/*
> + * metrics debugfs
> + */
> +static int sending_metrics_set(void *data, u64 val)
> +{
> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +
> +	if (val > 1) {
> +		pr_err("Invalid sending metrics set value %llu\n", val);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&mdsc->mutex);
> +	mdsc->sending_metrics = (unsigned int)val;

Shouldn't that be a bool cast? Do we even need a cast there?

> +	mutex_unlock(&mdsc->mutex);
> +
> +	return 0;
> +}
> +
> +static int sending_metrics_get(void *data, u64 *val)
> +{
> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +
> +	mutex_lock(&mdsc->mutex);
> +	*val = (u64)mdsc->sending_metrics;
> +	mutex_unlock(&mdsc->mutex);
> +
> +	return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
> +			sending_metrics_set, "%llu\n");
> +

I'd like to hear more about how we expect users to use this facility.
This debugfs file doesn't seem consistent with the rest of the UI, and I
imagine if the box reboots you'd have to (manually) re-enable it after
mount, right? Maybe this should be a mount option instead?


>  static int metric_show(struct seq_file *s, void *p)
>  {
>  	struct ceph_fs_client *fsc = s->private;
> @@ -302,11 +336,9 @@ static int congestion_kb_get(void *data, u64 *val)
>  	*val = (u64)fsc->mount_options->congestion_kb;
>  	return 0;
>  }
> -
>  DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
>  			congestion_kb_set, "%llu\n");
>  
> -
>  void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
>  {
>  	dout("ceph_fs_debugfs_cleanup\n");
> @@ -316,6 +348,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
>  	debugfs_remove(fsc->debugfs_mds_sessions);
>  	debugfs_remove(fsc->debugfs_caps);
>  	debugfs_remove(fsc->debugfs_metric);
> +	debugfs_remove(fsc->debugfs_sending_metrics);
>  	debugfs_remove(fsc->debugfs_mdsc);
>  }
>  
> @@ -356,6 +389,13 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
>  						fsc,
>  						&mdsc_show_fops);
>  
> +	fsc->debugfs_sending_metrics =
> +			debugfs_create_file("sending_metrics",
> +					    0600,
> +					    fsc->client->debugfs_dir,
> +					    fsc,
> +					    &sending_metrics_fops);
> +
>  	fsc->debugfs_metric = debugfs_create_file("metrics",
>  						  0400,
>  						  fsc->client->debugfs_dir,
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 92a933810a79..d765804dc855 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4104,13 +4104,156 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>  	ceph_force_reconnect(fsc->sb);
>  }
>  
> +/*
> + * called under s_mutex
> + */
> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> +				   struct ceph_mds_session *s,
> +				   bool skip_global)
> +{
> +	struct ceph_metric_head *head;
> +	struct ceph_metric_cap *cap;
> +	struct ceph_metric_dentry_lease *lease;
> +	struct ceph_metric_read_latency *read;
> +	struct ceph_metric_write_latency *write;
> +	struct ceph_metric_metadata_latency *meta;
> +	struct ceph_msg *msg;
> +	struct timespec64 ts;
> +	s32 len = sizeof(*head) + sizeof(*cap);
> +	s64 sum, total, avg;
> +	s32 items = 0;
> +
> +	if (!mdsc || !s)
> +		return false;
> +
> +	if (!skip_global) {
> +		len += sizeof(*lease);
> +		len += sizeof(*read);
> +		len += sizeof(*write);
> +		len += sizeof(*meta);
> +	}
> +
> +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> +	if (!msg) {
> +		pr_err("send metrics to mds%d, failed to allocate message\n",
> +		       s->s_mds);
> +		return false;
> +	}
> +
> +	head = msg->front.iov_base;
> +
> +	/* encode the cap metric */
> +	cap = (struct ceph_metric_cap *)(head + 1);
> +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> +	cap->ver = 1;
> +	cap->campat = 1;
> +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> +	cap->hit = cpu_to_le64(percpu_counter_sum(&s->i_caps_hit));
> +	cap->mis = cpu_to_le64(percpu_counter_sum(&s->i_caps_mis));
> +	cap->total = cpu_to_le64(s->s_nr_caps);
> +	items++;
> +
> +	dout("cap metric hit %lld, mis %lld, total caps %lld",
> +	     le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
> +	     le64_to_cpu(cap->total));
> +
> +	/* only send the global once */
> +	if (skip_global)
> +		goto skip_global;
> +
> +	/* encode the dentry lease metric */
> +	lease = (struct ceph_metric_dentry_lease *)(cap + 1);
> +	lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
> +	lease->ver = 1;
> +	lease->campat = 1;
> +	lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
> +	lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
> +	lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
> +	lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
> +	items++;
> +
> +	dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
> +	     le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
> +	     le64_to_cpu(lease->total));
> +
> +	/* encode the read latency metric */
> +	read = (struct ceph_metric_read_latency *)(lease + 1);
> +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> +	read->ver = 1;
> +	read->campat = 1;
> +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_reads),
> +	sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	read->sec = cpu_to_le32(ts.tv_sec);
> +	read->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("read latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +	/* encode the write latency metric */
> +	write = (struct ceph_metric_write_latency *)(read + 1);
> +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> +	write->ver = 1;
> +	write->campat = 1;
> +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_writes),
> +	sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	write->sec = cpu_to_le32(ts.tv_sec);
> +	write->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("write latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +	/* encode the metadata latency metric */
> +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
> +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> +	meta->ver = 1;
> +	meta->campat = 1;
> +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_metadatas),
> +	sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	meta->sec = cpu_to_le32(ts.tv_sec);
> +	meta->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("metadata latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +skip_global:
> +	put_unaligned_le32(items, &head->num);
> +	msg->front.iov_len = cpu_to_le32(len);
> +	msg->hdr.version = cpu_to_le16(1);
> +	msg->hdr.compat_version = cpu_to_le16(1);
> +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> +	dout("send metrics to mds%d %p\n", s->s_mds, msg);
> +	ceph_con_send(&s->s_con, msg);
> +
> +	return true;
> +}
> +
>  /*
>   * delayed work -- periodically trim expired leases, renew caps with mds
>   */
> +#define CEPH_WORK_DELAY_DEF 5
>  static void schedule_delayed(struct ceph_mds_client *mdsc)
>  {
> -	int delay = 5;
> -	unsigned hz = round_jiffies_relative(HZ * delay);
> +	unsigned int hz;
> +	int delay = CEPH_WORK_DELAY_DEF;
> +
> +	mutex_lock(&mdsc->mutex);
> +	if (mdsc->sending_metrics)
> +		delay = 1;
> +	mutex_unlock(&mdsc->mutex);
> +

The mdsc->mutex is dropped in the callers a little before this is
called, so this is a little too mutex-thrashy. I think you'd be better
off changing this function to be called with the mutex still held.

> +	hz = round_jiffies_relative(HZ * delay);
>  	schedule_delayed_work(&mdsc->delayed_work, hz);
>  }
>  
> @@ -4121,18 +4264,28 @@ static void delayed_work(struct work_struct *work)
>  		container_of(work, struct ceph_mds_client, delayed_work.work);
>  	int renew_interval;
>  	int renew_caps;
> +	bool metric_only;
> +	bool sending_metrics;
> +	bool g_skip = false;
>  
>  	dout("mdsc delayed_work\n");
>  
>  	mutex_lock(&mdsc->mutex);
> -	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> -	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> -				   mdsc->last_renew_caps);
> -	if (renew_caps)
> -		mdsc->last_renew_caps = jiffies;
> +	sending_metrics = !!mdsc->sending_metrics;
> +	metric_only = mdsc->sending_metrics &&
> +		(mdsc->ticks++ % CEPH_WORK_DELAY_DEF);
> +
> +	if (!metric_only) {
> +		renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> +		renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> +					   mdsc->last_renew_caps);
> +		if (renew_caps)
> +			mdsc->last_renew_caps = jiffies;
> +	}
>  
>  	for (i = 0; i < mdsc->max_sessions; i++) {
>  		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
> +
>  		if (!s)
>  			continue;
>  		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> @@ -4158,13 +4311,20 @@ static void delayed_work(struct work_struct *work)
>  		mutex_unlock(&mdsc->mutex);
>  
>  		mutex_lock(&s->s_mutex);
> -		if (renew_caps)
> -			send_renew_caps(mdsc, s);
> -		else
> -			ceph_con_keepalive(&s->s_con);
> -		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
> -		    s->s_state == CEPH_MDS_SESSION_HUNG)
> -			ceph_send_cap_releases(mdsc, s);
> +
> +		if (sending_metrics)
> +			g_skip = ceph_mdsc_send_metrics(mdsc, s, g_skip);
> +
> +		if (!metric_only) {
> +			if (renew_caps)
> +				send_renew_caps(mdsc, s);
> +			else
> +				ceph_con_keepalive(&s->s_con);
> +			if (s->s_state == CEPH_MDS_SESSION_OPEN ||
> +					s->s_state == CEPH_MDS_SESSION_HUNG)
> +				ceph_send_cap_releases(mdsc, s);
> +		}
> +
>  		mutex_unlock(&s->s_mutex);
>  		ceph_put_mds_session(s);
>  
> @@ -4172,6 +4332,9 @@ static void delayed_work(struct work_struct *work)
>  	}
>  	mutex_unlock(&mdsc->mutex);
>  
> +	if (metric_only)
> +		goto delay_work;
> +
>  	ceph_check_delayed_caps(mdsc);
>  
>  	ceph_queue_cap_reclaim_work(mdsc);
> @@ -4180,11 +4343,13 @@ static void delayed_work(struct work_struct *work)
>  
>  	maybe_recover_session(mdsc);
>  
> +delay_work:
>  	schedule_delayed(mdsc);
>  }
>  
> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
> +static int ceph_mdsc_metric_init(struct ceph_mds_client *mdsc)
>  {
> +	struct ceph_client_metric *metric = &mdsc->metric;
>  	int ret;
>  
>  	if (!metric)
> @@ -4222,7 +4387,9 @@ static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
>  	if (ret)
>  		goto err_metadata_latency_sum;
>  
> -	return ret;
> +	mdsc->sending_metrics = 0;
> +	mdsc->ticks = 0;
> +	return 0;
>  err_metadata_latency_sum:
>  	percpu_counter_destroy(&metric->total_metadatas);
>  err_total_metadatas:
> @@ -4294,7 +4461,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>  	init_waitqueue_head(&mdsc->cap_flushing_wq);
>  	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
>  	atomic_set(&mdsc->cap_reclaim_pending, 0);
> -	err = ceph_mdsc_metric_init(&mdsc->metric);
> +	err = ceph_mdsc_metric_init(mdsc);
>  	if (err)
>  		goto err_mdsmap;
>  
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 574d4e5a5de2..a0ece55d987c 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -451,6 +451,9 @@ struct ceph_mds_client {
>  	struct list_head  dentry_leases;     /* fifo list */
>  	struct list_head  dentry_dir_leases; /* lru list */
>  
> +	/* metrics */
> +	unsigned int		  sending_metrics;
> +	unsigned int		  ticks;
>  	struct ceph_client_metric metric;
>  
>  	spinlock_t		snapid_map_lock;
> diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
> index 3cda616ba594..352eb753ce25 100644
> --- a/fs/ceph/metric.h
> +++ b/fs/ceph/metric.h
> @@ -4,6 +4,82 @@
>  
>  #include <linux/ceph/osd_client.h>
>  
> +enum ceph_metric_type {
> +	CLIENT_METRIC_TYPE_CAP_INFO,
> +	CLIENT_METRIC_TYPE_READ_LATENCY,
> +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
> +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
> +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +
> +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +};
> +
> +/* metric caps header */
> +struct ceph_metric_cap {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;

I think you meant "compat" here.

> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __attribute__ ((packed));
> +
> +/* metric dentry lease header */
> +struct ceph_metric_dentry_lease {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __attribute__ ((packed));
> +
> +/* metric read latency header */
> +struct ceph_metric_read_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +/* metric write latency header */
> +struct ceph_metric_write_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +/* metric metadata latency header */
> +struct ceph_metric_metadata_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +struct ceph_metric_head {
> +	__le32 num;	/* the number of metrics will be sent */

"the number of metrics that will be sent"

> +} __attribute__ ((packed));
> +
>  /* This is the global metrics */
>  struct ceph_client_metric {
>  	atomic64_t		total_dentries;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 3f4829222528..a91431e9bdf7 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -128,6 +128,7 @@ struct ceph_fs_client {
>  	struct dentry *debugfs_congestion_kb;
>  	struct dentry *debugfs_bdi;
>  	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
> +	struct dentry *debugfs_sending_metrics;
>  	struct dentry *debugfs_metric;
>  	struct dentry *debugfs_mds_sessions;
>  #endif
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index a099f60feb7b..6028d3e865e4 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>  #define CEPH_MSG_CLIENT_REQUEST         24
>  #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>  #define CEPH_MSG_CLIENT_REPLY           26
> +#define CEPH_MSG_CLIENT_METRICS         29
>  #define CEPH_MSG_CLIENT_CAPS            0x310
>  #define CEPH_MSG_CLIENT_LEASE           0x311
>  #define CEPH_MSG_CLIENT_SNAP            0x312

-- 
Jeff Layton <jlayton@xxxxxxxxxx>




[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux