Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




> On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@xxxxxxxxxxxx> wrote:
> 
> 
> 
>> -----Original Message-----
>> From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma-
>> owner@xxxxxxxxxxxxxxx> On Behalf Of Chuck Lever
>> Sent: Wednesday, October 9, 2019 11:55 AM
>> To: linux-rdma@xxxxxxxxxxxxxxx
>> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue
>> issues
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>> drivers/infiniband/core/Makefile |    2
>> drivers/infiniband/core/cq.c     |   29 +++--
>> drivers/infiniband/core/trace.c  |   14 ++
>> include/rdma/ib_verbs.h          |    2
>> include/trace/events/rdma_core.h |  218
>> ++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 255 insertions(+), 10 deletions(-)  create mode 100644
>> drivers/infiniband/core/trace.c  create mode 100644
>> include/trace/events/rdma_core.h
>> 
>> Changes since v2:
>> - Removed extraneous changes to include/trace/events/rdma.h
>> 
>> Changes since RFC:
>> - Addressed comments from Parav Pandit <parav@xxxxxxxxxxxx>
>> 
>> 
>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>> index 09881bd..68d9e27 100644
>> --- a/drivers/infiniband/core/Makefile
>> +++ b/drivers/infiniband/core/Makefile
>> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
>> verbs.o cq.o rw.o sysfs.o \
>> 				device.o fmr_pool.o cache.o netlink.o \
>> 				roce_gid_mgmt.o mr_pool.o addr.o
>> sa_query.o \
>> 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>> -				nldev.o restrack.o counters.o
>> +				nldev.o restrack.o counters.o trace.o
>> 
>> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
>> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
>> bbfded6..bcde992 100644
>> --- a/drivers/infiniband/core/cq.c
>> +++ b/drivers/infiniband/core/cq.c
>> @@ -7,6 +7,8 @@
>> #include <linux/slab.h>
>> #include <rdma/ib_verbs.h>
>> 
>> +#include <trace/events/rdma_core.h>
>> +
>> /* # of WCs to poll for with a single call to ib_poll_cq */
>> #define IB_POLL_BATCH			16
>> #define IB_POLL_BATCH_DIRECT		8
>> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
>> 
>> 	dim->state = DIM_START_MEASURE;
>> 
>> +	trace_cq_modify(cq, comps, usec);
>> 	cq->device->ops.modify_cq(cq, comps, usec);  }
>> 
>> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>> struct ib_wc *wcs,  {
>> 	int i, n, completed = 0;
>> 
>> -	/*
>> -	 * budget might be (-1) if the caller does not
>> -	 * want to bound this call, thus we need unsigned
>> -	 * minimum here.
>> -	 */
>> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
>> -					 budget - completed), wcs)) > 0) {
>> +	trace_cq_process(cq);
>> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
> Before this change, the first attempt to poll the CQ polls for min(batch, budget).
> With this change, it polls for batch.
> This is a functional change, rather than just the addition of trace points.
> I am not sure if this has any effect on the overall polling.
> But it may be worth keeping such a functional change in a pre-patch that consists of this change, moving the comment section, and the batch recalculation.

Or find a way to add the trace point without the functional change.


>> +		trace_cq_poll(cq, batch, n);
>> 		for (i = 0; i < n; i++) {
>> 			struct ib_wc *wc = &wcs[i];
>> 
>> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>> struct ib_wc *wcs,
>> 		}
>> 
>> 		completed += n;
>> -
>> 		if (n != batch || (budget != -1 && completed >= budget))
>> 			break;
>> +
>> +		/*
>> +		 * budget might be (-1) if the caller does not
>> +		 * want to bound this call, thus we need unsigned
>> +		 * minimum here.
>> +		 */
>> +		batch = min_t(u32, batch, budget - completed);
>> 	}
>> 
>> 	return completed;
>> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
>> budget)
>> 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>> 	if (completed < budget) {
>> 		irq_poll_complete(&cq->iop);
>> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
>> +			trace_cq_reschedule(cq);
>> 			irq_poll_sched(&cq->iop);
>> +		}
>> 	}
>> 
>> 	if (dim)
>> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
>> budget)
>> 
>> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)  {
>> +	trace_cq_schedule(cq);
>> 	irq_poll_sched(&cq->iop);
>> }
>> 
>> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
>> 
>> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)  {
>> +	trace_cq_schedule(cq);
>> 	queue_work(cq->comp_wq, &cq->work);
>> }
>> 
>> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
>> void *private,
>> 		goto out_destroy_cq;
>> 	}
>> 
>> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
>> 	return cq;
>> 
>> out_destroy_cq:
>> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
>> *udata)
>> 		WARN_ON_ONCE(1);
>> 	}
>> 
>> +	trace_cq_free(cq);
>> 	rdma_restrack_del(&cq->res);
>> 	cq->device->ops.destroy_cq(cq, udata);
>> 	if (cq->dim)
>> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
>> new file mode 100644 index 0000000..6c3514b
>> --- /dev/null
>> +++ b/drivers/infiniband/core/trace.c
>> @@ -0,0 +1,14 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Trace points for core RDMA functions.
>> + *
>> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> + *
>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>> + */
>> +
>> +#define CREATE_TRACE_POINTS
>> +
>> +#include <rdma/ib_verbs.h>
>> +
>> +#include <trace/events/rdma_core.h>
>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
>> 6a47ba8..95a6bce 100644
>> --- a/include/rdma/ib_verbs.h
>> +++ b/include/rdma/ib_verbs.h
>> @@ -1555,6 +1555,8 @@ struct ib_cq {
>> 	};
>> 	struct workqueue_struct *comp_wq;
>> 	struct dim *dim;
>> +	ktime_t timestamp;
>> +	bool interrupt;
> It's unclear when timestamp and interrupt are updated. A comment would help.

These are both updated only in the new trace points. Is more than
that needed in a comment?


>> 	/*
>> 	 * Implementation details of the RDMA core, don't use in drivers:
>> 	 */
>> diff --git a/include/trace/events/rdma_core.h
>> b/include/trace/events/rdma_core.h
>> new file mode 100644
>> index 0000000..c1397a3
>> --- /dev/null
>> +++ b/include/trace/events/rdma_core.h
>> @@ -0,0 +1,218 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Trace point definitions for core RDMA functions.
>> + *
>> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> + *
>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>> + */
>> +
>> +#undef TRACE_SYSTEM
>> +#define TRACE_SYSTEM rdma_core
>> +
>> +#if !defined(_TRACE_RDMA_CORE_H) ||
>> defined(TRACE_HEADER_MULTI_READ)
>> +#define _TRACE_RDMA_CORE_H
>> +
>> +#include <linux/tracepoint.h>
>> +#include <rdma/ib_verbs.h>
>> +#include <rdma/restrack.h>
>> +
>> +/*
>> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
>> +#define IB_POLL_CTX_LIST			\
>> +	ib_poll_ctx(DIRECT)			\
>> +	ib_poll_ctx(SOFTIRQ)			\
>> +	ib_poll_ctx(WORKQUEUE)			\
>> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
>> +
>> +#undef ib_poll_ctx
>> +#undef ib_poll_ctx_end
>> +
>> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
>> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
>> +
>> +IB_POLL_CTX_LIST
>> +
>> +#undef ib_poll_ctx
>> +#undef ib_poll_ctx_end
>> +
>> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
>> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
>> +
>> +#define rdma_show_ib_poll_ctx(x) \
>> +		__print_symbolic(x, IB_POLL_CTX_LIST)
>> +
>> +/**
>> + ** Completion Queue events
>> + **/
>> +
>> +TRACE_EVENT(cq_schedule,
>> +	TP_PROTO(
>> +		struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		cq->timestamp = ktime_get();
>> +		cq->interrupt = true;
>> +
>> +		__entry->id = cq->res.id;
>> +	),
>> +
>> +	TP_printk("id %u", __entry->id)
>> +);
>> +
>> +TRACE_EVENT(cq_reschedule,
>> +	TP_PROTO(
>> +		struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		cq->timestamp = ktime_get();
>> +		cq->interrupt = false;
>> +
>> +		__entry->id = cq->res.id;
>> +	),
>> +
>> +	TP_printk("id %u", __entry->id)
>> +);
>> +
>> +TRACE_EVENT(cq_process,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(s64, latency)
>> +		__field(u32, id)
>> +		__field(bool, interrupt)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
>> +
>> +		__entry->id = cq->res.id;
>> +		__entry->latency = ktime_to_us(latency);
>> +		__entry->interrupt = cq->interrupt;
>> +	),
>> +
>> +	TP_printk("id %u wake-up took %lld [us] from %s",
> It might be better to prefix 'id' with 'cq', so that in future rdma-wide trace points we can have multiple resource IDs printed consistently as qpid, cqid, mrid, etc., and don't have to rely on the function where it is used to decode what that id means.

I left out the "cq" here because the trace point names are prefixed
with "cq_". However, now that you bring it up, I can imagine cases
where a trace point might report information about two different
resources that both have a restrack ID.

How about "cq.id=%u" ?


> I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
> I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id.

On the other hand, a trace point might someday want to report the
value of a field in struct ib_cq.


>> +		__entry->id, __entry->latency,
>> +		__entry->interrupt ? "interrupt" : "reschedule"
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_poll,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq,
>> +		int requested,
>> +		int rc
>> +	),
>> +
>> +	TP_ARGS(cq, requested, rc),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +		__field(int, requested)
>> +		__field(int, rc)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +		__entry->requested = requested;
>> +		__entry->rc = rc;
>> +	),
>> +
>> +	TP_printk("id %u requested %d, returned %d",
>> +		__entry->id, __entry->requested, __entry->rc
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_modify,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq,
>> +		u16 comps,
>> +		u16 usec
>> +	),
>> +
>> +	TP_ARGS(cq, comps, usec),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +		__field(unsigned int, comps)
>> +		__field(unsigned int, usec)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +		__entry->comps = comps;
>> +		__entry->usec = usec;
>> +	),
>> +
>> +	TP_printk("id %u comps=%u usec=%u",
>> +		__entry->id, __entry->comps, __entry->usec
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_alloc,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq,
>> +		int comp_vector,
>> +		enum ib_poll_context poll_ctx
>> +	),
>> +
>> +	TP_ARGS(cq, comp_vector, poll_ctx),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +		__field(int, comp_vector)
>> +		__field(unsigned long, poll_ctx)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +		__entry->comp_vector = comp_vector;
>> +		__entry->poll_ctx = poll_ctx;
>> +	),
>> +
>> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
>> +		__entry->id, __entry->comp_vector,
>> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_free,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +	),
>> +
>> +	TP_printk("id %u", __entry->id)
>> +);
>> +
>> +#endif /* _TRACE_RDMA_CORE_H */
>> +
>> +#include <trace/define_trace.h>

--
Chuck Lever







[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux