> -----Original Message----- > From: Chuck Lever <chuck.lever@xxxxxxxxxx> > Sent: Friday, October 11, 2019 12:39 PM > To: Parav Pandit <parav@xxxxxxxxxxxx> > Cc: linux-rdma@xxxxxxxxxxxxxxx > Subject: Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue > issues > > > > > On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@xxxxxxxxxxxx> wrote: > > > > > > > >> -----Original Message----- > >> From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma- > >> owner@xxxxxxxxxxxxxxx> On Behalf Of Chuck Lever > >> Sent: Wednesday, October 9, 2019 11:55 AM > >> To: linux-rdma@xxxxxxxxxxxxxxx > >> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion > >> queue issues > >> > >> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > >> --- > >> drivers/infiniband/core/Makefile | 2 > >> drivers/infiniband/core/cq.c | 29 +++-- > >> drivers/infiniband/core/trace.c | 14 ++ > >> include/rdma/ib_verbs.h | 2 > >> include/trace/events/rdma_core.h | 218 > >> ++++++++++++++++++++++++++++++++++++++ > >> 5 files changed, 255 insertions(+), 10 deletions(-) create mode > >> 100644 drivers/infiniband/core/trace.c create mode 100644 > >> include/trace/events/rdma_core.h > >> > >> Changes since v2: > >> - Removed extraneous changes to include/trace/events/rdma.h > >> > >> Changes since RFC: > >> - Addressed comments from Parav Pandit <parav@xxxxxxxxxxxx> > >> > >> > >> diff --git a/drivers/infiniband/core/Makefile > >> b/drivers/infiniband/core/Makefile > >> index 09881bd..68d9e27 100644 > >> --- a/drivers/infiniband/core/Makefile > >> +++ b/drivers/infiniband/core/Makefile > >> @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o > >> verbs.o cq.o rw.o sysfs.o \ > >> device.o fmr_pool.o cache.o netlink.o \ > >> roce_gid_mgmt.o mr_pool.o addr.o > >> sa_query.o \ > >> multicast.o mad.o smi.o agent.o mad_rmpp.o \ > >> - nldev.o restrack.o counters.o > >> + nldev.o restrack.o counters.o trace.o > >> > >> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o > >> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git > >> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index > >> bbfded6..bcde992 100644 > >> --- a/drivers/infiniband/core/cq.c > >> +++ b/drivers/infiniband/core/cq.c > >> @@ -7,6 +7,8 @@ > >> #include <linux/slab.h> > >> #include <rdma/ib_verbs.h> > >> > >> +#include <trace/events/rdma_core.h> > >> + > >> /* # of WCs to poll for with a single call to ib_poll_cq */ > >> #define IB_POLL_BATCH 16 > >> #define IB_POLL_BATCH_DIRECT 8 > >> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct > >> *w) > >> > >> dim->state = DIM_START_MEASURE; > >> > >> + trace_cq_modify(cq, comps, usec); > >> cq->device->ops.modify_cq(cq, comps, usec); } > >> > >> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int > >> budget, struct ib_wc *wcs, { > >> int i, n, completed = 0; > >> > >> - /* > >> - * budget might be (-1) if the caller does not > >> - * want to bound this call, thus we need unsigned > >> - * minimum here. > >> - */ > >> - while ((n = ib_poll_cq(cq, min_t(u32, batch, > >> - budget - completed), wcs)) > 0) { > >> + trace_cq_process(cq); > >> + while ((n = ib_poll_cq(cq, batch, wcs)) > 0) { > > Before this change, on first attempt to poll the cq, it will poll for min(batch, > budget). > > With this change, it will poll for batch. > > This is functional change than just adding the trace points. > > I am not sure if this has any effect on the overall polling. > > But it may be worth to keep such functional change in pre-patch which consist > of this change, moving comment section, batch recalculation. > > Or find a way to add the trace point without the functional change. > > :-) yes. > >> + trace_cq_poll(cq, batch, n); > >> for (i = 0; i < n; i++) { > >> struct ib_wc *wc = &wcs[i]; > >> > >> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int > >> budget, struct ib_wc *wcs, > >> } > >> > >> completed += n; > >> - > >> if (n != batch || (budget != -1 && completed >= budget)) > >> break; > >> + > >> + /* > >> + * budget might be (-1) if the caller does not > >> + * want to bound this call, thus we need unsigned > >> + * minimum here. > >> + */ > >> + batch = min_t(u32, batch, budget - completed); > >> } > >> > >> return completed; > >> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, > >> int > >> budget) > >> completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); > >> if (completed < budget) { > >> irq_poll_complete(&cq->iop); > >> - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) > >> + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { > >> + trace_cq_reschedule(cq); > >> irq_poll_sched(&cq->iop); > >> + } > >> } > >> > >> if (dim) > >> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, > >> int > >> budget) > >> > >> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) > >> { > >> + trace_cq_schedule(cq); > >> irq_poll_sched(&cq->iop); > >> } > >> > >> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct > >> *work) > >> > >> static void ib_cq_completion_workqueue(struct ib_cq *cq, void > >> *private) { > >> + trace_cq_schedule(cq); > >> queue_work(cq->comp_wq, &cq->work); > >> } > >> > >> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device > >> *dev, void *private, > >> goto out_destroy_cq; > >> } > >> > >> + trace_cq_alloc(cq, comp_vector, poll_ctx); > >> return cq; > >> > >> out_destroy_cq: > >> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct > >> ib_udata > >> *udata) > >> WARN_ON_ONCE(1); > >> } > >> > >> + trace_cq_free(cq); > >> rdma_restrack_del(&cq->res); > >> cq->device->ops.destroy_cq(cq, udata); > >> if (cq->dim) > >> diff --git a/drivers/infiniband/core/trace.c > >> b/drivers/infiniband/core/trace.c new file mode 100644 index > >> 0000000..6c3514b > >> --- /dev/null > >> +++ b/drivers/infiniband/core/trace.c > >> @@ -0,0 +1,14 @@ > >> +// SPDX-License-Identifier: GPL-2.0-only > >> +/* > >> + * Trace points for core RDMA functions. > >> + * > >> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx> > >> + * > >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. > >> + */ > >> + > >> +#define CREATE_TRACE_POINTS > >> + > >> +#include <rdma/ib_verbs.h> > >> + > >> +#include <trace/events/rdma_core.h> > >> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index > >> 6a47ba8..95a6bce 100644 > >> --- a/include/rdma/ib_verbs.h > >> +++ b/include/rdma/ib_verbs.h > >> @@ -1555,6 +1555,8 @@ struct ib_cq { > >> }; > >> struct workqueue_struct *comp_wq; > >> struct dim *dim; > >> + ktime_t timestamp; > >> + bool interrupt; > > Its unclear when to update timestamp and interrupt. Comment will help. > > These are both updated only in the new trace points. Is more than that needed > in a comment? > No. It wasn't clear to me that when to update the timestamp. But I guess its obvious.. > > >> /* > >> * Implementation details of the RDMA core, don't use in drivers: > >> */ > >> diff --git a/include/trace/events/rdma_core.h > >> b/include/trace/events/rdma_core.h > >> new file mode 100644 > >> index 0000000..c1397a3 > >> --- /dev/null > >> +++ b/include/trace/events/rdma_core.h > >> @@ -0,0 +1,218 @@ > >> +/* SPDX-License-Identifier: GPL-2.0-only */ > >> +/* > >> + * Trace point definitions for core RDMA functions. > >> + * > >> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx> > >> + * > >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. > >> + */ > >> + > >> +#undef TRACE_SYSTEM > >> +#define TRACE_SYSTEM rdma_core > >> + > >> +#if !defined(_TRACE_RDMA_CORE_H) || > >> defined(TRACE_HEADER_MULTI_READ) > >> +#define _TRACE_RDMA_CORE_H > >> + > >> +#include <linux/tracepoint.h> > >> +#include <rdma/ib_verbs.h> > >> +#include <rdma/restrack.h> > >> + > >> +/* > >> + * enum ib_poll_context, from include/rdma/ib_verbs.h */ > >> +#define IB_POLL_CTX_LIST \ > >> + ib_poll_ctx(DIRECT) \ > >> + ib_poll_ctx(SOFTIRQ) \ > >> + ib_poll_ctx(WORKQUEUE) \ > >> + ib_poll_ctx_end(UNBOUND_WORKQUEUE) > >> + > >> +#undef ib_poll_ctx > >> +#undef ib_poll_ctx_end > >> + > >> +#define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x); > >> +#define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x); > >> + > >> +IB_POLL_CTX_LIST > >> + > >> +#undef ib_poll_ctx > >> +#undef ib_poll_ctx_end > >> + > >> +#define ib_poll_ctx(x) { IB_POLL_##x, #x }, > >> +#define ib_poll_ctx_end(x) { IB_POLL_##x, #x } > >> + > >> +#define rdma_show_ib_poll_ctx(x) \ > >> + __print_symbolic(x, IB_POLL_CTX_LIST) > >> + > >> +/** > >> + ** Completion Queue events > >> + **/ > >> + > >> +TRACE_EVENT(cq_schedule, > >> + TP_PROTO( > >> + struct ib_cq *cq > >> + ), > >> + > >> + TP_ARGS(cq), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32, id) > >> + ), > >> + > >> + TP_fast_assign( > >> + cq->timestamp = ktime_get(); > >> + cq->interrupt = true; > >> + > >> + __entry->id = cq->res.id; > >> + ), > >> + > >> + TP_printk("id %u", __entry->id) > >> +); > >> + > >> +TRACE_EVENT(cq_reschedule, > >> + TP_PROTO( > >> + struct ib_cq *cq > >> + ), > >> + > >> + TP_ARGS(cq), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32, id) > >> + ), > >> + > >> + TP_fast_assign( > >> + cq->timestamp = ktime_get(); > >> + cq->interrupt = false; > >> + > >> + __entry->id = cq->res.id; > >> + ), > >> + > >> + TP_printk("id %u", __entry->id) > >> +); > >> + > >> +TRACE_EVENT(cq_process, > >> + TP_PROTO( > >> + const struct ib_cq *cq > >> + ), > >> + > >> + TP_ARGS(cq), > >> + > >> + TP_STRUCT__entry( > >> + __field(s64, latency) > >> + __field(u32, id) > >> + __field(bool, interrupt) > >> + ), > >> + > >> + TP_fast_assign( > >> + ktime_t latency = ktime_sub(ktime_get(), cq->timestamp); > >> + > >> + __entry->id = cq->res.id; > >> + __entry->latency = ktime_to_us(latency); > >> + __entry->interrupt = cq->interrupt; > >> + ), > >> + > >> + TP_printk("id %u wake-up took %lld [us] from %s", > > It might be better to prefix 'id' with 'cq', so that in future rdma wide trace > points, we can have multiple resource id's printed consistently as qpid, cqid, > mrid etc; and don't have to rely on the function where it is used to decode what > that id means. > > I left out the "cq" here because the trace point names are prefixed with "cq_". > However, now that you bring it up, I can imagine cases where a trace point > might report information about two different resources that both have a > restrack ID. > > How about "cq.id=%u" ? > Looks good. > > > I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*. > > I was thinking of rdma_restrack_entry*, as it makes future code for other > resources also anchored on the resource id. > > On the other hand, a trace point might someday want to report the value of a > field in struct ib_cq. > Yes. that is exactly why I had mix thoughts. So I think ib_cq is ok. > > >> + __entry->id, __entry->latency, > >> + __entry->interrupt ? "interrupt" : "reschedule" > >> + ) > >> +); > >> + > >> +TRACE_EVENT(cq_poll, > >> + TP_PROTO( > >> + const struct ib_cq *cq, > >> + int requested, > >> + int rc > >> + ), > >> + > >> + TP_ARGS(cq, requested, rc), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32, id) > >> + __field(int, requested) > >> + __field(int, rc) > >> + ), > >> + > >> + TP_fast_assign( > >> + __entry->id = cq->res.id; > >> + __entry->requested = requested; > >> + __entry->rc = rc; > >> + ), > >> + > >> + TP_printk("id %u requested %d, returned %d", > >> + __entry->id, __entry->requested, __entry->rc > >> + ) > >> +); > >> + > >> +TRACE_EVENT(cq_modify, > >> + TP_PROTO( > >> + const struct ib_cq *cq, > >> + u16 comps, > >> + u16 usec > >> + ), > >> + > >> + TP_ARGS(cq, comps, usec), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32, id) > >> + __field(unsigned int, comps) > >> + __field(unsigned int, usec) > >> + ), > >> + > >> + TP_fast_assign( > >> + __entry->id = cq->res.id; > >> + __entry->comps = comps; > >> + __entry->usec = usec; > >> + ), > >> + > >> + TP_printk("id %u comps=%u usec=%u", > >> + __entry->id, __entry->comps, __entry->usec > >> + ) > >> +); > >> + > >> +TRACE_EVENT(cq_alloc, > >> + TP_PROTO( > >> + const struct ib_cq *cq, > >> + int comp_vector, > >> + enum ib_poll_context poll_ctx > >> + ), > >> + > >> + TP_ARGS(cq, comp_vector, poll_ctx), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32, id) > >> + __field(int, comp_vector) > >> + __field(unsigned long, poll_ctx) > >> + ), > >> + > >> + TP_fast_assign( > >> + __entry->id = cq->res.id; > >> + __entry->comp_vector = comp_vector; > >> + __entry->poll_ctx = poll_ctx; > >> + ), > >> + > >> + TP_printk("id %u comp_vector=%d poll_ctx=%s", > >> + __entry->id, __entry->comp_vector, > >> + rdma_show_ib_poll_ctx(__entry->poll_ctx) > >> + ) > >> +); > >> + > >> +TRACE_EVENT(cq_free, > >> + TP_PROTO( > >> + const struct ib_cq *cq > >> + ), > >> + > >> + TP_ARGS(cq), > >> + > >> + TP_STRUCT__entry( > >> + __field(u32, id) > >> + ), > >> + > >> + TP_fast_assign( > >> + __entry->id = cq->res.id; > >> + ), > >> + > >> + TP_printk("id %u", __entry->id) > >> +); > >> + > >> +#endif /* _TRACE_RDMA_CORE_H */ > >> + > >> +#include <trace/define_trace.h> > > -- > Chuck Lever > >