> On Oct 11, 2019, at 1:38 PM, Chuck Lever <chuck.lever@xxxxxxxxxx> wrote: > > > >> On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@xxxxxxxxxxxx> wrote: >> >> >> >>> -----Original Message----- >>> From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma- >>> owner@xxxxxxxxxxxxxxx> On Behalf Of Chuck Lever >>> Sent: Wednesday, October 9, 2019 11:55 AM >>> To: linux-rdma@xxxxxxxxxxxxxxx >>> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue >>> issues >>> >>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> >>> --- >>> drivers/infiniband/core/Makefile | 2 >>> drivers/infiniband/core/cq.c | 29 +++-- >>> drivers/infiniband/core/trace.c | 14 ++ >>> include/rdma/ib_verbs.h | 2 >>> include/trace/events/rdma_core.h | 218 >>> ++++++++++++++++++++++++++++++++++++++ >>> 5 files changed, 255 insertions(+), 10 deletions(-) create mode 100644 >>> drivers/infiniband/core/trace.c create mode 100644 >>> include/trace/events/rdma_core.h >>> >>> Changes since v2: >>> - Removed extraneous changes to include/trace/events/rdma.h >>> >>> Changes since RFC: >>> - Addressed comments from Parav Pandit <parav@xxxxxxxxxxxx> >>> >>> >>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile >>> index 09881bd..68d9e27 100644 >>> --- a/drivers/infiniband/core/Makefile >>> +++ b/drivers/infiniband/core/Makefile >>> @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o >>> verbs.o cq.o rw.o sysfs.o \ >>> device.o fmr_pool.o cache.o netlink.o \ >>> roce_gid_mgmt.o mr_pool.o addr.o >>> sa_query.o \ >>> multicast.o mad.o smi.o agent.o mad_rmpp.o \ >>> - nldev.o restrack.o counters.o >>> + nldev.o restrack.o counters.o trace.o >>> >>> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o >>> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git >>> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index >>> bbfded6..bcde992 100644 >>> --- a/drivers/infiniband/core/cq.c >>> +++ b/drivers/infiniband/core/cq.c >>> @@ -7,6 +7,8 @@ >>> #include 
<linux/slab.h> >>> #include <rdma/ib_verbs.h> >>> >>> +#include <trace/events/rdma_core.h> >>> + >>> /* # of WCs to poll for with a single call to ib_poll_cq */ >>> #define IB_POLL_BATCH 16 >>> #define IB_POLL_BATCH_DIRECT 8 >>> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w) >>> >>> dim->state = DIM_START_MEASURE; >>> >>> + trace_cq_modify(cq, comps, usec); >>> cq->device->ops.modify_cq(cq, comps, usec); } >>> >>> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget, >>> struct ib_wc *wcs, { >>> int i, n, completed = 0; >>> >>> - /* >>> - * budget might be (-1) if the caller does not >>> - * want to bound this call, thus we need unsigned >>> - * minimum here. >>> - */ >>> - while ((n = ib_poll_cq(cq, min_t(u32, batch, >>> - budget - completed), wcs)) > 0) { >>> + trace_cq_process(cq); >>> + while ((n = ib_poll_cq(cq, batch, wcs)) > 0) { >> Before this change, on the first attempt to poll the cq, it will poll for min(batch, budget). >> With this change, it will poll for batch. >> This is a functional change rather than just adding the trace points. >> I am not sure if this has any effect on the overall polling. >> But it may be worth keeping such a functional change in a pre-patch which consists of this change, moving the comment section, and the batch recalculation. > > Or find a way to add the trace point without the functional change. > > >>> + trace_cq_poll(cq, batch, n); >>> for (i = 0; i < n; i++) { >>> struct ib_wc *wc = &wcs[i]; >>> >>> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget, >>> struct ib_wc *wcs, >>> } >>> >>> completed += n; >>> - >>> if (n != batch || (budget != -1 && completed >= budget)) >>> break; >>> + >>> + /* >>> + * budget might be (-1) if the caller does not >>> + * want to bound this call, thus we need unsigned >>> + * minimum here. 
>>> + */ >>> + batch = min_t(u32, batch, budget - completed); >>> } >>> >>> return completed; >>> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int >>> budget) >>> completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); >>> if (completed < budget) { >>> irq_poll_complete(&cq->iop); >>> - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) >>> + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { >>> + trace_cq_reschedule(cq); >>> irq_poll_sched(&cq->iop); >>> + } >>> } >>> >>> if (dim) >>> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int >>> budget) >>> >>> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) { >>> + trace_cq_schedule(cq); >>> irq_poll_sched(&cq->iop); >>> } >>> >>> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work) >>> >>> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { >>> + trace_cq_schedule(cq); >>> queue_work(cq->comp_wq, &cq->work); >>> } >>> >>> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, >>> void *private, >>> goto out_destroy_cq; >>> } >>> >>> + trace_cq_alloc(cq, comp_vector, poll_ctx); >>> return cq; >>> >>> out_destroy_cq: >>> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata >>> *udata) >>> WARN_ON_ONCE(1); >>> } >>> >>> + trace_cq_free(cq); >>> rdma_restrack_del(&cq->res); >>> cq->device->ops.destroy_cq(cq, udata); >>> if (cq->dim) >>> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c >>> new file mode 100644 index 0000000..6c3514b >>> --- /dev/null >>> +++ b/drivers/infiniband/core/trace.c >>> @@ -0,0 +1,14 @@ >>> +// SPDX-License-Identifier: GPL-2.0-only >>> +/* >>> + * Trace points for core RDMA functions. >>> + * >>> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx> >>> + * >>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
>>> + */ >>> + >>> +#define CREATE_TRACE_POINTS >>> + >>> +#include <rdma/ib_verbs.h> >>> + >>> +#include <trace/events/rdma_core.h> >>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index >>> 6a47ba8..95a6bce 100644 >>> --- a/include/rdma/ib_verbs.h >>> +++ b/include/rdma/ib_verbs.h >>> @@ -1555,6 +1555,8 @@ struct ib_cq { >>> }; >>> struct workqueue_struct *comp_wq; >>> struct dim *dim; >>> + ktime_t timestamp; >>> + bool interrupt; >> It's unclear when to update timestamp and interrupt. A comment would help. > > These are both updated only in the new trace points. Is more than > that needed in a comment? > > >>> /* >>> * Implementation details of the RDMA core, don't use in drivers: >>> */ >>> diff --git a/include/trace/events/rdma_core.h >>> b/include/trace/events/rdma_core.h >>> new file mode 100644 >>> index 0000000..c1397a3 >>> --- /dev/null >>> +++ b/include/trace/events/rdma_core.h >>> @@ -0,0 +1,218 @@ >>> +/* SPDX-License-Identifier: GPL-2.0-only */ >>> +/* >>> + * Trace point definitions for core RDMA functions. >>> + * >>> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx> >>> + * >>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
>>> + */ >>> + >>> +#undef TRACE_SYSTEM >>> +#define TRACE_SYSTEM rdma_core >>> + >>> +#if !defined(_TRACE_RDMA_CORE_H) || >>> defined(TRACE_HEADER_MULTI_READ) >>> +#define _TRACE_RDMA_CORE_H >>> + >>> +#include <linux/tracepoint.h> >>> +#include <rdma/ib_verbs.h> >>> +#include <rdma/restrack.h> >>> + >>> +/* >>> + * enum ib_poll_context, from include/rdma/ib_verbs.h */ >>> +#define IB_POLL_CTX_LIST \ >>> + ib_poll_ctx(DIRECT) \ >>> + ib_poll_ctx(SOFTIRQ) \ >>> + ib_poll_ctx(WORKQUEUE) \ >>> + ib_poll_ctx_end(UNBOUND_WORKQUEUE) >>> + >>> +#undef ib_poll_ctx >>> +#undef ib_poll_ctx_end >>> + >>> +#define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x); >>> +#define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x); >>> + >>> +IB_POLL_CTX_LIST >>> + >>> +#undef ib_poll_ctx >>> +#undef ib_poll_ctx_end >>> + >>> +#define ib_poll_ctx(x) { IB_POLL_##x, #x }, >>> +#define ib_poll_ctx_end(x) { IB_POLL_##x, #x } >>> + >>> +#define rdma_show_ib_poll_ctx(x) \ >>> + __print_symbolic(x, IB_POLL_CTX_LIST) >>> + >>> +/** >>> + ** Completion Queue events >>> + **/ >>> + >>> +TRACE_EVENT(cq_schedule, >>> + TP_PROTO( >>> + struct ib_cq *cq >>> + ), >>> + >>> + TP_ARGS(cq), >>> + >>> + TP_STRUCT__entry( >>> + __field(u32, id) >>> + ), >>> + >>> + TP_fast_assign( >>> + cq->timestamp = ktime_get(); >>> + cq->interrupt = true; >>> + >>> + __entry->id = cq->res.id; >>> + ), >>> + >>> + TP_printk("id %u", __entry->id) >>> +); >>> + >>> +TRACE_EVENT(cq_reschedule, >>> + TP_PROTO( >>> + struct ib_cq *cq >>> + ), >>> + >>> + TP_ARGS(cq), >>> + >>> + TP_STRUCT__entry( >>> + __field(u32, id) >>> + ), >>> + >>> + TP_fast_assign( >>> + cq->timestamp = ktime_get(); >>> + cq->interrupt = false; >>> + >>> + __entry->id = cq->res.id; >>> + ), >>> + >>> + TP_printk("id %u", __entry->id) >>> +); >>> + >>> +TRACE_EVENT(cq_process, >>> + TP_PROTO( >>> + const struct ib_cq *cq >>> + ), >>> + >>> + TP_ARGS(cq), >>> + >>> + TP_STRUCT__entry( >>> + __field(s64, latency) >>> + __field(u32, id) >>> + 
__field(bool, interrupt) >>> + ), >>> + >>> + TP_fast_assign( >>> + ktime_t latency = ktime_sub(ktime_get(), cq->timestamp); >>> + >>> + __entry->id = cq->res.id; >>> + __entry->latency = ktime_to_us(latency); >>> + __entry->interrupt = cq->interrupt; >>> + ), >>> + >>> + TP_printk("id %u wake-up took %lld [us] from %s", >> It might be better to prefix 'id' with 'cq', so that in future rdma wide trace points, we can have multiple resource id's printed consistently as qpid, cqid, mrid etc; and don't have to rely on the function where it is used to decode what that id means. > > I left out the "cq" here because the trace point names are prefixed > with "cq_". However, now that you bring it up, I can imagine cases > where a trace point might report information about two different > resources that both have a restrack ID. > > How about "cq.id=%u" ? > > >> I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*. >> I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id. > > On the other hand, a trace point might someday want to report the > value of a field in struct ib_cq. Btw, is it worth adding a helper function to return res.id? u32 rdma_restrack_id(const struct rdma_restrack_entry *res) { return res->id; } Or is this just noise? >>> + __entry->id, __entry->latency, >>> + __entry->interrupt ? 
"interrupt" : "reschedule" >>> + ) >>> +); >>> + >>> +TRACE_EVENT(cq_poll, >>> + TP_PROTO( >>> + const struct ib_cq *cq, >>> + int requested, >>> + int rc >>> + ), >>> + >>> + TP_ARGS(cq, requested, rc), >>> + >>> + TP_STRUCT__entry( >>> + __field(u32, id) >>> + __field(int, requested) >>> + __field(int, rc) >>> + ), >>> + >>> + TP_fast_assign( >>> + __entry->id = cq->res.id; >>> + __entry->requested = requested; >>> + __entry->rc = rc; >>> + ), >>> + >>> + TP_printk("id %u requested %d, returned %d", >>> + __entry->id, __entry->requested, __entry->rc >>> + ) >>> +); >>> + >>> +TRACE_EVENT(cq_modify, >>> + TP_PROTO( >>> + const struct ib_cq *cq, >>> + u16 comps, >>> + u16 usec >>> + ), >>> + >>> + TP_ARGS(cq, comps, usec), >>> + >>> + TP_STRUCT__entry( >>> + __field(u32, id) >>> + __field(unsigned int, comps) >>> + __field(unsigned int, usec) >>> + ), >>> + >>> + TP_fast_assign( >>> + __entry->id = cq->res.id; >>> + __entry->comps = comps; >>> + __entry->usec = usec; >>> + ), >>> + >>> + TP_printk("id %u comps=%u usec=%u", >>> + __entry->id, __entry->comps, __entry->usec >>> + ) >>> +); >>> + >>> +TRACE_EVENT(cq_alloc, >>> + TP_PROTO( >>> + const struct ib_cq *cq, >>> + int comp_vector, >>> + enum ib_poll_context poll_ctx >>> + ), >>> + >>> + TP_ARGS(cq, comp_vector, poll_ctx), >>> + >>> + TP_STRUCT__entry( >>> + __field(u32, id) >>> + __field(int, comp_vector) >>> + __field(unsigned long, poll_ctx) >>> + ), >>> + >>> + TP_fast_assign( >>> + __entry->id = cq->res.id; >>> + __entry->comp_vector = comp_vector; >>> + __entry->poll_ctx = poll_ctx; >>> + ), >>> + >>> + TP_printk("id %u comp_vector=%d poll_ctx=%s", >>> + __entry->id, __entry->comp_vector, >>> + rdma_show_ib_poll_ctx(__entry->poll_ctx) >>> + ) >>> +); >>> + >>> +TRACE_EVENT(cq_free, >>> + TP_PROTO( >>> + const struct ib_cq *cq >>> + ), >>> + >>> + TP_ARGS(cq), >>> + >>> + TP_STRUCT__entry( >>> + __field(u32, id) >>> + ), >>> + >>> + TP_fast_assign( >>> + __entry->id = cq->res.id; >>> + ), >>> + >>> + 
TP_printk("id %u", __entry->id) >>> +); >>> + >>> +#endif /* _TRACE_RDMA_CORE_H */ >>> + >>> +#include <trace/define_trace.h> > > -- > Chuck Lever -- Chuck Lever