> On Oct 20, 2019, at 8:07 PM, Parav Pandit <parav@xxxxxxxxxxxx> wrote: > > > >> -----Original Message----- >> From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma- >> owner@xxxxxxxxxxxxxxx> On Behalf Of Chuck Lever >> Sent: Saturday, October 12, 2019 2:43 PM >> To: linux-rdma@xxxxxxxxxxxxxxx >> Subject: [PATCH v4] IB/core: Trace points for diagnosing completion queue >> issues >> >> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> >> --- >> drivers/infiniband/core/Makefile | 2 >> drivers/infiniband/core/cq.c | 27 ++++ >> drivers/infiniband/core/trace.c | 14 ++ >> include/rdma/ib_verbs.h | 5 + >> include/trace/events/rdma_core.h | 251 >> ++++++++++++++++++++++++++++++++++++++ >> 5 files changed, 295 insertions(+), 4 deletions(-) create mode 100644 >> drivers/infiniband/core/trace.c create mode 100644 >> include/trace/events/rdma_core.h >> >> Changes since v3: >> - Reverted unnecessary behavior change in __ib_process_cq >> - Clarified what "id" is in trace point output >> - Added comment before new fields in struct ib_cq >> - New trace point that fires when there is a CQ allocation failure >> >> Changes since v2: >> - Removed extraneous changes to include/trace/events/rdma.h >> >> Changes since RFC: >> - Display CQ's global resource ID instead of its pointer address >> >> diff --git a/drivers/infiniband/core/Makefile >> b/drivers/infiniband/core/Makefile >> index 09881bd..68d9e27 100644 >> --- a/drivers/infiniband/core/Makefile >> +++ b/drivers/infiniband/core/Makefile >> @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o >> verbs.o cq.o rw.o sysfs.o \ >> device.o fmr_pool.o cache.o netlink.o \ >> roce_gid_mgmt.o mr_pool.o addr.o >> sa_query.o \ >> multicast.o mad.o smi.o agent.o mad_rmpp.o >> \ >> - nldev.o restrack.o counters.o >> + nldev.o restrack.o counters.o trace.o >> >> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o >> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git >> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index >> 
bbfded6..e035895 100644 >> --- a/drivers/infiniband/core/cq.c >> +++ b/drivers/infiniband/core/cq.c >> @@ -7,6 +7,8 @@ >> #include <linux/slab.h> >> #include <rdma/ib_verbs.h> >> >> +#include <trace/events/rdma_core.h> >> + >> /* # of WCs to poll for with a single call to ib_poll_cq */ >> #define IB_POLL_BATCH 16 >> #define IB_POLL_BATCH_DIRECT 8 >> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct >> *w) >> >> dim->state = DIM_START_MEASURE; >> >> + trace_cq_modify(cq, comps, usec); >> cq->device->ops.modify_cq(cq, comps, usec); } >> >> @@ -65,18 +68,29 @@ static void rdma_dim_init(struct ib_cq *cq) >> INIT_WORK(&dim->work, ib_cq_rdma_dim_work); } >> >> +static int __ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc >> +*wc) { >> + int rc; >> + >> + rc = ib_poll_cq(cq, num_entries, wc); >> + trace_cq_poll(cq, num_entries, rc); >> + return rc; >> +} >> + >> static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, >> int batch) >> { >> int i, n, completed = 0; >> >> + trace_cq_process(cq); >> + >> /* >> * budget might be (-1) if the caller does not >> * want to bound this call, thus we need unsigned >> * minimum here. 
>> */ >> - while ((n = ib_poll_cq(cq, min_t(u32, batch, >> - budget - completed), wcs)) > 0) { >> + while ((n = __ib_poll_cq(cq, min_t(u32, batch, >> + budget - completed), wcs)) > 0) { >> for (i = 0; i < n; i++) { >> struct ib_wc *wc = &wcs[i]; >> >> @@ -131,8 +145,10 @@ static int ib_poll_handler(struct irq_poll *iop, int >> budget) >> completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); >> if (completed < budget) { >> irq_poll_complete(&cq->iop); >> - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) >> + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { >> + trace_cq_reschedule(cq); >> irq_poll_sched(&cq->iop); >> + } >> } >> >> if (dim) >> @@ -143,6 +159,7 @@ static int ib_poll_handler(struct irq_poll *iop, int >> budget) >> >> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) { >> + trace_cq_schedule(cq); >> irq_poll_sched(&cq->iop); >> } >> >> @@ -162,6 +179,7 @@ static void ib_cq_poll_work(struct work_struct >> *work) >> >> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { >> + trace_cq_schedule(cq); >> queue_work(cq->comp_wq, &cq->work); >> } >> >> @@ -239,6 +257,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device >> *dev, void *private, >> goto out_destroy_cq; >> } >> >> + trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); >> return cq; >> >> out_destroy_cq: >> @@ -248,6 +267,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device >> *dev, void *private, >> kfree(cq->wc); >> out_free_cq: >> kfree(cq); >> + trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); >> return ERR_PTR(ret); >> } >> EXPORT_SYMBOL(__ib_alloc_cq_user); >> @@ -304,6 +324,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct >> ib_udata *udata) >> WARN_ON_ONCE(1); >> } >> >> + trace_cq_free(cq); >> rdma_restrack_del(&cq->res); >> cq->device->ops.destroy_cq(cq, udata); >> if (cq->dim) >> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c >> new file mode 100644 index 0000000..6c3514b >> --- /dev/null >> +++ 
b/drivers/infiniband/core/trace.c >> @@ -0,0 +1,14 @@ >> +// SPDX-License-Identifier: GPL-2.0-only >> +/* >> + * Trace points for core RDMA functions. >> + * >> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx> >> + * >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. >> + */ >> + >> +#define CREATE_TRACE_POINTS >> + >> +#include <rdma/ib_verbs.h> >> + >> +#include <trace/events/rdma_core.h> >> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index >> 6a47ba8..43468a3 100644 >> --- a/include/rdma/ib_verbs.h >> +++ b/include/rdma/ib_verbs.h >> @@ -1555,6 +1555,11 @@ struct ib_cq { >> }; >> struct workqueue_struct *comp_wq; >> struct dim *dim; >> + >> + /* updated only by trace points */ >> + ktime_t timestamp; >> + bool interrupt; >> + >> /* >> * Implementation details of the RDMA core, don't use in drivers: >> */ >> diff --git a/include/trace/events/rdma_core.h >> b/include/trace/events/rdma_core.h >> new file mode 100644 >> index 0000000..0d56065 >> --- /dev/null >> +++ b/include/trace/events/rdma_core.h >> @@ -0,0 +1,251 @@ >> +/* SPDX-License-Identifier: GPL-2.0-only */ >> +/* >> + * Trace point definitions for core RDMA functions. >> + * >> + * Author: Chuck Lever <chuck.lever@xxxxxxxxxx> >> + * >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
>> + */ >> + >> +#undef TRACE_SYSTEM >> +#define TRACE_SYSTEM rdma_core >> + >> +#if !defined(_TRACE_RDMA_CORE_H) || >> defined(TRACE_HEADER_MULTI_READ) >> +#define _TRACE_RDMA_CORE_H >> + >> +#include <linux/tracepoint.h> >> +#include <rdma/ib_verbs.h> >> +#include <rdma/restrack.h> >> + >> +/* >> + * enum ib_poll_context, from include/rdma/ib_verbs.h */ >> +#define IB_POLL_CTX_LIST \ >> + ib_poll_ctx(DIRECT) \ >> + ib_poll_ctx(SOFTIRQ) \ >> + ib_poll_ctx(WORKQUEUE) \ >> + ib_poll_ctx_end(UNBOUND_WORKQUEUE) >> + >> +#undef ib_poll_ctx >> +#undef ib_poll_ctx_end >> + >> +#define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x); >> +#define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x); >> + >> +IB_POLL_CTX_LIST >> + >> +#undef ib_poll_ctx >> +#undef ib_poll_ctx_end >> + >> +#define ib_poll_ctx(x) { IB_POLL_##x, #x }, >> +#define ib_poll_ctx_end(x) { IB_POLL_##x, #x } >> + >> +#define rdma_show_ib_poll_ctx(x) \ >> + __print_symbolic(x, IB_POLL_CTX_LIST) >> + >> +/** >> + ** Completion Queue events >> + **/ >> + >> +TRACE_EVENT(cq_schedule, >> + TP_PROTO( >> + struct ib_cq *cq >> + ), >> + >> + TP_ARGS(cq), >> + >> + TP_STRUCT__entry( >> + __field(u32, id) >> + ), >> + >> + TP_fast_assign( >> + cq->timestamp = ktime_get(); >> + cq->interrupt = true; >> + >> + __entry->id = cq->res.id; >> + ), >> + >> + TP_printk("cq.id=%u", __entry->id) >> +); >> + >> +TRACE_EVENT(cq_reschedule, >> + TP_PROTO( >> + struct ib_cq *cq >> + ), >> + >> + TP_ARGS(cq), >> + >> + TP_STRUCT__entry( >> + __field(u32, id) >> + ), >> + >> + TP_fast_assign( >> + cq->timestamp = ktime_get(); >> + cq->interrupt = false; >> + >> + __entry->id = cq->res.id; >> + ), >> + >> + TP_printk("cq.id=%u", __entry->id) >> +); >> + >> +TRACE_EVENT(cq_process, >> + TP_PROTO( >> + const struct ib_cq *cq >> + ), >> + >> + TP_ARGS(cq), >> + >> + TP_STRUCT__entry( >> + __field(s64, latency) >> + __field(u32, id) >> + __field(bool, interrupt) >> + ), >> + >> + TP_fast_assign( >> + ktime_t latency = 
ktime_sub(ktime_get(), cq->timestamp); >> + >> + __entry->id = cq->res.id; >> + __entry->latency = ktime_to_us(latency); >> + __entry->interrupt = cq->interrupt; >> + ), >> + >> + TP_printk("cq.id=%u wake-up took %lld [us] from %s", >> + __entry->id, __entry->latency, >> + __entry->interrupt ? "interrupt" : "reschedule" >> + ) >> +); >> + >> +TRACE_EVENT(cq_poll, >> + TP_PROTO( >> + const struct ib_cq *cq, >> + int requested, >> + int rc >> + ), >> + >> + TP_ARGS(cq, requested, rc), >> + >> + TP_STRUCT__entry( >> + __field(u32, id) >> + __field(int, requested) >> + __field(int, rc) >> + ), >> + >> + TP_fast_assign( >> + __entry->id = cq->res.id; >> + __entry->requested = requested; >> + __entry->rc = rc; >> + ), >> + >> + TP_printk("cq.id=%u requested %d, returned %d", >> + __entry->id, __entry->requested, __entry->rc >> + ) >> +); >> + >> +TRACE_EVENT(cq_modify, >> + TP_PROTO( >> + const struct ib_cq *cq, >> + u16 comps, >> + u16 usec >> + ), >> + >> + TP_ARGS(cq, comps, usec), >> + >> + TP_STRUCT__entry( >> + __field(u32, id) >> + __field(unsigned int, comps) >> + __field(unsigned int, usec) >> + ), >> + >> + TP_fast_assign( >> + __entry->id = cq->res.id; >> + __entry->comps = comps; >> + __entry->usec = usec; >> + ), >> + >> + TP_printk("cq.id=%u comps=%u usec=%u", >> + __entry->id, __entry->comps, __entry->usec >> + ) >> +); >> + >> +TRACE_EVENT(cq_alloc, >> + TP_PROTO( >> + const struct ib_cq *cq, >> + int nr_cqe, >> + int comp_vector, >> + enum ib_poll_context poll_ctx >> + ), >> + >> + TP_ARGS(cq, nr_cqe, comp_vector, poll_ctx), >> + >> + TP_STRUCT__entry( >> + __field(u32, id) >> + __field(int, nr_cqe) >> + __field(int, comp_vector) >> + __field(unsigned long, poll_ctx) >> + ), >> + >> + TP_fast_assign( >> + __entry->id = cq->res.id; >> + __entry->nr_cqe = nr_cqe; >> + __entry->comp_vector = comp_vector; >> + __entry->poll_ctx = poll_ctx; >> + ), >> + >> + TP_printk("cq.id=%u nr_cqe=%d comp_vector=%d poll_ctx=%s", >> + __entry->id, __entry->nr_cqe, 
__entry->comp_vector, >> + rdma_show_ib_poll_ctx(__entry->poll_ctx) >> + ) >> +); >> + >> +TRACE_EVENT(cq_alloc_error, >> + TP_PROTO( >> + int nr_cqe, >> + int comp_vector, >> + enum ib_poll_context poll_ctx, >> + int rc >> + ), >> + >> + TP_ARGS(nr_cqe, comp_vector, poll_ctx, rc), >> + >> + TP_STRUCT__entry( >> + __field(int, rc) >> + __field(int, nr_cqe) >> + __field(int, comp_vector) >> + __field(unsigned long, poll_ctx) >> + ), >> + >> + TP_fast_assign( >> + __entry->rc = rc; >> + __entry->nr_cqe = nr_cqe; >> + __entry->comp_vector = comp_vector; >> + __entry->poll_ctx = poll_ctx; >> + ), >> + >> + TP_printk("nr_cqe=%d comp_vector=%d poll_ctx=%s rc=%d", >> + __entry->nr_cqe, __entry->comp_vector, >> + rdma_show_ib_poll_ctx(__entry->poll_ctx), __entry->rc >> + ) >> +); >> + >> +TRACE_EVENT(cq_free, >> + TP_PROTO( >> + const struct ib_cq *cq >> + ), >> + >> + TP_ARGS(cq), >> + >> + TP_STRUCT__entry( >> + __field(u32, id) >> + ), >> + >> + TP_fast_assign( >> + __entry->id = cq->res.id; >> + ), >> + >> + TP_printk("cq.id=%u", __entry->id) >> +); >> + >> +#endif /* _TRACE_RDMA_CORE_H */ >> + >> +#include <trace/define_trace.h> > Reviewed-by: Parav Pandit <parav@xxxxxxxxxxxx> Thank you, Parav! -- Chuck Lever