From: Sebastian Sanchez <sebastian.sanchez@xxxxxxxxx> The struct ib_wc uses two cache lines per completion, and it is unaligned. This structure used to fit within one cache line, but it was expanded by fields added in the following patches: dd5f03beb4f7 ("IB/core: Ethernet L2 attributes in verbs/cm structures") c865f24628b9 ("IB/core: Add rdma_network_type to wc") These new fields are only needed for Ethernet and for HCAs that don't provide the network type to search for the proper GID in the GID table. Since there are two cache lines, more cache lines are dirtied per work completion entry. Create a kernel-only rvt_wc structure that is a single aligned cache line. This reduces the cache lines used per completion and eliminates any cache line push-pull by aligning the size to a cache line. Cache-aligning the new kernel completion queue would expand struct rvt_cq_wc, breaking the ABI for the user completion queue. Therefore, decouple the kernel completion queue from struct rvt_cq_wc to prevent this. 
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@xxxxxxxxx> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> --- drivers/infiniband/hw/hfi1/rc.c | 2 drivers/infiniband/hw/hfi1/ruc.c | 4 - drivers/infiniband/hw/hfi1/uc.c | 2 drivers/infiniband/hw/hfi1/ud.c | 4 - drivers/infiniband/hw/qib/qib_rc.c | 2 drivers/infiniband/hw/qib/qib_ruc.c | 4 - drivers/infiniband/hw/qib/qib_uc.c | 2 drivers/infiniband/hw/qib/qib_ud.c | 4 - drivers/infiniband/sw/rdmavt/cq.c | 146 ++++++++++++++++++++----------- drivers/infiniband/sw/rdmavt/qp.c | 4 - drivers/infiniband/sw/rdmavt/trace_cq.h | 6 + include/rdma/rdmavt_cq.h | 35 ++++++- include/rdma/rdmavt_qp.h | 2 13 files changed, 142 insertions(+), 75 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index da58046..eecd1bf 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2041,7 +2041,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) u32 hdrsize = packet->hlen; u32 psn = ib_bth_get_psn(packet->ohdr); u32 pad = packet->pad; - struct ib_wc wc; + struct rvt_wc wc; u32 pmtu = qp->pmtu; int diff; struct ib_reth *reth; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 3daa94b..bab720c 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -60,7 +60,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) { int i, j, ret; - struct ib_wc wc; + struct rvt_wc wc; struct rvt_lkey_table *rkt; struct rvt_pd *pd; struct rvt_sge_state *ss; @@ -323,7 +323,7 @@ static void ruc_loopback(struct rvt_qp *sqp) struct rvt_swqe *wqe; struct rvt_sge *sge; unsigned long flags; - struct ib_wc wc; + struct rvt_wc wc; u64 sdata; atomic64_t *maddr; enum ib_wc_status send_status; diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 9d7a311..b0c5eee 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ 
b/drivers/infiniband/hw/hfi1/uc.c @@ -312,7 +312,7 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) u32 hdrsize = packet->hlen; u32 psn; u32 pad = packet->pad; - struct ib_wc wc; + struct rvt_wc wc; u32 pmtu = qp->pmtu; struct ib_reth *reth; int ret; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 49a38a6..8e8bccd 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -79,7 +79,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) unsigned long flags; struct rvt_sge_state ssge; struct rvt_sge *sge; - struct ib_wc wc; + struct rvt_wc wc; u32 length; enum ib_qp_type sqptype, dqptype; @@ -847,7 +847,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) { struct ib_other_headers *ohdr = packet->ohdr; u32 hdrsize = packet->hlen; - struct ib_wc wc; + struct rvt_wc wc; u32 qkey; u32 src_qp; u16 pkey; diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index c9955d4..24874c9 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1744,7 +1744,7 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, u32 hdrsize; u32 psn; u32 pad; - struct ib_wc wc; + struct rvt_wc wc; u32 pmtu = qp->pmtu; int diff; struct ib_reth *reth; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 4662cc7..4b021f9 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -44,7 +44,7 @@ static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) { int i, j, ret; - struct ib_wc wc; + struct rvt_wc wc; struct rvt_lkey_table *rkt; struct rvt_pd *pd; struct rvt_sge_state *ss; @@ -341,7 +341,7 @@ static void qib_ruc_loopback(struct rvt_qp *sqp) struct rvt_swqe *wqe; struct rvt_sge *sge; unsigned long flags; - struct ib_wc wc; + struct rvt_wc wc; u64 sdata; atomic64_t *maddr; enum ib_wc_status send_status; diff --git a/drivers/infiniband/hw/qib/qib_uc.c 
b/drivers/infiniband/hw/qib/qib_uc.c index 840eec6..993b1fc 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -242,7 +242,7 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, u32 hdrsize; u32 psn; u32 pad; - struct ib_wc wc; + struct rvt_wc wc; u32 pmtu = qp->pmtu; struct ib_reth *reth; int ret; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 3e4ff77..29f274f 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -58,7 +58,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) unsigned long flags; struct rvt_sge_state ssge; struct rvt_sge *sge; - struct ib_wc wc; + struct rvt_wc wc; u32 length; enum ib_qp_type sqptype, dqptype; @@ -434,7 +434,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int opcode; u32 hdrsize; u32 pad; - struct ib_wc wc; + struct rvt_wc wc; u32 qkey; u32 src_qp; u16 dlid; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index fb52b66..b554d7c 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -60,7 +60,7 @@ * * This may be called with qp->s_lock held. */ -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) +void rvt_cq_enter(struct rvt_cq *cq, struct rvt_wc *entry, bool solicited) { struct rvt_cq_wc *wc; unsigned long flags; @@ -95,7 +95,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) return; } trace_rvt_cq_enter(cq, entry, head); - if (cq->ip) { + if (!cq->kqueue) { wc->uqueue[head].wr_id = entry->wr_id; wc->uqueue[head].status = entry->status; wc->uqueue[head].opcode = entry->opcode; @@ -113,7 +113,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) /* Make sure entry is written before the head index. 
*/ smp_wmb(); } else { - wc->kqueue[head] = *entry; + cq->kqueue[head] = *entry; } wc->head = next; @@ -201,33 +201,27 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, if (!cq) return ERR_PTR(-ENOMEM); - /* - * Allocate the completion queue entries and head/tail pointers. - * This is allocated separately so that it can be resized and - * also mapped into user space. - * We need to use vmalloc() in order to support mmap and large - * numbers of entries. - */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (entries + 1); - else - sz += sizeof(struct ib_wc) * (entries + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) { - ret = ERR_PTR(-ENOMEM); - goto bail_cq; - } - - /* - * Return the address of the WC as the offset to mmap. - * See rvt_mmap() for details. - */ if (udata && udata->outlen >= sizeof(__u64)) { int err; + /* + * Allocate the user completion queue entries and head/tail + * pointers. This is allocated separately so that it can be + * resized and also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc) + sizeof(struct ib_uverbs_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See rvt_mmap() for details. + */ cq->ip = rvt_create_mmap_info(rdi, sz, context, wc); if (!cq->ip) { ret = ERR_PTR(-ENOMEM); @@ -240,6 +234,24 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, ret = ERR_PTR(err); goto bail_ip; } + } else { + /* + * Allocate head and tail pointers for kernel completion + * queue. 
+ */ + wc = vzalloc_node(sizeof(*wc), rdi->dparms.node); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* Allocate the kernel completion queue entries */ + sz = sizeof(struct rvt_wc) * (entries + 1); + cq->kqueue = vzalloc_node(sz, rdi->dparms.node); + if (!cq->kqueue) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } } spin_lock_irq(&rdi->n_cqs_lock); @@ -275,6 +287,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, goto done; bail_ip: + vfree(cq->kqueue); kfree(cq->ip); bail_wc: vfree(wc); @@ -305,6 +318,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq) kref_put(&cq->ip->ref, rvt_release_mmap_info); else vfree(cq->queue); + vfree(cq->kqueue); kfree(cq); return 0; @@ -352,11 +366,13 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *old_wc; - struct rvt_cq_wc *wc; + struct rvt_cq_wc *old_wc = NULL; + struct rvt_cq_wc *wc = NULL; + struct rvt_wc *old_kqueue = NULL; u32 head, tail, n; int ret; u32 sz; + struct rvt_wc *kqueue = NULL; struct rvt_dev_info *rdi = cq->rdi; if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) @@ -365,16 +381,17 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); - else - sz += sizeof(struct ib_wc) * (cqe + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; + if (!cq->kqueue) { + sz = sizeof(*wc) + sizeof(struct ib_uverbs_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) + return -ENOMEM; + } else { + sz = sizeof(struct rvt_wc) * (cqe + 1); + kqueue = vzalloc_node(sz, rdi->dparms.node); + if (!kqueue) + return -ENOMEM; + } /* Check that we can write the offset to mmap. 
*/ if (udata && udata->outlen >= sizeof(__u64)) { @@ -390,11 +407,10 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) * Make sure head and tail are sane since they * might be user writable. */ - old_wc = cq->queue; - head = old_wc->head; + head = cq->queue->head; if (head > (u32)cq->ibcq.cqe) head = (u32)cq->ibcq.cqe; - tail = old_wc->tail; + tail = cq->queue->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; if (head < tail) @@ -406,21 +422,31 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) goto bail_unlock; } for (n = 0; tail != head; n++) { - if (cq->ip) - wc->uqueue[n] = old_wc->uqueue[tail]; + if (!cq->kqueue) + wc->uqueue[n] = cq->queue->uqueue[tail]; else - wc->kqueue[n] = old_wc->kqueue[tail]; + kqueue[n] = cq->kqueue[tail]; if (tail == (u32)cq->ibcq.cqe) tail = 0; else tail++; } cq->ibcq.cqe = cqe; - wc->head = n; - wc->tail = 0; - cq->queue = wc; + + /* A new work completion is only allocated for the user mode case */ + if (wc) { + wc->head = n; + wc->tail = 0; + + old_wc = cq->queue; + cq->queue = wc; + } else { /* kernel work completions were allocated */ + old_kqueue = cq->kqueue; + cq->kqueue = kqueue; + } spin_unlock_irq(&cq->lock); + vfree(old_kqueue); vfree(old_wc); if (cq->ip) { @@ -454,6 +480,24 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) return ret; } +static void copy_rvt_wc_to_ib_wc(struct ib_wc *ibwc, struct rvt_wc *rvtwc) +{ + ibwc->wr_id = rvtwc->wr_id; + ibwc->status = rvtwc->status; + ibwc->opcode = rvtwc->opcode; + ibwc->vendor_err = rvtwc->vendor_err; + ibwc->byte_len = rvtwc->byte_len; + ibwc->qp = rvtwc->qp; + ibwc->ex.invalidate_rkey = rvtwc->ex.invalidate_rkey; + ibwc->src_qp = rvtwc->src_qp; + ibwc->wc_flags = rvtwc->wc_flags; + ibwc->slid = rvtwc->slid; + ibwc->pkey_index = rvtwc->pkey_index; + ibwc->sl = rvtwc->sl; + ibwc->dlid_path_bits = rvtwc->dlid_path_bits; + ibwc->port_num = rvtwc->port_num; +} + /** * rvt_poll_cq - poll for work 
completion entries * @ibcq: the completion queue to poll @@ -474,7 +518,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) u32 tail; /* The kernel can only poll a kernel completion queue */ - if (cq->ip) + if (!cq->kqueue) return -EINVAL; spin_lock_irqsave(&cq->lock, flags); @@ -487,8 +531,8 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) if (tail == wc->head) break; /* The kernel doesn't need a RMB since it has the lock. */ - trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled); - *entry = wc->kqueue[tail]; + trace_rvt_cq_poll(cq, &cq->kqueue[tail], npolled); + copy_rvt_wc_to_ib_wc(entry, &cq->kqueue[tail]); if (tail >= cq->ibcq.cqe) tail = 0; else diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index c82e6bb..8d816e4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1049,7 +1049,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, */ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) { - struct ib_wc wc; + struct rvt_wc wc; int ret = 0; struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); @@ -1571,7 +1571,7 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, return -ENOMEM; } if (unlikely(qp_err_flush)) { - struct ib_wc wc; + struct rvt_wc wc; memset(&wc, 0, sizeof(wc)); wc.qp = &qp->ibqp; diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h index a315850..7d89665 100644 --- a/drivers/infiniband/sw/rdmavt/trace_cq.h +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h @@ -76,7 +76,7 @@ DECLARE_EVENT_CLASS( rvt_cq_entry_template, - TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), + TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx), TP_ARGS(cq, wc, idx), TP_STRUCT__entry( RDI_DEV_ENTRY(cq->rdi) @@ -110,12 +110,12 @@ DEFINE_EVENT( rvt_cq_entry_template, rvt_cq_enter, - TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), + TP_PROTO(struct rvt_cq *cq, struct rvt_wc 
*wc, u32 idx), TP_ARGS(cq, wc, idx)); DEFINE_EVENT( rvt_cq_entry_template, rvt_cq_poll, - TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), + TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx), TP_ARGS(cq, wc, idx)); #endif /* __RVT_TRACE_CQ_H */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 51fd00b..8bd7885 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -61,6 +61,30 @@ #define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) /* + * If any fields within struct rvt_wc change, the function + * copy_rvt_wc_to_ib_wc() should be updated. + */ +struct rvt_wc { + u64 wr_id; + enum ib_wc_status status; + enum ib_wc_opcode opcode; + u32 vendor_err; + u32 byte_len; + struct ib_qp *qp; + union { + __be32 imm_data; + u32 invalidate_rkey; + } ex; + u32 src_qp; + int wc_flags; + u32 slid; + u16 pkey_index; + u8 sl; + u8 dlid_path_bits; + u8 port_num; /* valid only for DR SMPs on switches */ +} ____cacheline_aligned_in_smp; + +/* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so * it can be mmap'ed into user space. 
@@ -68,11 +92,8 @@ struct rvt_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - union { - /* these are actually size ibcq.cqe + 1 */ - struct ib_uverbs_wc uqueue[0]; - struct ib_wc kqueue[0]; - }; + /* this is actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; }; /* @@ -87,6 +108,8 @@ struct rvt_cq { struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; + /* this is actually size ibcq.cqe + 1 */ + struct rvt_wc *kqueue; }; static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) @@ -94,6 +117,6 @@ struct rvt_cq { return container_of(ibcq, struct rvt_cq, ibcq); } -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); +void rvt_cq_enter(struct rvt_cq *cq, struct rvt_wc *entry, bool solicited); #endif /* DEF_RDMAVT_INCCQH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 89ab88c..3bd49ec 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -585,7 +585,7 @@ static inline void rvt_qp_swqe_complete( if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED) || status != IB_WC_SUCCESS) { - struct ib_wc wc; + struct rvt_wc wc; memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr.wr_id; -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html