Hi Sebastian, > -----Original Message----- > From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma-owner@xxxxxxxxxxxxxxx> > On Behalf Of Dennis Dalessandro > Sent: Monday, September 10, 2018 10:50 AM > To: jgg@xxxxxxxx; dledford@xxxxxxxxxx > Cc: linux-rdma@xxxxxxxxxxxxxxx; Mike Marciniszyn > <mike.marciniszyn@xxxxxxxxx>; Sebastian Sanchez > <sebastian.sanchez@xxxxxxxxx>; Kamenee Arumugam > <kamenee.arumugam@xxxxxxxxx> > Subject: [PATCH for-next 2/3] IB/{hfi1, rdmavt, qib}: Fit completions into single > aligned cache-line > > From: Sebastian Sanchez <sebastian.sanchez@xxxxxxxxx> > > The struct ib_wc uses two cache-lines per completion, and it is unaligned. This I agree that it is unaligned. Bodong and I fixed it in commit [1] to continue to keep ib_wc < 64 bytes and padded for 64B by compiler. (described more in commit log of [1]). On which architecture do you see it using two cache-lines? [1] Commit cd2a6e7d384b ("IB/core: Fix ib_wc structure size to remain in 64 bytes boundary") > structure used to fit within one cacheline, but it was expanded by fields added in > the following patches: > > dd5f03beb4f7 ("IB/core: Ethernet L2 attributes in verbs/cm structures") > c865f24628b9 ("IB/core: Add rdma_network_type to wc") > > These new fields are only needed for ethernet and for HCAs that don't provide > the network type to search the proper GID in the GID table. > Since there are two cache-lines, more cache-lines are dirtied per work > completion entry. > > Create a kernel only rvt_wc structure that is a single aligned cache-line. This > reduces the cache lines used per completion and eliminates any cache line push- > pull by aligning the size to a cache-line. > > Cache-aligning the new kernel completion queue expands struct rvt_cq_wc > breaking the ABI for the user completion queue. Therefore, decouple the kernel > completion queue from struct rvt_cq_wc to prevent this. > > Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> > Reviewed-by: Sebastian Sanchez <sebastian.sanchez@xxxxxxxxx> > Signed-off-by: Sebastian Sanchez <sebastian.sanchez@xxxxxxxxx> > Signed-off-by: Kamenee Arumugam <kamenee.arumugam@xxxxxxxxx> > Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> > --- > drivers/infiniband/hw/hfi1/rc.c | 2 + > drivers/infiniband/hw/hfi1/ruc.c | 2 + > drivers/infiniband/hw/hfi1/uc.c | 2 + > drivers/infiniband/hw/hfi1/ud.c | 4 +- > drivers/infiniband/hw/qib/qib_rc.c | 2 + > drivers/infiniband/hw/qib/qib_ruc.c | 2 + > drivers/infiniband/hw/qib/qib_uc.c | 2 + > drivers/infiniband/hw/qib/qib_ud.c | 4 +- > drivers/infiniband/sw/rdmavt/cq.c | 57 ++++++++++++++++++++----------- > drivers/infiniband/sw/rdmavt/qp.c | 6 ++- > drivers/infiniband/sw/rdmavt/trace_cq.h | 6 ++- > include/rdma/rdmavt_cq.h | 29 +++++++++++++++- > include/rdma/rdmavt_qp.h | 2 + > 13 files changed, 81 insertions(+), 39 deletions(-) > > diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c > index 9bd63ab..975d91a 100644 > --- a/drivers/infiniband/hw/hfi1/rc.c > +++ b/drivers/infiniband/hw/hfi1/rc.c > @@ -2041,7 +2041,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) > u32 hdrsize = packet->hlen; > u32 psn = ib_bth_get_psn(packet->ohdr); > u32 pad = packet->pad; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 pmtu = qp->pmtu; > int diff; > struct ib_reth *reth; > diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c > index 5f56f3c..019713a 100644 > --- a/drivers/infiniband/hw/hfi1/ruc.c > +++ b/drivers/infiniband/hw/hfi1/ruc.c > @@ -173,7 +173,7 @@ static void ruc_loopback(struct rvt_qp *sqp) > struct rvt_swqe *wqe; > struct rvt_sge *sge; > unsigned long flags; > - struct ib_wc wc; > + struct rvt_wc wc; > u64 sdata; > atomic64_t *maddr; > enum ib_wc_status send_status; > diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c > index e254dce..5b5e057 100644 > --- a/drivers/infiniband/hw/hfi1/uc.c > +++ b/drivers/infiniband/hw/hfi1/uc.c > @@ -312,7 +312,7 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) > u32 hdrsize = packet->hlen; > u32 psn; > u32 pad = packet->pad; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 pmtu = qp->pmtu; > struct ib_reth *reth; > int ret; > diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c > index 70d39fc..8a4785e 100644 > --- a/drivers/infiniband/hw/hfi1/ud.c > +++ b/drivers/infiniband/hw/hfi1/ud.c > @@ -79,7 +79,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct > rvt_swqe *swqe) > unsigned long flags; > struct rvt_sge_state ssge; > struct rvt_sge *sge; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 length; > enum ib_qp_type sqptype, dqptype; > > @@ -867,7 +867,7 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 > pkey, u8 sc5, void hfi1_ud_rcv(struct hfi1_packet *packet) { > u32 hdrsize = packet->hlen; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 src_qp; > u16 pkey; > int mgmt_pkey_idx = -1; > diff --git a/drivers/infiniband/hw/qib/qib_rc.c > b/drivers/infiniband/hw/qib/qib_rc.c > index f35fdeb..a121d2c 100644 > --- a/drivers/infiniband/hw/qib/qib_rc.c > +++ b/drivers/infiniband/hw/qib/qib_rc.c > @@ -1744,7 +1744,7 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct > ib_header *hdr, > u32 hdrsize; > u32 psn; > u32 pad; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 pmtu = qp->pmtu; > int diff; > struct ib_reth *reth; > diff --git a/drivers/infiniband/hw/qib/qib_ruc.c > b/drivers/infiniband/hw/qib/qib_ruc.c > index f8a7de7..1230c37 100644 > --- a/drivers/infiniband/hw/qib/qib_ruc.c > +++ b/drivers/infiniband/hw/qib/qib_ruc.c > @@ -191,7 +191,7 @@ static void qib_ruc_loopback(struct rvt_qp *sqp) > struct rvt_swqe *wqe; > struct rvt_sge *sge; > unsigned long flags; > - struct ib_wc wc; > + struct rvt_wc wc; > u64 sdata; > atomic64_t *maddr; > enum ib_wc_status send_status; > diff --git a/drivers/infiniband/hw/qib/qib_uc.c > b/drivers/infiniband/hw/qib/qib_uc.c > index 3e54bc1..fa7dd3a 100644 > --- a/drivers/infiniband/hw/qib/qib_uc.c > +++ b/drivers/infiniband/hw/qib/qib_uc.c > @@ -242,7 +242,7 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header > *hdr, > u32 hdrsize; > u32 psn; > u32 pad; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 pmtu = qp->pmtu; > struct ib_reth *reth; > int ret; > diff --git a/drivers/infiniband/hw/qib/qib_ud.c > b/drivers/infiniband/hw/qib/qib_ud.c > index f8d029a..580ebc8 100644 > --- a/drivers/infiniband/hw/qib/qib_ud.c > +++ b/drivers/infiniband/hw/qib/qib_ud.c > @@ -58,7 +58,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct > rvt_swqe *swqe) > unsigned long flags; > struct rvt_sge_state ssge; > struct rvt_sge *sge; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 length; > enum ib_qp_type sqptype, dqptype; > > @@ -434,7 +434,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header > *hdr, > int opcode; > u32 hdrsize; > u32 pad; > - struct ib_wc wc; > + struct rvt_wc wc; > u32 qkey; > u32 src_qp; > u16 dlid; > diff --git a/drivers/infiniband/sw/rdmavt/cq.c > b/drivers/infiniband/sw/rdmavt/cq.c > index 6406a08..e729335 100644 > --- a/drivers/infiniband/sw/rdmavt/cq.c > +++ b/drivers/infiniband/sw/rdmavt/cq.c > @@ -62,10 +62,10 @@ > * > * This may be called with qp->s_lock held. > */ > -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) > +void rvt_cq_enter(struct rvt_cq *cq, struct rvt_wc *entry, bool > +solicited) > { > struct ib_uverbs_wc *uqueue = NULL; > - struct ib_wc *kqueue = NULL; > + struct rvt_wc *kqueue = NULL; > struct rvt_cq_wc *u_wc = NULL; > struct rvt_k_cq_wc *k_wc = NULL; > unsigned long flags; > @@ -230,6 +230,8 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, > * numbers of entries. > */ > if (udata && udata->outlen >= sizeof(__u64)) { > + int err; > + > sz = sizeof(struct ib_uverbs_wc) * (entries + 1); > sz += sizeof(*u_wc); > u_wc = vmalloc_user(sz); > @@ -237,23 +239,11 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, > ret = ERR_PTR(-ENOMEM); > goto bail_cq; > } > - } else { > - sz = sizeof(struct ib_wc) * (entries + 1); > - sz += sizeof(*k_wc); > - k_wc = vzalloc_node(sz, rdi->dparms.node); > - if (!k_wc) { > - ret = ERR_PTR(-ENOMEM); > - goto bail_cq; > - } > - } > - > - /* > - * Return the address of the WC as the offset to mmap. > - * See rvt_mmap() for details. > - */ > - if (udata && udata->outlen >= sizeof(__u64)) { > - int err; > > + /* > + * Return the address of the WC as the offset to mmap. > + * See rvt_mmap() for details. > + */ > cq->ip = rvt_create_mmap_info(rdi, sz, context, u_wc); > if (!cq->ip) { > ret = ERR_PTR(-ENOMEM); > @@ -266,6 +256,14 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, > ret = ERR_PTR(err); > goto bail_ip; > } > + } else { > + sz = sizeof(struct rvt_wc) * (entries + 1); > + sz += sizeof(*k_wc); > + k_wc = vzalloc_node(sz, rdi->dparms.node); > + if (!k_wc) { > + ret = ERR_PTR(-ENOMEM); > + goto bail_cq; > + } > } > > spin_lock_irq(&rdi->n_cqs_lock); > @@ -343,6 +341,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq) > kref_put(&cq->ip->ref, rvt_release_mmap_info); > else > vfree(cq->queue); > + vfree(cq->kqueue); > kfree(cq); > > return 0; > @@ -418,7 +417,7 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct > ib_udata *udata) > if (!u_wc) > return -ENOMEM; > } else { > - sz = sizeof(struct ib_wc) * (cqe + 1); > + sz = sizeof(struct rvt_wc) * (cqe + 1); > sz += sizeof(*k_wc); > k_wc = vzalloc_node(sz, rdi->dparms.node); > if (!k_wc) > @@ -520,6 +519,24 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct > ib_udata *udata) > return ret; > } > > +static void copy_rvt_wc_to_ib_wc(struct ib_wc *ibwc, struct rvt_wc > +*rvtwc) { > + ibwc->wr_id = rvtwc->wr_id; > + ibwc->status = rvtwc->status; > + ibwc->opcode = rvtwc->opcode; > + ibwc->vendor_err = rvtwc->vendor_err; > + ibwc->byte_len = rvtwc->byte_len; > + ibwc->qp = rvtwc->qp; > + ibwc->ex.invalidate_rkey = rvtwc->ex.invalidate_rkey; > + ibwc->src_qp = rvtwc->src_qp; > + ibwc->wc_flags = rvtwc->wc_flags; > + ibwc->slid = rvtwc->slid; > + ibwc->pkey_index = rvtwc->pkey_index; > + ibwc->sl = rvtwc->sl; > + ibwc->dlid_path_bits = rvtwc->dlid_path_bits; > + ibwc->port_num = rvtwc->port_num; > +} > + > /** > * rvt_poll_cq - poll for work completion entries > * @ibcq: the completion queue to poll > @@ -554,7 +571,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, > struct ib_wc *entry) > break; > /* The kernel doesn't need a RMB since it has the lock. */ > trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled); > - *entry = wc->kqueue[tail]; > + copy_rvt_wc_to_ib_wc(entry, &wc->kqueue[tail]); > if (tail >= cq->ibcq.cqe) > tail = 0; > else > diff --git a/drivers/infiniband/sw/rdmavt/qp.c > b/drivers/infiniband/sw/rdmavt/qp.c > index 5ce403c..e2cc827 100644 > --- a/drivers/infiniband/sw/rdmavt/qp.c > +++ b/drivers/infiniband/sw/rdmavt/qp.c > @@ -1049,7 +1049,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, > */ > int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) { > - struct ib_wc wc; > + struct rvt_wc wc; > int ret = 0; > struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); > > @@ -1573,7 +1573,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct > ib_recv_wr *wr, > return -ENOMEM; > } > if (unlikely(qp_err_flush)) { > - struct ib_wc wc; > + struct rvt_wc wc; > > memset(&wc, 0, sizeof(wc)); > wc.qp = &qp->ibqp; > @@ -1996,7 +1996,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct > ib_recv_wr *wr, static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) { > int i, j, ret; > - struct ib_wc wc; > + struct rvt_wc wc; > struct rvt_lkey_table *rkt; > struct rvt_pd *pd; > struct rvt_sge_state *ss; > diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h > b/drivers/infiniband/sw/rdmavt/trace_cq.h > index df8e1ad..832308d 100644 > --- a/drivers/infiniband/sw/rdmavt/trace_cq.h > +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h > @@ -109,7 +109,7 @@ > > DECLARE_EVENT_CLASS( > rvt_cq_entry_template, > - TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), > + TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx), > TP_ARGS(cq, wc, idx), > TP_STRUCT__entry( > RDI_DEV_ENTRY(cq->rdi) > @@ -143,12 +143,12 @@ > > DEFINE_EVENT( > rvt_cq_entry_template, rvt_cq_enter, > - TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), > + TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx), > TP_ARGS(cq, wc, idx)); > > DEFINE_EVENT( > rvt_cq_entry_template, rvt_cq_poll, > - TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), > + TP_PROTO(struct rvt_cq *cq, struct rvt_wc *wc, u32 idx), > TP_ARGS(cq, wc, idx)); > > #endif /* __RVT_TRACE_CQ_H */ > diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index > 9f25be0..a5d3e3c 100644 > --- a/include/rdma/rdmavt_cq.h > +++ b/include/rdma/rdmavt_cq.h > @@ -62,6 +62,30 @@ > #include <rdma/rvt-abi.h> > > /* > + * If any fields within struct rvt_wc change, the function > + * copy_rvt_wc_to_ib_wc() should be updated. > + */ > +struct rvt_wc { > + u64 wr_id; > + enum ib_wc_status status; > + enum ib_wc_opcode opcode; > + u32 vendor_err; > + u32 byte_len; > + struct ib_qp *qp; > + union { > + __be32 imm_data; > + u32 invalidate_rkey; > + } ex; > + u32 src_qp; > + int wc_flags; > + u32 slid; > + u16 pkey_index; > + u8 sl; > + u8 dlid_path_bits; > + u8 port_num; /* valid only for DR SMPs > onswitches*/ > +} ____cacheline_aligned_in_smp; > + > +/* > * This structure is used to contain the head pointer, tail pointer, > * and completion queue entries as a single memory allocation so > * it can be mmap'ed into user space. > @@ -69,7 +93,8 @@ > struct rvt_k_cq_wc { > u32 head; /* index of next entry to fill */ > u32 tail; /* index of next ib_poll_cq() entry */ > - struct ib_wc kqueue[0]; > + /* this is actually size ibcq.cqe + 1 */ > + struct rvt_wc kqueue[0]; > }; > > /* > @@ -93,6 +118,6 @@ struct rvt_cq { > return container_of(ibcq, struct rvt_cq, ibcq); } > > -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); > +void rvt_cq_enter(struct rvt_cq *cq, struct rvt_wc *entry, bool > +solicited); > > #endif /* DEF_RDMAVT_INCCQH */ > diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index > 927f6d5..125db26 100644 > --- a/include/rdma/rdmavt_qp.h > +++ b/include/rdma/rdmavt_qp.h > @@ -589,7 +589,7 @@ static inline void rvt_qp_swqe_complete( > if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || > (wqe->wr.send_flags & IB_SEND_SIGNALED) || > status != IB_WC_SUCCESS) { > - struct ib_wc wc; > + struct rvt_wc wc; > > memset(&wc, 0, sizeof(wc)); > wc.wr_id = wqe->wr.wr_id;