Based on help from Steve, the barriers here are changed to consistently
bracket WC memory writes with mmio_wc_start()/mmio_flush_writes(), as the
other drivers do. This allows some of the wc_wmb() calls that were not
related to WC memory to be downgraded to udma_to_device_barrier(). The
driver was probably correct (at least for x86-64), but it did not follow the
idiom established by the other drivers for working with WC memory.

Signed-off-by: Jason Gunthorpe <jgunthorpe@xxxxxxxxxxxxxxxxxxxx>
---
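
Note for reviewers (not part of the commit message): the idiom the patch
converges on is sketched below. copy_wqe_to_wc() and wc_dst are hypothetical
names used only for illustration; mmio_wc_start() and mmio_flush_writes()
are the helpers from util/udma_barrier.h that the patch actually uses.

        #include <util/udma_barrier.h> /* mmio_wc_start(), mmio_flush_writes() */

        /* Bracket every burst of writes to write-combining (WC) memory. */
        static void copy_wqe_to_wc(volatile uint64_t *wc_dst,
                                   const uint64_t *wqe, int n)
        {
                int i;

                /* Order prior DMA-visible writes before the WC copy begins. */
                mmio_wc_start();

                /* The WQE words land in WC-mapped memory. */
                for (i = 0; i < n; i++)
                        wc_dst[i] = wqe[i];

                /* Flush the combined buffer so the device sees the writes. */
                mmio_flush_writes();
        }

The same start/flush pair brackets the WC doorbell writes in
t4_ring_sq_db() and t4_ring_rq_db() in the diff below.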
 providers/cxgb4/qp.c    | 20 ++++++++++++++++++--
 providers/cxgb4/t4.h    | 48 +++++++++++++++++++++++++++++++++++-------------
 providers/cxgb4/verbs.c |  2 ++
 3 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/providers/cxgb4/qp.c b/providers/cxgb4/qp.c
index 700fe02c77c269..45eaca45029e60 100644
--- a/providers/cxgb4/qp.c
+++ b/providers/cxgb4/qp.c
@@ -52,7 +52,12 @@ static void copy_wr_to_sq(struct t4_wq *wq, union t4_wr *wqe, u8 len16)
 	dst = (u64 *)((u8 *)wq->sq.queue + wq->sq.wq_pidx * T4_EQ_ENTRY_SIZE);
 	if (t4_sq_onchip(wq)) {
 		len16 = align(len16, 4);
-		wc_wmb();
+
+		/* In onchip mode the copy below will be made to WC memory and
+		 * could trigger DMA. In offchip mode the copy below only
+		 * queues the WQE, DMA cannot start until t4_ring_sq_db
+		 * happens */
+		mmio_wc_start();
 	}
 	while (len16) {
 		*dst++ = *src++;
@@ -62,7 +67,13 @@ static void copy_wr_to_sq(struct t4_wq *wq, union t4_wr *wqe, u8 len16)
 		if (dst == (u64 *)&wq->sq.queue[wq->sq.size])
 			dst = (u64 *)wq->sq.queue;
 		len16--;
+
+		/* NOTE len16 cannot be large enough to write to the
+		   same sq.queue memory twice in this loop */
 	}
+
+	if (t4_sq_onchip(wq))
+		mmio_flush_writes();
 }
 
 static void copy_wr_to_rq(struct t4_wq *wq, union t4_recv_wr *wqe, u8 len16)
@@ -274,7 +285,9 @@ static void ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 idx)
 	int mask;
 	int __attribute__((unused)) ret;
 
-	wc_wmb();
+	/* FIXME: Why do we need this barrier if the kernel is going to
+	   trigger the DMA? */
+	udma_to_device_barrier();
 	if (qid == qhp->wq.sq.qid) {
 		attr.sq_psn = idx;
 		mask = IBV_QP_SQ_PSN;
@@ -385,8 +398,11 @@ int c4iw_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 			       len16, wqe);
 	} else
 		ring_kernel_db(qhp, qhp->wq.sq.qid, idx);
+	/* This write is only for debugging, the value does not matter for DMA
+	 */
 	qhp->wq.sq.queue[qhp->wq.sq.size].status.host_wq_pidx = \
 			(qhp->wq.sq.wq_pidx);
+
 	pthread_spin_unlock(&qhp->lock);
 	return err;
 }
diff --git a/providers/cxgb4/t4.h b/providers/cxgb4/t4.h
index a457e2f2921727..a845a367cfbb8c 100644
--- a/providers/cxgb4/t4.h
+++ b/providers/cxgb4/t4.h
@@ -317,9 +317,12 @@ enum {
 };
 
 struct t4_sq {
+	/* queue is either host memory or WC MMIO memory if
+	 * t4_sq_onchip(). */
 	union t4_wr *queue;
 	struct t4_swsqe *sw_sq;
 	struct t4_swsqe *oldest_read;
+	/* udb is either UC or WC MMIO memory depending on device version. */
 	volatile u32 *udb;
 	size_t memsize;
 	u32 qid;
@@ -367,12 +370,6 @@ struct t4_wq {
 	u8 *db_offp;
 };
 
-static inline void t4_ma_sync(struct t4_wq *wq, int page_size)
-{
-	wc_wmb();
-	*((volatile u32 *)wq->sq.ma_sync) = 1;
-}
-
 static inline int t4_rqes_posted(struct t4_wq *wq)
 {
 	return wq->rq.in_use;
 }
@@ -444,8 +441,11 @@ static inline void t4_sq_produce(struct t4_wq *wq, u8 len16)
 	wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
 	if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS)
 		wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS;
-	if (!wq->error)
+	if (!wq->error) {
+		/* This write is only for debugging, the value does not matter
+		 * for DMA */
 		wq->sq.queue[wq->sq.size].status.host_pidx = (wq->sq.pidx);
+	}
 }
 
 static inline void t4_sq_consume(struct t4_wq *wq)
@@ -457,10 +457,14 @@ static inline void t4_sq_consume(struct t4_wq *wq)
 	if (++wq->sq.cidx == wq->sq.size)
 		wq->sq.cidx = 0;
 	assert((wq->sq.cidx != wq->sq.pidx) || wq->sq.in_use == 0);
-	if (!wq->error)
+	if (!wq->error) {
+		/* This write is only for debugging, the value does not matter
+		 * for DMA */
 		wq->sq.queue[wq->sq.size].status.host_cidx = wq->sq.cidx;
+	}
 }
 
+/* Copies to WC MMIO memory */
 static void copy_wqe_to_udb(volatile u32 *udb_offset, void *wqe)
 {
 	u64 *src, *dst;
@@ -482,8 +486,8 @@ extern int t5_en_wc;
 static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t4, u8 len16,
 				 union t4_wr *wqe)
 {
-	wc_wmb();
 	if (!t4) {
+		mmio_wc_start();
 		if (t5_en_wc && inc == 1 && wq->sq.wc_reg_available) {
 			PDBG("%s: WC wq->sq.pidx = %d; len16=%d\n",
 			     __func__, wq->sq.pidx, len16);
@@ -494,30 +498,45 @@ static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t4, u8 len16,
 			writel(QID_V(wq->sq.bar2_qid) | PIDX_T5_V(inc),
 			       wq->sq.udb);
 		}
-		wc_wmb();
+		/* udb is WC for > t4 devices */
+		mmio_flush_writes();
 		return;
 	}
+
+	udma_to_device_barrier();
 	if (ma_wr) {
 		if (t4_sq_onchip(wq)) {
 			int i;
+
+			mmio_wc_start();
 			for (i = 0; i < 16; i++)
 				*(volatile u32 *)&wq->sq.queue[wq->sq.size].flits[2+i] = i;
+			mmio_flush_writes();
 		}
 	} else {
 		if (t4_sq_onchip(wq)) {
 			int i;
+
+			mmio_wc_start();
 			for (i = 0; i < 16; i++)
+				/* FIXME: What is this supposed to be doing?
+				 * Writing to the same address multiple times
+				 * with WC memory is not guaranteed to
+				 * generate any more than one TLP. Why isn't
+				 * writing to WC memory marked volatile? */
 				*(u32 *)&wq->sq.queue[wq->sq.size].flits[2] = i;
+			mmio_flush_writes();
 		}
 	}
+	/* udb is UC for t4 devices */
 	writel(QID_V(wq->sq.qid & wq->qid_mask) | PIDX_V(inc), wq->sq.udb);
 }
 
 static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t4, u8 len16,
 				 union t4_recv_wr *wqe)
 {
-	wc_wmb();
 	if (!t4) {
+		mmio_wc_start();
 		if (t5_en_wc && inc == 1 && wq->sq.wc_reg_available) {
 			PDBG("%s: WC wq->rq.pidx = %d; len16=%d\n",
 			     __func__, wq->rq.pidx, len16);
@@ -528,9 +547,12 @@ static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t4, u8 len16,
 			writel(QID_V(wq->rq.bar2_qid) | PIDX_T5_V(inc),
 			       wq->rq.udb);
 		}
-		wc_wmb();
+		/* udb is WC for > t4 devices */
+		mmio_flush_writes();
 		return;
 	}
+	/* udb is UC for t4 devices */
+	udma_to_device_barrier();
 	writel(QID_V(wq->rq.qid & wq->qid_mask) | PIDX_V(inc), wq->rq.udb);
 }
 
@@ -655,7 +677,7 @@ static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
 		cq->error = 1;
 		assert(0);
 	} else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) {
-		rmb();
+		udma_from_device_barrier();
 		*cqe = &cq->queue[cq->cidx];
 		ret = 0;
 	} else
diff --git a/providers/cxgb4/verbs.c b/providers/cxgb4/verbs.c
index 32ed44c63d8402..e7620dc02ae0a7 100644
--- a/providers/cxgb4/verbs.c
+++ b/providers/cxgb4/verbs.c
@@ -573,6 +573,8 @@ static void reset_qp(struct c4iw_qp *qhp)
 	qhp->wq.rq.cidx = qhp->wq.rq.pidx = qhp->wq.rq.in_use = 0;
 	qhp->wq.sq.oldest_read = NULL;
 	memset(qhp->wq.sq.queue, 0, qhp->wq.sq.memsize);
+	if (t4_sq_onchip(&qhp->wq))
+		mmio_flush_writes();
 	memset(qhp->wq.rq.queue, 0, qhp->wq.rq.memsize);
 }
 
-- 
2.7.4