Signed-off-by: Bernard Metzler <bmt@xxxxxxxxxxxxxx> --- drivers/infiniband/sw/siw/siw_qp_rx.c | 1381 +++++++++++++++++++++++++++++++++ 1 file changed, 1381 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp_rx.c diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c new file mode 100644 index 000000000000..21c07c6f1bf9 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -0,0 +1,1381 @@ +/* + * Software iWARP device driver for Linux + * + * Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx> + * Fredy Neeser <nfd@xxxxxxxxxxxxxx> + * + * Copyright (c) 2008-2017, IBM Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of IBM nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/highmem.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <net/tcp.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_user_verbs.h> + +#include "siw.h" +#include "siw_obj.h" +#include "siw_cm.h" + + +/* + * ---------------------------- + * DDP reassembly for Softiwarp + * ---------------------------- + * For the ordering of transmitted DDP segments, the relevant iWARP ordering + * rules are as follows: + * + * - RDMAP (RFC 5040): Section 7.5, Rule 17: + * "RDMA Read Response Message processing at the Remote Peer (reading + * the specified Tagged Buffer) MUST be started only after the RDMA + * Read Request Message has been Delivered by the DDP layer (thus, + * all previous RDMA Messages have been properly submitted for + * ordered Placement)." + * + * - DDP (RFC 5041): Section 5.3: + * "At the Data Source, DDP: + * o MUST transmit DDP Messages in the order they were submitted to + * the DDP layer, + * o SHOULD transmit DDP Segments within a DDP Message in increasing + * MO order for Untagged DDP Messages, and in increasing TO order + * for Tagged DDP Messages." + * + * Combining these rules implies that, although RDMAP does not provide + * ordering between operations that are generated from the two ends of an + * RDMAP stream, DDP *must not* transmit an RDMA Read Response Message before + * it has finished transmitting SQ operations that were already submitted + * to the DDP layer. It follows that an iWARP transmitter must fully + * serialize RDMAP messages belonging to the same QP. + * + * Given that a TCP socket receives DDP segments in peer transmit order, + * we obtain the following ordering of received DDP segments: + * + * (i) the received DDP segments of RDMAP messages for the same QP + * cannot be interleaved + * (ii) the received DDP segments of a single RDMAP message *should* + * arrive in order. + * + * The Softiwarp transmitter obeys rule #2 in DDP Section 5.3. + * With this property, the "should" becomes a "must" in (ii) above, + * which simplifies DDP reassembly considerably. + * The Softiwarp receiver currently relies on this property + * and reports an error if DDP segments of the same RDMAP message + * do not arrive in sequence. + */ + +static inline int siw_crc_rxhdr(struct siw_iwarp_rx *ctx) +{ + crypto_shash_init(ctx->mpa_crc_hd); + + return siw_crc_array(ctx->mpa_crc_hd, (u8 *)&ctx->hdr, + ctx->fpdu_part_rcvd); +} + +/* + * siw_rx_umem() + * + * Receive data of @len into target referenced by @rctx. + * This function does not check if umem is within bounds requested by + * @len and @t_off. @umem_ends indicates if routine should + * not update chunk position pointers after the point it is + * currently receiving + * + * @rctx: Receive Context + * @umem: siw representation of target memory + * @dest_addr: 1, if rctx chunk pointer should not be updated after len. + */ +static int siw_rx_umem(struct siw_iwarp_rx *rctx, struct siw_umem *umem, + u64 dest_addr, int len) +{ + void *dest; + int pg_off = dest_addr & ~PAGE_MASK, + copied = 0, + bytes, + rv; + + while (len) { + struct page *p = siw_get_upage(umem, dest_addr); + + if (unlikely(!p)) { + pr_warn("%s: QP[%d]: bogus addr: %p, %p\n", + __func__, RX_QPID(rctx), + (void *)dest_addr, (void *)umem->fp_addr); + /* siw internal error */ + rctx->skb_copied += copied; + rctx->skb_new -= copied; + copied = -EFAULT; + + goto out; + } + + bytes = min(len, (int)PAGE_SIZE - pg_off); + dest = kmap_atomic(p); + + rv = skb_copy_bits(rctx->skb, rctx->skb_offset, dest + pg_off, + bytes); + + dprint(DBG_RX, + "(QP%d): skb_copy_bits():: Page %p, bytes=%u, rv=%d\n", + RX_QPID(rctx), p, bytes, rv); + + if (likely(!rv)) { + if (rctx->mpa_crc_hd) + rv = siw_crc_page(rctx->mpa_crc_hd, p, pg_off, + bytes); + + rctx->skb_offset += bytes; + copied += bytes; + len -= bytes; + dest_addr += bytes; + pg_off = 0; + } + kunmap_atomic(dest); + + if (unlikely(rv)) { + rctx->skb_copied += copied; + rctx->skb_new -= copied; + copied = -EFAULT; + + dprint(DBG_RX|DBG_ON, "(QP%d): failed with %d\n", + RX_QPID(rctx), rv); + + goto out; + } + } + /* + * store chunk position for resume + */ + rctx->skb_copied += copied; + rctx->skb_new -= copied; +out: + return copied; +} + +static inline int siw_rx_kva(struct siw_iwarp_rx *rctx, void *kva, int len) +{ + int rv; + + dprint(DBG_RX, "(QP%d): receive %d bytes into %p\n", RX_QPID(rctx), + len, kva); + + rv = skb_copy_bits(rctx->skb, rctx->skb_offset, kva, len); + if (likely(!rv)) { + rctx->skb_offset += len; + rctx->skb_copied += len; + rctx->skb_new -= len; + if (rctx->mpa_crc_hd) { + rv = siw_crc_array(rctx->mpa_crc_hd, kva, len); + if (rv) + goto error; + } + return len; + } + dprint(DBG_ON, "(QP%d): failed: len %d, addr %p, rv %d\n", + RX_QPID(rctx), len, kva, rv); +error: + return rv; +} + +static int siw_rx_pbl(struct siw_iwarp_rx *rctx, struct siw_mr *mr, + u64 addr, int len) +{ + struct siw_pbl *pbl = mr->pbl; + u64 offset = addr - mr->mem.va; + int copied = 0; + + while (len) { + int bytes; + u64 buf_addr = siw_pbl_get_buffer(pbl, offset, &bytes, + &rctx->pbl_idx); + if (buf_addr == 0) + break; + bytes = min(bytes, len); + if (siw_rx_kva(rctx, (void *)buf_addr, bytes) == bytes) { + copied += bytes; + offset += bytes; + len -= bytes; + } else + break; + } + return copied; +} + +/* + * siw_rresp_check_ntoh() + * + * Check incoming RRESP fragment header against expected + * header values and update expected values for potential next + * fragment. + * + * NOTE: This function must be called only if a RRESP DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segement. + */ +static inline int siw_rresp_check_ntoh(struct siw_iwarp_rx *rctx) +{ + struct iwarp_rdma_rresp *rresp = &rctx->hdr.rresp; + struct siw_wqe *wqe = &rctx->wqe_active; + + u32 sink_stag = be32_to_cpu(rresp->sink_stag); + u64 sink_to = be64_to_cpu(rresp->sink_to); + + if (rctx->first_ddp_seg) { + rctx->ddp_stag = wqe->sqe.sge[0].lkey; + rctx->ddp_to = wqe->sqe.sge[0].laddr; + rctx->pbl_idx = 0; + } + if (rctx->ddp_stag != sink_stag) { + dprint(DBG_RX|DBG_ON, + " received STAG=%08x, expected STAG=%08x\n", + sink_stag, rctx->ddp_stag); + return -EINVAL; + } + if (rctx->ddp_to != sink_to) { + dprint(DBG_RX|DBG_ON, + " received TO=%016llx, expected TO=%016llx\n", + (unsigned long long)sink_to, + (unsigned long long)rctx->ddp_to); + return -EINVAL; + } + if (!rctx->more_ddp_segs && (wqe->processed + rctx->fpdu_part_rem + != wqe->bytes)) { + dprint(DBG_RX|DBG_ON, + " RRESP len error, peer sent %d, RREQ asked %d\n", + wqe->processed + rctx->fpdu_part_rem, wqe->bytes); + return -EINVAL; + } + return 0; +} + +/* + * siw_write_check_ntoh() + * + * Check incoming WRITE fragment header against expected + * header values and update expected values for potential next + * fragment + * + * NOTE: This function must be called only if a WRITE DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segement. + */ +static inline int siw_write_check_ntoh(struct siw_iwarp_rx *rctx) +{ + struct iwarp_rdma_write *write = &rctx->hdr.rwrite; + + u32 sink_stag = be32_to_cpu(write->sink_stag); + u64 sink_to = be64_to_cpu(write->sink_to); + + if (rctx->first_ddp_seg) { + rctx->ddp_stag = sink_stag; + rctx->ddp_to = sink_to; + rctx->pbl_idx = 0; + } else { + if (rctx->ddp_stag != sink_stag) { + dprint(DBG_RX|DBG_ON, + " received STAG=%08x, expected STAG=%08x\n", + sink_stag, rctx->ddp_stag); + return -EINVAL; + } + if (rctx->ddp_to != sink_to) { + dprint(DBG_RX|DBG_ON, + " received TO=%016llx, expected TO=%016llx\n", + (unsigned long long)sink_to, + (unsigned long long)rctx->ddp_to); + return -EINVAL; + } + } + return 0; +} + +/* + * siw_send_check_ntoh() + * + * Check incoming SEND fragment header against expected + * header values and update expected MSN if no next + * fragment expected + * + * NOTE: This function must be called only if a SEND DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segement. + */ +static inline int siw_send_check_ntoh(struct siw_iwarp_rx *rctx) +{ + struct iwarp_send_inv *send = &rctx->hdr.send_inv; + struct siw_wqe *wqe = &rctx->wqe_active; + + u32 ddp_msn = be32_to_cpu(send->ddp_msn); + u32 ddp_mo = be32_to_cpu(send->ddp_mo); + u32 ddp_qn = be32_to_cpu(send->ddp_qn); + + if (ddp_qn != RDMAP_UNTAGGED_QN_SEND) { + dprint(DBG_RX|DBG_ON, " Invalid DDP QN %d for SEND\n", + ddp_qn); + return -EINVAL; + } + if (unlikely(ddp_msn != rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) { + dprint(DBG_RX|DBG_ON, " received MSN=%u, expected MSN=%u\n", + ddp_msn, rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + return -EINVAL; + } + if (unlikely(ddp_mo != wqe->processed)) { + dprint(DBG_RX|DBG_ON, " Received MO=%u, expected MO=%u\n", + ddp_mo, wqe->processed); + return -EINVAL; + } + if (rctx->first_ddp_seg) { + /* initialize user memory write position */ + rctx->sge_idx = 0; + rctx->sge_off = 0; + rctx->pbl_idx = 0; + /* only valid for SEND_INV and SEND_SE_INV operations */ + rctx->inval_stag = be32_to_cpu(send->inval_stag); + } + if (unlikely(wqe->bytes < wqe->processed + rctx->fpdu_part_rem)) { + dprint(DBG_RX|DBG_ON, " Receive space short: (%d - %d) < %d\n", + wqe->bytes, wqe->processed, rctx->fpdu_part_rem); + wqe->wc_status = SIW_WC_LOC_LEN_ERR; + return -EINVAL; + } + return 0; +} + +static struct siw_wqe *siw_rqe_get(struct siw_qp *qp) +{ + struct siw_rqe *rqe; + struct siw_srq *srq = qp->srq; + struct siw_wqe *wqe = NULL; + unsigned long flags; + bool srq_used = false; + + if (srq) { + /* + * 'srq_used' usage: + * convince gcc we know what we do. testing validity + * of 'srq' should be sufficient but gives + * "‘flags’ may be used uninitialized ..." later for unlock + */ + srq_used = true; + spin_lock_irqsave(&srq->lock, flags); + rqe = &srq->recvq[srq->rq_get % srq->num_rqe]; + } else + rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + + if (likely(rqe->flags == SIW_WQE_VALID)) { + int num_sge = rqe->num_sge; + + if (likely(num_sge <= SIW_MAX_SGE)) { + int i = 0; + + wqe = rx_wqe(qp); + rx_type(wqe) = SIW_OP_RECEIVE; + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->bytes = 0; + wqe->processed = 0; + + wqe->rqe.id = rqe->id; + wqe->rqe.num_sge = num_sge; + + while (i < num_sge) { + wqe->rqe.sge[i].laddr = rqe->sge[i].laddr; + wqe->rqe.sge[i].lkey = rqe->sge[i].lkey; + wqe->rqe.sge[i].length = rqe->sge[i].length; + wqe->bytes += wqe->rqe.sge[i].length; + wqe->mem[i].obj = NULL; + i++; + } + /* can be re-used by appl */ + smp_store_mb(rqe->flags, 0); + } else { + pr_info("RQE: too many SGE's: %d\n", rqe->num_sge); + goto out; + } + if (srq_used == false) + qp->rq_get++; + else { + if (srq->armed) { + /* Test SRQ limit */ + u32 off = (srq->rq_get + srq->limit) % + srq->num_rqe; + struct siw_rqe *rqe2 = &srq->recvq[off]; + + if (!(rqe2->flags & SIW_WQE_VALID)) { + srq->armed = 0; + siw_srq_event(srq, + IB_EVENT_SRQ_LIMIT_REACHED); + } + } + srq->rq_get++; + } + } +out: + if (srq_used) + spin_unlock_irqrestore(&srq->lock, flags); + + return wqe; +} + +/* + * siw_proc_send: + * + * Process one incoming SEND and place data into memory referenced by + * receive wqe. + * + * Function supports partially received sends (suspending/resuming + * current receive wqe processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_send(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + struct siw_wqe *wqe; + struct siw_sge *sge; + u32 data_bytes, /* all data bytes available */ + rcvd_bytes; /* sum of data bytes rcvd */ + int rv = 0; + + if (rctx->first_ddp_seg) { + wqe = siw_rqe_get(qp); + if (unlikely(!wqe)) + return -ENOENT; + } else + wqe = rx_wqe(qp); + + if (rctx->state == SIW_GET_DATA_START) { + rv = siw_send_check_ntoh(rctx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + if (!rctx->fpdu_part_rem) /* zero length SEND */ + return 0; + } + data_bytes = min(rctx->fpdu_part_rem, rctx->skb_new); + rcvd_bytes = 0; + + /* A zero length SEND will skip below loop */ + while (data_bytes) { + struct siw_pd *pd; + struct siw_mr *mr; + union siw_mem_resolved *mem; + u32 sge_bytes; /* data bytes avail for SGE */ + + sge = &wqe->rqe.sge[rctx->sge_idx]; + + if (!sge->length) { + /* just skip empty sge's */ + rctx->sge_idx++; + rctx->sge_off = 0; + rctx->pbl_idx = 0; + continue; + } + sge_bytes = min(data_bytes, sge->length - rctx->sge_off); + mem = &wqe->mem[rctx->sge_idx]; + + /* + * check with QP's PD if no SRQ present, SRQ's PD otherwise + */ + pd = qp->srq == NULL ? qp->pd : qp->srq->pd; + + rv = siw_check_sge(pd, sge, mem, SIW_MEM_LWRITE, rctx->sge_off, + sge_bytes); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + break; + } + mr = siw_mem2mr(mem->obj); + if (mr->mem_obj == NULL) + rv = siw_rx_kva(rctx, + (void *)(sge->laddr + rctx->sge_off), + sge_bytes); + else if (!mr->mem.is_pbl) + rv = siw_rx_umem(rctx, mr->umem, + sge->laddr + rctx->sge_off, sge_bytes); + else + rv = siw_rx_pbl(rctx, mr, + sge->laddr + rctx->sge_off, sge_bytes); + + if (unlikely(rv != sge_bytes)) { + wqe->processed += rcvd_bytes; + return -EINVAL; + } + rctx->sge_off += rv; + + if (rctx->sge_off == sge->length) { + rctx->sge_idx++; + rctx->sge_off = 0; + rctx->pbl_idx = 0; + } + data_bytes -= rv; + rcvd_bytes += rv; + + rctx->fpdu_part_rem -= rv; + rctx->fpdu_part_rcvd += rv; + } + wqe->processed += rcvd_bytes; + + if (!rctx->fpdu_part_rem) + return 0; + + return (rv < 0) ? rv : -EAGAIN; +} + +/* + * siw_proc_write: + * + * Place incoming WRITE after referencing and checking target buffer + + * Function supports partially received WRITEs (suspending/resuming + * current receive processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ + +int siw_proc_write(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + struct siw_dev *dev = qp->hdr.sdev; + struct siw_mem *mem; + struct siw_mr *mr; + int bytes, + rv; + + if (rctx->state == SIW_GET_DATA_START) { + + if (!rctx->fpdu_part_rem) /* zero length WRITE */ + return 0; + + rv = siw_write_check_ntoh(rctx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } + bytes = min(rctx->fpdu_part_rem, rctx->skb_new); + + if (rctx->first_ddp_seg) { + rx_mem(qp) = siw_mem_id2obj(dev, rctx->ddp_stag >> 8); + rx_wqe(qp)->wr_status = SIW_WR_INPROGRESS; + rx_type(rx_wqe(qp)) = SIW_OP_WRITE; + } + if (unlikely(!rx_mem(qp))) { + dprint(DBG_RX|DBG_ON, + "(QP%d): Sink STag not found/invalid, STag=0x%08x\n", + QP_ID(qp), rctx->ddp_stag); + return -EINVAL; + } + mem = rx_mem(qp); + /* + * Rtag not checked against mem's tag again because + * hdr check guarantees same tag as before if fragmented + */ + rv = siw_check_mem(qp->pd, mem, rctx->ddp_to + rctx->fpdu_part_rcvd, + SIW_MEM_RWRITE, bytes); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + return rv; + } + + mr = siw_mem2mr(mem); + if (mr->mem_obj == NULL) + rv = siw_rx_kva(rctx, + (void *)(rctx->ddp_to + rctx->fpdu_part_rcvd), + bytes); + else if (!mr->mem.is_pbl) + rv = siw_rx_umem(rctx, mr->umem, + rctx->ddp_to + rctx->fpdu_part_rcvd, bytes); + else + rv = siw_rx_pbl(rctx, mr, + rctx->ddp_to + rctx->fpdu_part_rcvd, bytes); + + if (unlikely(rv != bytes)) + return -EINVAL; + + rctx->fpdu_part_rem -= rv; + rctx->fpdu_part_rcvd += rv; + + if (!rctx->fpdu_part_rem) { + rctx->ddp_to += rctx->fpdu_part_rcvd; + return 0; + } + + return (rv < 0) ? rv : -EAGAIN; +} + +/* + * inbound RREQ's cannot carry user data. + */ +int siw_proc_rreq(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + if (!rctx->fpdu_part_rem) + return 0; + + dprint(DBG_ON|DBG_RX, "(QP%d): RREQ with MPA len %d\n", QP_ID(qp), + be16_to_cpu(rctx->hdr.ctrl.mpa_len)); + + return -EPROTO; +} + +/* + * siw_init_rresp: + * + * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE. + * Put it at the tail of the IRQ, if there is another WQE currently in + * transmit processing. If not, make it the current WQE to be processed + * and schedule transmit processing. + * + * Can be called from softirq context and from process + * context (RREAD socket loopback case!) + * + * return value: + * 0: success, + * failure code otherwise + */ + +static int siw_init_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + struct siw_wqe *tx_work = tx_wqe(qp); + struct siw_sqe *resp; + + uint64_t raddr = be64_to_cpu(rctx->hdr.rreq.sink_to), + laddr = be64_to_cpu(rctx->hdr.rreq.source_to); + uint32_t length = be32_to_cpu(rctx->hdr.rreq.read_size), + lkey = be32_to_cpu(rctx->hdr.rreq.source_stag), + rkey = be32_to_cpu(rctx->hdr.rreq.sink_stag); + int run_sq = 1, rv = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->sq_lock, flags); + + if (tx_work->wr_status == SIW_WR_IDLE) { + /* + * immediately schedule READ response w/o + * consuming IRQ entry: IRQ must be empty. + */ + tx_work->processed = 0; + tx_work->mem[0].obj = NULL; + tx_work->wr_status = SIW_WR_QUEUED; + resp = &tx_work->sqe; + } else { + resp = irq_alloc_free(qp); + run_sq = 0; + } + if (likely(resp)) { + resp->opcode = SIW_OP_READ_RESPONSE; + + resp->sge[0].length = length; + resp->sge[0].laddr = laddr; + resp->sge[0].lkey = lkey; + + resp->raddr = raddr; + resp->rkey = rkey; + resp->num_sge = length ? 1 : 0; + + /* RRESP now valid as current TX wqe or placed into IRQ */ + smp_store_mb(resp->flags, SIW_WQE_VALID); + } else { + dprint(DBG_RX|DBG_ON, ": QP[%d]: IRQ %d exceeded %d!\n", + QP_ID(qp), qp->irq_put % qp->attrs.irq_size, + qp->attrs.irq_size); + rv = -EPROTO; + } + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (run_sq) + siw_sq_start(qp); + + return rv; +} + +/* + * Only called at start of Read.Resonse processing. + * Transfer pending Read from tip of ORQ into currrent rx wqe, + * but keep ORQ entry valid until Read.Response processing done. + * No Queue locking needed. + */ +static int siw_orqe_start_rx(struct siw_qp *qp) +{ + struct siw_sqe *orqe; + struct siw_wqe *wqe = NULL; + + /* make sure ORQ indices are current */ + smp_mb(); + + orqe = orq_get_current(qp); + if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) { + wqe = rx_wqe(qp); + wqe->sqe.id = orqe->id; + wqe->sqe.opcode = orqe->opcode; + wqe->sqe.sge[0].laddr = orqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = orqe->sge[0].lkey; + wqe->sqe.sge[0].length = orqe->sge[0].length; + wqe->sqe.flags = orqe->flags; + wqe->sqe.num_sge = 1; + wqe->bytes = orqe->sge[0].length; + wqe->processed = 0; + wqe->mem[0].obj = NULL; + /* make sure WQE is completely written before valid */ + smp_wmb(); + wqe->wr_status = SIW_WR_INPROGRESS; + + return 0; + } + return -EPROTO; +} + + +/* + * siw_proc_rresp: + * + * Place incoming RRESP data into memory referenced by RREQ WQE + * which is at the tip of the ORQ + * + * Function supports partially received RRESP's (suspending/resuming + * current receive processing) + */ +int siw_proc_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + struct siw_wqe *wqe = rx_wqe(qp); + union siw_mem_resolved *mem; + struct siw_sge *sge; + struct siw_mr *mr; + int bytes, + rv; + + if (rctx->first_ddp_seg) { + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + pr_warn("QP[%d]: Start RRESP: RX status %d, op %d\n", + QP_ID(qp), wqe->wr_status, + wqe->sqe.opcode); + rv = -EPROTO; + goto done; + } + /* + * fetch pending RREQ from orq + */ + rv = siw_orqe_start_rx(qp); + if (rv) { + dprint(DBG_RX|DBG_ON, "(QP%d): ORQ empty at idx %d\n", + QP_ID(qp), + qp->orq_get % qp->attrs.orq_size); + goto done; + } + rv = siw_rresp_check_ntoh(rctx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + goto done; + } + } else { + if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) { + pr_warn("QP[%d]: Resume RRESP: status %d\n", + QP_ID(qp), wqe->wr_status); + rv = -EPROTO; + goto done; + } + } + if (!rctx->fpdu_part_rem) /* zero length RRESPONSE */ + return 0; + + sge = wqe->sqe.sge; /* there is only one */ + mem = &wqe->mem[0]; + + if (mem->obj == NULL) { + /* + * check target memory which resolves memory on first fragment + */ + rv = siw_check_sge(qp->pd, sge, mem, SIW_MEM_LWRITE, 0, + wqe->bytes); + if (rv) { + dprint(DBG_RX|DBG_ON, "(QP%d): siw_check_sge: %d\n", + QP_ID(qp), rv); + wqe->wc_status = SIW_WC_LOC_PROT_ERR; + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + goto done; + } + } + bytes = min(rctx->fpdu_part_rem, rctx->skb_new); + + mr = siw_mem2mr(mem->obj); + if (mr->mem_obj == NULL) + rv = siw_rx_kva(rctx, (void *)(sge->laddr + wqe->processed), + bytes); + else if (!mr->mem.is_pbl) + rv = siw_rx_umem(rctx, mr->umem, sge->laddr + wqe->processed, + bytes); + else + rv = siw_rx_pbl(rctx, mr, sge->laddr + wqe->processed, + bytes); + if (rv != bytes) { + wqe->wc_status = SIW_WC_GENERAL_ERR; + rv = -EINVAL; + goto done; + } + rctx->fpdu_part_rem -= rv; + rctx->fpdu_part_rcvd += rv; + + wqe->processed += rv; + if (!rctx->fpdu_part_rem) { + rctx->ddp_to += rctx->fpdu_part_rcvd; + return 0; + } +done: + return (rv < 0) ? rv : -EAGAIN; +} + + +int siw_proc_unsupp(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + return -ECONNRESET; +} + + +int siw_proc_terminate(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + dprint(DBG_ON, " (QP%d): RX Terminate: type=%d, layer=%d, code=%d\n", + QP_ID(qp), + __rdmap_term_etype(&rctx->hdr.terminate), + __rdmap_term_layer(&rctx->hdr.terminate), + __rdmap_term_ecode(&rctx->hdr.terminate)); + + return -ECONNRESET; +} + + +static int siw_get_trailer(struct siw_qp *qp, struct siw_iwarp_rx *rctx) +{ + struct sk_buff *skb = rctx->skb; + u8 *tbuf = (u8 *)&rctx->trailer.crc - rctx->pad; + int avail; + + avail = min(rctx->skb_new, rctx->fpdu_part_rem); + + dprint(DBG_RX, " (QP%d): to recv %d, avail %d, pad %d, skb_new %d\n", + QP_ID(qp), rctx->fpdu_part_rem, avail, rctx->pad, + rctx->skb_new); + + skb_copy_bits(skb, rctx->skb_offset, + tbuf + rctx->fpdu_part_rcvd, avail); + + rctx->fpdu_part_rcvd += avail; + rctx->fpdu_part_rem -= avail; + + rctx->skb_new -= avail; + rctx->skb_offset += avail; + rctx->skb_copied += avail; + + if (!rctx->fpdu_part_rem) { + __be32 crc_in, crc_own = 0; + /* + * check crc if required + */ + if (!rctx->mpa_crc_hd) + return 0; + + if (rctx->pad && siw_crc_array(rctx->mpa_crc_hd, + tbuf, rctx->pad) != 0) + return -EINVAL; + + crypto_shash_final(rctx->mpa_crc_hd, (u8 *)&crc_own); + + /* + * CRC32 is computed, transmitted and received directly in NBO, + * so there's never a reason to convert byte order. + */ + crc_in = rctx->trailer.crc; + + if (crc_in != crc_own) { + dprint(DBG_RX|DBG_ON, + " (QP%d): CRC ERROR in:=%08x, own=%08x\n", + QP_ID(qp), crc_in, crc_own); + return -EINVAL; + } + return 0; + } + return -EAGAIN; +} + + +static int siw_get_hdr(struct siw_iwarp_rx *rctx) +{ + struct sk_buff *skb = rctx->skb; + struct iwarp_ctrl *c_hdr = &rctx->hdr.ctrl; + u8 opcode; + + int bytes; + + if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) { + /* + * copy first fix part of iwarp hdr + */ + bytes = min_t(int, rctx->skb_new, sizeof(struct iwarp_ctrl) + - rctx->fpdu_part_rcvd); + + skb_copy_bits(skb, rctx->skb_offset, + (char *)c_hdr + rctx->fpdu_part_rcvd, bytes); + + rctx->fpdu_part_rcvd += bytes; + + rctx->skb_new -= bytes; + rctx->skb_offset += bytes; + rctx->skb_copied += bytes; + + if (!rctx->skb_new || + rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) + return -EAGAIN; + + if (__ddp_version(c_hdr) != DDP_VERSION) { + dprint(DBG_RX|DBG_ON, " dversion %d\n", + __ddp_version(c_hdr)); + return -EINVAL; + } + if (__rdmap_version(c_hdr) != RDMAP_VERSION) { + dprint(DBG_RX|DBG_ON, " rversion %d\n", + __rdmap_version(c_hdr)); + return -EINVAL; + } + opcode = __rdmap_opcode(c_hdr); + + if (opcode > RDMAP_TERMINATE) { + dprint(DBG_RX|DBG_ON, " opcode %d\n", opcode); + return -EINVAL; + } + dprint(DBG_RX, "(QP%d): New Header, opcode:%d\n", + RX_QPID(rctx), opcode); + } else + opcode = __rdmap_opcode(c_hdr); + /* + * figure out len of current hdr: variable length of + * iwarp hdr forces us to copy hdr information + */ + bytes = min(rctx->skb_new, + iwarp_pktinfo[opcode].hdr_len - rctx->fpdu_part_rcvd); + + skb_copy_bits(skb, rctx->skb_offset, + (char *)c_hdr + rctx->fpdu_part_rcvd, bytes); + + rctx->fpdu_part_rcvd += bytes; + + rctx->skb_new -= bytes; + rctx->skb_offset += bytes; + rctx->skb_copied += bytes; + + if (rctx->fpdu_part_rcvd == iwarp_pktinfo[opcode].hdr_len) { + /* + * HDR receive completed. Check if the current DDP segment + * starts a new RDMAP message or continues a previously + * started RDMAP message. + * + * Note well from the comments on DDP reassembly: + * - Support for unordered reception of DDP segments + * (or FPDUs) from different RDMAP messages is not needed. + * - Unordered reception of DDP segments of the same + * RDMAP message is not supported. It is probably not + * needed with most peers. + */ + siw_dprint_hdr(&rctx->hdr, RX_QPID(rctx), "HDR received"); + + if (rctx->more_ddp_segs != 0) { + rctx->first_ddp_seg = 0; + if (rctx->prev_rdmap_opcode != opcode) { + dprint(DBG_ON, + "packet intersection: %d <> %d\n", + rctx->prev_rdmap_opcode, opcode); + return -EPROTO; + } + } else { + rctx->prev_rdmap_opcode = opcode; + rctx->first_ddp_seg = 1; + } + rctx->more_ddp_segs = + c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1; + + return 0; + } + return -EAGAIN; +} + +static inline int siw_fpdu_payload_len(struct siw_iwarp_rx *rctx) +{ + return be16_to_cpu(rctx->hdr.ctrl.mpa_len) - rctx->fpdu_part_rcvd + + MPA_HDR_SIZE; +} + +static inline int siw_fpdu_trailer_len(struct siw_iwarp_rx *rctx) +{ + int mpa_len = be16_to_cpu(rctx->hdr.ctrl.mpa_len) + MPA_HDR_SIZE; + + return MPA_CRC_SIZE + (-mpa_len & 0x3); +} + + + +static void siw_check_tx_fence(struct siw_qp *qp) +{ + struct siw_wqe *tx_waiting = tx_wqe(qp); + struct siw_sqe *rreq; + int resume_tx = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->orq_lock, flags); + + rreq = orq_get_current(qp); + + /* free current orq entry */ + smp_store_mb(rreq->flags, 0); + + if (qp->tx_ctx.orq_fence) { + if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { + pr_warn("QP[%d]: Resume from fence: status %d wrong\n", + QP_ID(qp), tx_waiting->wr_status); + goto out; + } + /* resume SQ processing */ + if (tx_waiting->sqe.opcode == SIW_OP_READ || + tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + + rreq = orq_get_tail(qp); + if (unlikely(!rreq)) { + pr_warn("QP[%d]: no ORQ\n", QP_ID(qp)); + goto out; + } + siw_read_to_orq(rreq, &tx_waiting->sqe); + + qp->orq_put++; + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + + } else if (siw_orq_empty(qp)) { + + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + } else + pr_warn("QP[%d]: Resume from fence: error: %d:%d\n", + QP_ID(qp), qp->orq_get, qp->orq_put); + } + qp->orq_get++; +out: + spin_unlock_irqrestore(&qp->orq_lock, flags); + + if (resume_tx) + siw_sq_start(qp); +} + +/* + * siw_rdmap_complete() + * + * Complete processing of an RDMA message after receiving all + * DDP segmens or ABort processing after encountering error case. + * + * o SENDs + RRESPs will need for completion, + * o RREQs need for READ RESPONSE initialization + * o WRITEs need memory dereferencing + * + * TODO: Failed WRITEs need local error to be surfaced. + */ + +static inline int +siw_rdmap_complete(struct siw_qp *qp, int error) +{ + struct siw_iwarp_rx *rctx = &qp->rx_ctx; + struct siw_wqe *wqe = rx_wqe(qp); + enum siw_wc_status wc_status = wqe->wc_status; + + u8 opcode = __rdmap_opcode(&rctx->hdr.ctrl); + int rv = 0; + + switch (opcode) { + + case RDMAP_SEND_SE: + case RDMAP_SEND_SE_INVAL: + wqe->rqe.flags |= SIW_WQE_SOLICITED; + case RDMAP_SEND: + case RDMAP_SEND_INVAL: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++; + + if (error != 0 && wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + + /* + * Handle STag invalidation request + */ + if (wc_status == SIW_WC_SUCCESS && + (opcode == RDMAP_SEND_INVAL || + opcode == RDMAP_SEND_SE_INVAL)) { + rv = siw_invalidate_stag(qp->pd, rctx->inval_stag); + if (rv) + wc_status = SIW_WC_REM_INV_REQ_ERR; + } + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + wc_status); + siw_wqe_put_mem(wqe, SIW_OP_RECEIVE); + + break; + + case RDMAP_RDMA_READ_RESP: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++; + if (error != 0) { + if (rctx->state == SIW_GET_HDR || error == -ENODATA) + /* eventual RREQ in ORQ left untouched */ + break; + + if (wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + } else if (qp->kernel_verbs && + rx_type(wqe) == SIW_OP_READ_LOCAL_INV) { + /* + * Handle any STag invalidation request + */ + rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey); + if (rv && wc_status == SIW_WC_SUCCESS) { + wc_status = SIW_WC_GENERAL_ERR; + error = rv; + } + } + /* + * All errors turn the wqe into signalled. + */ + if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0) + rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed, + wc_status); + siw_wqe_put_mem(wqe, SIW_OP_READ); + + if (error == 0) + siw_check_tx_fence(qp); + break; + + case RDMAP_RDMA_READ_REQ: + if (error == 0) + rv = siw_init_rresp(qp, rctx); + + break; + + case RDMAP_RDMA_WRITE: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + /* + * Free References from memory object if + * attached to receive context (inbound WRITE). + * While a zero-length WRITE is allowed, + * no memory reference got created. + */ + if (rx_mem(qp)) { + siw_mem_put(rx_mem(qp)); + rx_mem(qp) = NULL; + } + break; + + default: + break; + } + wqe->wr_status = SIW_WR_IDLE; + + return rv; +} + +/* + * siw_tcp_rx_data() + * + * Main routine to consume inbound TCP payload + * + * @rd_desc: read descriptor + * @skb: socket buffer + * @off: offset in skb + * @len: skb->len - offset : payload in skb + */ +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len) +{ + struct siw_qp *qp = rd_desc->arg.data; + struct siw_iwarp_rx *rctx = &qp->rx_ctx; + int rv; + + rctx->skb = skb; + rctx->skb_new = skb->len - off; + rctx->skb_offset = off; + rctx->skb_copied = 0; + + dprint(DBG_RX, "(QP%d): new data %d\n", + QP_ID(qp), rctx->skb_new); + + while (rctx->skb_new) { + int run_completion = 1; + + if (unlikely(rctx->rx_suspend)) { + /* Do not process any more data */ + rctx->skb_copied += rctx->skb_new; + break; + } + switch (rctx->state) { + + case SIW_GET_HDR: + rv = siw_get_hdr(rctx); + if (!rv) { + if (rctx->mpa_crc_hd && + siw_crc_rxhdr(rctx) != 0) { + rv = -EINVAL; + break; + } + rctx->fpdu_part_rem = + siw_fpdu_payload_len(rctx); + + if (rctx->fpdu_part_rem) + rctx->pad = -rctx->fpdu_part_rem & 0x3; + else + rctx->pad = 0; + + rctx->state = SIW_GET_DATA_START; + rctx->fpdu_part_rcvd = 0; + } + break; + + case SIW_GET_DATA_MORE: + /* + * Another data fragment of the same DDP segment. + * Setting first_ddp_seg = 0 avoids repeating + * initializations that shall occur only once per + * DDP segment. + */ + rctx->first_ddp_seg = 0; + + case SIW_GET_DATA_START: + /* + * Headers will be checked by the opcode-specific + * data receive function below. + */ + rv = siw_rx_data(qp, rctx); + if (!rv) { + rctx->fpdu_part_rem = + siw_fpdu_trailer_len(rctx); + rctx->fpdu_part_rcvd = 0; + rctx->state = SIW_GET_TRAILER; + } else { + if (unlikely(rv == -ECONNRESET)) + run_completion = 0; + else + rctx->state = SIW_GET_DATA_MORE; + } + break; + + case SIW_GET_TRAILER: + /* + * read CRC + any padding + */ + rv = siw_get_trailer(qp, rctx); + if (!rv) { + /* + * FPDU completed. + * complete RDMAP message if last fragment + */ + rctx->state = SIW_GET_HDR; + rctx->fpdu_part_rcvd = 0; + + if (!(rctx->hdr.ctrl.ddp_rdmap_ctrl + & DDP_FLAG_LAST)) + /* more frags */ + break; + + rv = siw_rdmap_complete(qp, 0); + run_completion = 0; + } + break; + + default: + pr_warn("QP[%d]: RX out of state\n", QP_ID(qp)); + rv = -EPROTO; + run_completion = 0; + } + + if (unlikely(rv != 0 && rv != -EAGAIN)) { + /* + * TODO: implement graceful error handling including + * generation (and processing) of TERMINATE + * messages. + * + * for now we are left with a bogus rx status + * unable to receive any further byte. + * BUT: code must handle difference between + * errors: + * + * o protocol syntax (FATAL, framing lost) + * o crc (FATAL, framing lost since we do not + * trust packet header (??)) + * o local resource (maybe non fatal, framing + * not lost) + * + */ + if (rctx->state > SIW_GET_HDR && run_completion) + siw_rdmap_complete(qp, rv); + + dprint(DBG_RX|DBG_ON, + "(QP%d): RX ERROR %d at RX state %d\n", + QP_ID(qp), rv, rctx->state); + + siw_dprint_rctx(rctx); + /* + * Calling siw_cm_queue_work() is safe without + * releasing qp->state_lock because the QP state + * will be transitioned to SIW_QP_STATE_ERROR + * by the siw_work_handler() workqueue handler + * after we return from siw_qp_llp_data_ready(). + */ + siw_qp_cm_drop(qp, 1); + + break; + } + if (rv) { + dprint(DBG_RX, + "(QP%d): FPDU frag. state %d, missing %d\n", + QP_ID(qp), rctx->state, rctx->fpdu_part_rem); + break; + } + } + return rctx->skb_copied; +} -- 2.13.6 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html