In the case of a small NVMe-oF queue size (<32) we may enter a deadlock:
send completions are only signalled once every 32 posted requests, so with
fewer than 32 slots in the send queue no completion is ever signalled and
the send queue fills up. The error is seen as (using mlx5):

[ 2048.693355] mlx5_0:mlx5_ib_post_send:3765:(pid 7273):
[ 2048.693360] nvme nvme1: nvme_rdma_post_send failed with error code -12

This patch changes the signalling so that it depends on the queue depth.
The hardcoded magic value has been removed completely.

Signed-off-by: Marta Rybczynska <marta.rybczynska@xxxxxxxxx>
Signed-off-by: Samuel Jones <sjones@xxxxxxxxx>
---
Changes in v3:
* avoid division in the fast path
* reverse the sig_count logic to simplify the code: it now counts down
  from queue depth/2 to 0
* change sig_count to int to avoid overflows for big queues

Changes in v2:
* signal by queue size/2, remove the hardcoded 32
* support queue depth of 1

 drivers/nvme/host/rdma.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3d25add..ee6f747 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -88,7 +88,7 @@ enum nvme_rdma_queue_flags {
 
 struct nvme_rdma_queue {
 	struct nvme_rdma_qe	*rsp_ring;
-	u8			sig_count;
+	int			sig_count;
 	int			queue_size;
 	size_t			cmnd_capsule_len;
 	struct nvme_rdma_ctrl	*ctrl;
@@ -251,6 +251,15 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 	return queue->cm_error;
 }
 
+static inline int nvme_rdma_init_sig_count(int queue_size)
+{
+	/* We signal completion every queue depth/2 and also
+	 * handle the case of possible device with queue_depth=1,
+	 * where we would need to signal every message.
+	 */
+	return max(queue_size / 2, 1);
+}
+
 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 {
 	struct nvme_rdma_device *dev = queue->device;
@@ -556,6 +565,8 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
 
 	queue->queue_size = queue_size;
 
+	queue->sig_count = nvme_rdma_init_sig_count(queue_size);
+
 	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 			RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(queue->cm_id)) {
@@ -1011,6 +1022,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 		nvme_rdma_wr_error(cq, wc, "SEND");
 }
 
+static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
+{
+	queue->sig_count--;
+	if (queue->sig_count == 0) {
+		queue->sig_count = nvme_rdma_init_sig_count(queue->queue_size);
+		return 1;
+	}
+	return 0;
+}
+
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
 		struct ib_send_wr *first, bool flush)
@@ -1038,9 +1059,6 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	 * Would have been way to obvious to handle this in hardware or
 	 * at least the RDMA stack..
 	 *
-	 * This messy and racy code sniplet is copy and pasted from the iSER
-	 * initiator, and the magic '32' comes from there as well.
-	 *
 	 * Always signal the flushes. The magic request used for the flush
 	 * sequencer is not allocated in our driver's tagset and it's
 	 * triggered to be freed by blk_cleanup_queue(). So we need to
@@ -1048,7 +1066,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	 * embeded in request's payload, is not freed when __ib_process_cq()
 	 * calls wr_cqe->done().
 	 */
-	if ((++queue->sig_count % 32) == 0 || flush)
+	if (nvme_rdma_queue_sig_limit(queue) || flush)
 		wr.send_flags |= IB_SEND_SIGNALED;
 
 	if (first)
-- 
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
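
[Editorial note, not part of the patch] For readers who want to see the
new countdown signalling in isolation, here is a minimal userspace sketch
of the same logic. The struct and function names (sketch_queue,
init_sig_count, queue_sig_limit) are simplified stand-ins for the driver's
types, and max() is open-coded because the kernel header is not available
outside the tree.

#include <stdio.h>

/* Stand-in for struct nvme_rdma_queue; only the fields used here. */
struct sketch_queue {
	int sig_count;
	int queue_size;
};

/* Mirrors nvme_rdma_init_sig_count(): signal every queue_size/2 sends,
 * and at least every send when the queue depth is 1. */
static int init_sig_count(int queue_size)
{
	return (queue_size / 2 > 1) ? queue_size / 2 : 1;
}

/* Mirrors nvme_rdma_queue_sig_limit(): count down, and when the counter
 * hits zero reload it and report that this send should be signalled. */
static int queue_sig_limit(struct sketch_queue *queue)
{
	queue->sig_count--;
	if (queue->sig_count == 0) {
		queue->sig_count = init_sig_count(queue->queue_size);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct sketch_queue queue = { .queue_size = 4 };
	int i;

	queue.sig_count = init_sig_count(queue.queue_size);

	/* With queue_size = 4 every 2nd post requests a completion;
	 * with queue_size = 1 every post would. */
	for (i = 1; i <= 8; i++)
		printf("post %d: signalled=%d\n", i, queue_sig_limit(&queue));

	return 0;
}

Compiled with a plain C compiler this prints a signalled completion on
every second post for a depth-4 queue, which is the behaviour the patch
relies on to keep the send queue from filling up on small queue depths.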