So I think this is conceptually fine, but I still find the API a little arcane. What do you think about the following incremental patch? If that looks good and tests good for you I can apply the series with the modifications: diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0befaad788a094..02579f4f776c7d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -355,6 +355,21 @@ void nvme_complete_rq(struct request *req) } EXPORT_SYMBOL_GPL(nvme_complete_rq); +/* + * Called to unwind from ->queue_rq on a failed command submission so that the + * multipathing code gets called to potentially failover to another path. + * The caller needs to unwind all transport specific resource allocations and + * must return propagate the return value. + */ +blk_status_t nvme_host_path_error(struct request *req) +{ + nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; + blk_mq_set_request_complete(req); + nvme_complete_rq(req); + return BLK_STS_OK; +} +EXPORT_SYMBOL_GPL(nvme_host_path_error); + bool nvme_cancel_request(struct request *req, void *data, bool reserved) { dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index cedf9b31898673..5dfd806fc2d28c 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -552,11 +552,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; - - nvme_req(rq)->status = NVME_SC_HOST_PATH_ERROR; - blk_mq_set_request_complete(rq); - nvme_complete_rq(rq); - return BLK_STS_OK; + return nvme_host_path_error(rq); } EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a72f0718109100..5819f038104149 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -575,6 +575,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) } void nvme_complete_rq(struct request *req); +blk_status_t nvme_host_path_error(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); void nvme_cancel_tagset(struct nvme_ctrl *ctrl); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6993efb27b39f0..f51af5e4970a2b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2091,16 +2091,6 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr ? &req->reg_wr.wr : NULL); if (unlikely(err)) { - if (err == -EIO) { - /* - * Fail the reqest so upper layer can failover I/O - * if another path is available - */ - req->status = NVME_SC_HOST_PATH_ERROR; - blk_mq_set_request_complete(rq); - nvme_rdma_complete_rq(rq); - return BLK_STS_OK; - } goto err_unmap; } @@ -2109,7 +2099,9 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, err_unmap: nvme_rdma_unmap_data(queue, rq); err: - if (err == -ENOMEM || err == -EAGAIN) + if (err == -EIO) + ret = nvme_host_path_error(rq); + else if (err == -ENOMEM || err == -EAGAIN) ret = BLK_STS_RESOURCE; else ret = BLK_STS_IOERR;