[PATCH v1] blk-mq: add one blk_mq_req_flags_t type to support mq ctx fallback

zhuxiaohui <zhuxiaohui400@xxxxxxxxx> · Sun, 20 Oct 2024 22:40:41 +0800

From: Zhu Xiaohui <zhuxiaohui.400@xxxxxxxxxxxxx>

It is observed that 'nvme connect' to an NVMe over Fabrics target
always fails when 'nohz_full' is set.

Commit a46c27026da1 ("blk-mq: don't schedule block kworker on
isolated CPUs") clears the isolated CPUs from hctx->cpumask, so
when nvme connects to a remote target, it may fail with this stack:

        blk_mq_alloc_request_hctx+1
        __nvme_submit_sync_cmd+106
        nvmf_connect_io_queue+181
        nvme_tcp_start_queue+293
        nvme_tcp_setup_ctrl+948
        nvme_tcp_create_ctrl+735
        nvmf_dev_write+532
        vfs_write+237
        ksys_write+107
        do_syscall_64+128
        entry_SYSCALL_64_after_hwframe+118

because the given blk_mq_hw_ctx->cpumask has been cleared, leaving no
available blk_mq_ctx on the hw queue.

Introduce a new blk_mq_req_flags_t flag, 'BLK_MQ_REQ_ARB_MQ', as well
as a nvme_submit_flags_t flag, 'NVME_SUBMIT_ARB_MQ', which indicate
that the block layer may fall back to a blk_mq_ctx whose CPU is not
isolated.

Signed-off-by: Zhu Xiaohui <zhuxiaohui.400@xxxxxxxxxxxxx>
---
 block/blk-mq.c              | 12 ++++++++++--
 drivers/nvme/host/core.c    |  5 ++++-
 drivers/nvme/host/fabrics.c |  3 ++-
 drivers/nvme/host/nvme.h    |  2 ++
 include/linux/blk-mq.h      |  4 +++-
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf626e061dd7..e4e791fd6d80 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -654,8 +654,16 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	if (!blk_mq_hw_queue_mapped(data.hctx))
 		goto out_queue_exit;
 	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
-	if (cpu >= nr_cpu_ids)
-		goto out_queue_exit;
+	if (cpu >= nr_cpu_ids) {
+		if (!(flags & BLK_MQ_REQ_ARB_MQ))
+			goto out_queue_exit;
+		/* fallback to the first cpu not isolated */
+		for_each_online_cpu(cpu) {
+			if (!cpu_is_isolated(cpu))
+				break;
+		}
+	}
+
 	data.ctx = __blk_mq_get_ctx(q, cpu);
 
 	if (q->elevator)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 84cb859a911d..dbb9cb59e54c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1130,9 +1130,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		blk_flags |= BLK_MQ_REQ_RESERVED;
 	if (qid == NVME_QID_ANY)
 		req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
-	else
+	else {
+		if (flags & NVME_SUBMIT_ARB_MQ)
+			blk_flags |= BLK_MQ_REQ_ARB_MQ;
 		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
 						qid - 1);
+	}
 
 	if (IS_ERR(req))
 		return PTR_ERR(req);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 432efcbf9e2f..ef34958e33c0 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -539,7 +539,8 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 			data, sizeof(*data), qid,
 			NVME_SUBMIT_AT_HEAD |
 			NVME_SUBMIT_RESERVED |
-			NVME_SUBMIT_NOWAIT);
+			NVME_SUBMIT_NOWAIT |
+			NVME_SUBMIT_ARB_MQ);
 	if (ret) {
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				       &cmd, data);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423f536..a61b35b1cd90 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -880,6 +880,8 @@ enum {
 	NVME_SUBMIT_RESERVED = (__force nvme_submit_flags_t)(1 << 2),
 	/* Retry command when NVME_STATUS_DNR is not set in the result */
 	NVME_SUBMIT_RETRY = (__force nvme_submit_flags_t)(1 << 3),
+	/* Submit command with arbitrary mq ctx */
+	NVME_SUBMIT_ARB_MQ = (__force nvme_submit_flags_t)(1 << 4),
 };
 
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4fecf46ef681..d14be341ea4b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -746,6 +746,8 @@ enum {
 	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
 	/* set RQF_PM */
 	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
+	/* use arbitrary mq ctx */
+	BLK_MQ_REQ_ARB_MQ	= (__force blk_mq_req_flags_t)(1 << 3),
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
@@ -824,7 +826,7 @@ static inline int blk_mq_request_completed(struct request *rq)
 }
 
 /*
- * 
+ *
  * Set the state to complete when completing a request from inside ->queue_rq.
  * This is used by drivers that want to ensure special complete actions that
  * need access to the request are called on failure, e.g. by nvme for
-- 
2.39.5