From: Zhu Xiaohui <zhuxiaohui.400@xxxxxxxxxxxxx> It is observed that nvme connect to an NVMe over Fabrics target will always fail when 'nohz_full' is set. Commit a46c27026da1 ("blk-mq: don't schedule block kworker on isolated CPUs") clears hctx->cpumask for all isolated CPUs, so when nvme connects to a remote target, it may fail on this stack: blk_mq_alloc_request_hctx+1 __nvme_submit_sync_cmd+106 nvmf_connect_io_queue+181 nvme_tcp_start_queue+293 nvme_tcp_setup_ctrl+948 nvme_tcp_create_ctrl+735 nvmf_dev_write+532 vfs_write+237 ksys_write+107 do_syscall_64+128 entry_SYSCALL_64_after_hwframe+118 because the given blk_mq_hw_ctx->cpumask has been cleared, leaving no available blk_mq_ctx on the hw queue. Introduce a new blk_mq_req_flags_t flag 'BLK_MQ_REQ_ARB_MQ' as well as an nvme_submit_flags_t flag 'NVME_SUBMIT_ARB_MQ', which are used to indicate that the block layer can fall back to a blk_mq_ctx whose CPU is not isolated. Signed-off-by: Zhu Xiaohui <zhuxiaohui.400@xxxxxxxxxxxxx> --- block/blk-mq.c | 12 ++++++++++-- drivers/nvme/host/core.c | 5 ++++- drivers/nvme/host/fabrics.c | 3 ++- drivers/nvme/host/nvme.h | 2 ++ include/linux/blk-mq.h | 4 +++- 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index cf626e061dd7..e4e791fd6d80 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -654,8 +654,16 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); - if (cpu >= nr_cpu_ids) - goto out_queue_exit; + if (cpu >= nr_cpu_ids) { + if (!(flags & BLK_MQ_REQ_ARB_MQ)) + goto out_queue_exit; + /* fallback to the first cpu not isolated */ + for_each_online_cpu(cpu) { + if (!cpu_is_isolated(cpu)) + break; + } + } + data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 84cb859a911d..dbb9cb59e54c 100644 --- a/drivers/nvme/host/core.c 
+++ b/drivers/nvme/host/core.c @@ -1130,9 +1130,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, blk_flags |= BLK_MQ_REQ_RESERVED; if (qid == NVME_QID_ANY) req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags); - else + else { + if (flags & NVME_SUBMIT_ARB_MQ) + blk_flags |= BLK_MQ_REQ_ARB_MQ; req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags, qid - 1); + } if (IS_ERR(req)) return PTR_ERR(req); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 432efcbf9e2f..ef34958e33c0 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -539,7 +539,8 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) data, sizeof(*data), qid, NVME_SUBMIT_AT_HEAD | NVME_SUBMIT_RESERVED | - NVME_SUBMIT_NOWAIT); + NVME_SUBMIT_NOWAIT | + NVME_SUBMIT_ARB_MQ); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 093cb423f536..a61b35b1cd90 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -880,6 +880,8 @@ enum { NVME_SUBMIT_RESERVED = (__force nvme_submit_flags_t)(1 << 2), /* Retry command when NVME_STATUS_DNR is not set in the result */ NVME_SUBMIT_RETRY = (__force nvme_submit_flags_t)(1 << 3), + /* Submit command with arbitrary mq ctx */ + NVME_SUBMIT_ARB_MQ = (__force nvme_submit_flags_t)(1 << 4), }; int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4fecf46ef681..d14be341ea4b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -746,6 +746,8 @@ enum { BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), + /* use arbitrary mq ctx */ + BLK_MQ_REQ_ARB_MQ = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, @@ -824,7 +826,7 
@@ static inline int blk_mq_request_completed(struct request *rq) } /* - * + * * Set the state to complete when completing a request from inside ->queue_rq. * This is used by drivers that want to ensure special complete actions that * need access to the request are called on failure, e.g. by nvme for -- 2.39.5