Prefer the local CPU when it is mapped to the hardware queue, in order to avoid the risk of overloading the first mapped CPU when requests have to be completed on the CPU where they were dispatched. Signed-off-by: Hillf Danton <hdanton@xxxxxxxx> --- --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -426,7 +426,6 @@ struct request *blk_mq_alloc_request_hct { struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; - unsigned int cpu; int ret; /* @@ -454,8 +453,14 @@ struct request *blk_mq_alloc_request_hct blk_queue_exit(q); return ERR_PTR(-EXDEV); } - cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); - alloc_data.ctx = __blk_mq_get_ctx(q, cpu); + + /* prefer local cpu if it's mapped to hw queue */ + if (!cpumask_test_cpu(raw_smp_processor_id(), alloc_data.hctx->cpumask)) { + unsigned int cpu; + cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); + if (cpu < nr_cpu_ids) + alloc_data.ctx = __blk_mq_get_ctx(q, cpu); + } rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q);