> Meantime please try the following patch and see if a difference can be made.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 49d73d979cb3..d2abec3b0f60 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct request *rq)
>  	 * So complete IO reqeust in softirq context in case of single queue
>  	 * for not degrading IO performance by irqsoff latency.
>  	 */
> -	if (q->nr_hw_queues == 1) {
> +	if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & BLK_MQ_F_HOST_TAGS)) {
>  		__blk_complete_request(rq);
>  		return;
>  	}
> @@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>  		/* bypass scheduler for flush rq */
>  		blk_insert_flush(rq);
>  		blk_mq_run_hw_queue(data.hctx, true);
> -	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
> +	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
> +			(data.hctx->flags & BLK_MQ_F_HOST_TAGS))) {
>  		/*
>  		 * Use plugging if we have a ->commit_rqs() hook as well, as
>  		 * we know the driver uses bd->last in a smart fashion.

Ming - I tried the above patch and saw no improvement in performance. Below is the perf record data - the lock contention occurs while getting the tag (blk_mq_get_tag):

    6.67%  6.67%  fio  [kernel.vmlinux]  [k] native_queued_spin_lock_slowpath
    - 6.67% io_submit
       - 6.66% entry_SYSCALL_64
          - do_syscall_64
             - 6.66% __x64_sys_io_submit
                - 6.66% io_submit_one
                   - 6.66% aio_read
                      - 6.66% generic_file_read_iter
                         - 6.66% blkdev_direct_IO
                            - 6.65% submit_bio
                               - generic_make_request
                                  - 6.65% blk_mq_make_request
                                     - 6.65% blk_mq_get_request
                                        - 6.65% blk_mq_get_tag
                                           - 6.58% prepare_to_wait_exclusive
                                              - 6.57% _raw_spin_lock_irqsave
                                                   queued_spin_lock_slowpath

> thanks,
> Ming