A SCSI device often has a per-request_queue queue depth (.cmd_per_lun), which is actually shared among all hw queues, and this patchset calls this the shared queue depth. One principle of the scheduler is that we shouldn't dequeue requests from the sw/scheduler queue and dispatch them to the driver when the low-level queue is busy. For a SCSI device, whether the queue is busy depends on the per-request_queue limit, so we should hold all hw queues if the request queue is busy. This patch introduces a per-request_queue dispatch list for this purpose, and only when all requests in this list have been dispatched successfully can we resume dequeuing requests from the sw/scheduler queue and dispatching them to the LLD. Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> --- block/blk-mq.c | 8 +++++++- block/blk-mq.h | 14 +++++++++++--- include/linux/blkdev.h | 5 +++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c6624154bb37..db21e71bb087 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2670,8 +2670,14 @@ int blk_mq_update_sched_queue_depth(struct request_queue *q) * this queue depth limit */ if (q->queue_depth) { - queue_for_each_hw_ctx(q, hctx, i) + queue_for_each_hw_ctx(q, hctx, i) { hctx->flags |= BLK_MQ_F_SHARED_DEPTH; + hctx->dispatch_lock = &q->__mq_dispatch_lock; + hctx->dispatch_list = &q->__mq_dispatch_list; + + spin_lock_init(hctx->dispatch_lock); + INIT_LIST_HEAD(hctx->dispatch_list); + } } if (!q->elevator) diff --git a/block/blk-mq.h b/block/blk-mq.h index 86a35c799ca6..295fd9dfb01d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -139,19 +139,27 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_hctx_is_dispatch_busy(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { - return test_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state); + if (!(hctx->flags & BLK_MQ_F_SHARED_DEPTH)) + return test_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state); + return q->mq_dispatch_busy; } static inline void 
blk_mq_hctx_set_dispatch_busy(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { - set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state); + if (!(hctx->flags & BLK_MQ_F_SHARED_DEPTH)) + set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state); + else + q->mq_dispatch_busy = 1; } static inline void blk_mq_hctx_clear_dispatch_busy(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { - clear_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state); + if (!(hctx->flags & BLK_MQ_F_SHARED_DEPTH)) + clear_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state); + else + q->mq_dispatch_busy = 0; } static inline bool blk_mq_has_dispatch_rqs(struct blk_mq_hw_ctx *hctx) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..bc0e607710f2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -395,6 +395,11 @@ struct request_queue { atomic_t shared_hctx_restart; + /* blk-mq dispatch list and lock for shared queue depth case */ + struct list_head __mq_dispatch_list; + spinlock_t __mq_dispatch_lock; + unsigned int mq_dispatch_busy; + struct blk_queue_stats *stats; struct rq_wb *rq_wb; -- 2.9.4