From: Ming Lei <ming.lei@xxxxxxxxxx> commit 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b upstream Now restart is used in the following cases, and TAG_SHARED is for SCSI only. 1) .get_budget() returns BLK_STS_RESOURCE - if resource in target/host level isn't satisfied, this SCSI device will be added in shost->starved_list, and the whole queue will be rerun (via SCSI's built-in RESTART) in scsi_end_request() after any request initiated from this host/targe is completed. Forget to mention, host level resource can't be an issue for blk-mq at all. - the same is true if resource in the queue level isn't satisfied. - if there isn't outstanding request on this queue, then SCSI's RESTART can't work(blk-mq's can't work too), and the queue will be run after SCSI_QUEUE_DELAY, and finally all starved sdevs will be handled by SCSI's RESTART when this request is finished 2) scsi_dispatch_cmd() returns BLK_STS_RESOURCE - if there isn't onprogressing request on this queue, the queue will be run after SCSI_QUEUE_DELAY - otherwise, SCSI's RESTART covers the rerun. 3) blk_mq_get_driver_tag() failed - BLK_MQ_S_TAG_WAITING covers the cross-queue RESTART for driver allocation. In one word, SCSI's built-in RESTART is enough to cover the queue rerun, and we don't need to pay special attention to TAG_SHARED wrt. restart. In my test on scsi_debug(8 luns), this patch improves IOPS by 20% ~ 30% when running I/O on these 8 luns concurrently. Aslo Roman Pen reported the current RESTART is very expensive especialy when there are lots of LUNs attached in one host, such as in his test, RESTART causes half of IOPS be cut. Fixes: https://marc.info/?l=linux-kernel&m=150832216727524&w=2 Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared") Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> Signed-off-by: Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx> --- block/blk-mq-sched.c | 78 +++------------------------------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index df8581bb0a37..daab27feb653 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,25 +68,17 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; + return; - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return true; + return; } - - return false; } /* return true if hctx need to run again */ @@ -385,68 +377,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. - */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. - */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - /* * Add flush/fua to the queue. If we fail getting a driver tag, then * punt to the requeue list. Requeue will re-invoke us from a context -- 2.7.4