SCSI uses one global atomic variable to track queue depth for each LUN/request queue. This way doesn't scale well when there is lots of CPU cores and the disk is very fast. It has been observed that IOPS is affected a lot by tracking queue depth via sdev->device_busy in IO path. Return budget token from .get_budget callback, and the budget token can be passed to driver, so that we can replace the atomic variable with sbitmap_queue, then the scale issue can be fixed. Cc: Omar Sandoval <osandov@xxxxxx> Cc: Sathya Prakash <sathya.prakash@xxxxxxxxxxxx> Cc: Chaitra P B <chaitra.basappa@xxxxxxxxxxxx> Cc: Suganath Prabu Subramani <suganath-prabu.subramani@xxxxxxxxxxxx> Cc: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx> Cc: Sumit Saxena <sumit.saxena@xxxxxxxxxxxx> Cc: Shivasharan S <shivasharan.srikanteshwara@xxxxxxxxxxxx> Cc: Ewan D. Milne <emilne@xxxxxxxxxx> Cc: Hannes Reinecke <hare@xxxxxxx> Cc: Bart Van Assche <bart.vanassche@xxxxxxx> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> --- block/blk-mq-sched.c | 20 ++++++++++++-------- block/blk-mq.c | 18 ++++++++++++------ block/blk-mq.h | 11 ++++++----- drivers/scsi/scsi_lib.c | 10 +++++----- include/linux/blk-mq.h | 4 ++-- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ca22afd47b3d..38dee68ba04a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -90,6 +90,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); + int budget_token; do { struct request *rq; @@ -97,12 +98,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; - if (!blk_mq_get_dispatch_budget(hctx)) + budget_token = blk_mq_get_dispatch_budget(hctx); + if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(hctx, budget_token); break; } @@ -112,7 +114,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, budget_token)); } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -136,6 +138,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + int budget_token; do { struct request *rq; @@ -143,12 +146,13 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - if (!blk_mq_get_dispatch_budget(hctx)) + budget_token = blk_mq_get_dispatch_budget(hctx); + if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(hctx, budget_token); break; } @@ -162,7 +166,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, budget_token)); WRITE_ONCE(hctx->dispatch_from, ctx); } @@ -206,7 +210,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (blk_mq_dispatch_rq_list(q, &rq_list, -1)) { if (has_sched_dispatch) blk_mq_do_dispatch_sched(hctx); else @@ -219,7 +223,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list, false); + blk_mq_dispatch_rq_list(q, &rq_list, -1); } } diff --git a/block/blk-mq.c b/block/blk-mq.c index 76a5e919c336..43ae2b973d99 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1182,13 +1182,14 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + int budget_token) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; bool no_tag = false; int errors, queued; blk_status_t ret = BLK_STS_OK; + bool got_budget = budget_token >= 0; if (list_empty(list)) return false; @@ -1205,8 +1206,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); hctx = rq->mq_hctx; - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) - break; + if (!got_budget) { + budget_token = blk_mq_get_dispatch_budget(hctx); + if (budget_token < 0) + break; + } if (!blk_mq_get_driver_tag(rq)) { /* @@ -1217,7 +1221,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(hctx, budget_token); /* * For non-shared tags, the RESTART check * will suffice. @@ -1819,6 +1823,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = rq->q; bool run_queue = true; + int budget_token; /* * RCU or SRCU read lock is needed before checking quiesced flag. @@ -1836,11 +1841,12 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (q->elevator && !bypass_insert) goto insert; - if (!blk_mq_get_dispatch_budget(hctx)) + budget_token = blk_mq_get_dispatch_budget(hctx); + if (budget_token < 0) goto insert; if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(hctx, budget_token); goto insert; } diff --git a/block/blk-mq.h b/block/blk-mq.h index eaaca8fc1c28..1c82d89791c0 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -40,7 +40,7 @@ struct blk_mq_ctx { void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, int); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); @@ -179,21 +179,22 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); -static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx, + int budget_token) { struct request_queue *q = hctx->queue; if (q->mq_ops->put_budget) - q->mq_ops->put_budget(hctx); + q->mq_ops->put_budget(hctx, budget_token); } -static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline int blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; if (q->mq_ops->get_budget) return q->mq_ops->get_budget(hctx); - return true; + return 0; } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 610ee41fa54c..c8eb555b3ce8 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1619,7 +1619,7 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) clear_bit(SCMD_STATE_COMPLETE, &cmd->state); } -static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) +static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx, int budget_token) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; @@ -1627,17 +1627,17 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) atomic_dec(&sdev->device_busy); } -static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static int scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; if (scsi_dev_queue_ready(q, sdev)) - return true; + return 0; if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev)) blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); - return false; + return -1; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -1701,7 +1701,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: - scsi_mq_put_budget(hctx); + scsi_mq_put_budget(hctx, 0); switch (ret) { case BLK_STS_OK: break; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 11cfd6470b1a..6fd8d7cfe158 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -265,8 +265,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *); -typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); -typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); +typedef int (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *, int); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); -- 2.20.1