Before a CPU goes offline, check whether it is the last online CPU of the
hctx. If so, mark this hctx as inactive and wait for completion of all
in-flight IOs originating from this hctx. Also check in
blk_mq_get_driver_tag() whether this hctx has become inactive; if so,
release the allocated tag.

This guarantees that there is no in-flight IO before shutting down the
managed IRQ line when all CPUs of this IRQ line are offline.

Cc: John Garry <john.garry@xxxxxxxxxx>
Cc: Bart Van Assche <bvanassche@xxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Tested-by: John Garry <john.garry@xxxxxxxxxx>
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
 block/blk-mq-debugfs.c |   1 +
 block/blk-mq.c         | 124 +++++++++++++++++++++++++++++++++++++----
 include/linux/blk-mq.h |   3 +
 3 files changed, 117 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ddec58743e88..dc66cb689d2f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 54c107be7a47..4a2250ac4fbb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1038,11 +1038,36 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
 	return true;
 }
 
-static bool blk_mq_get_driver_tag(struct request *rq)
+static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
 {
 	if (rq->tag != -1)
 		return true;
-	return __blk_mq_get_driver_tag(rq);
+
+	if (!__blk_mq_get_driver_tag(rq))
+		return false;
+	/*
+	 * In case the process doing direct issue has been migrated to a CPU
+	 * that may not belong to this hctx, add a memory barrier so we can
+	 * order the driver tag assignment and the check of BLK_MQ_S_INACTIVE.
+	 * Otherwise, barrier() is enough because both the BLK_MQ_S_INACTIVE
+	 * setting and the driver tag assignment run on the same CPU:
+	 * BLK_MQ_S_INACTIVE is only set after the last CPU of this hctx has
+	 * gone offline.
+	 *
+	 * Process migration may happen after the check on the current
+	 * processor id; an smp_mb() is implied by processor migration, so
+	 * there is no need to worry about it.
+	 */
+	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
+		smp_mb();
+	else
+		barrier();
+
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {
+		blk_mq_put_driver_tag(rq);
+		return false;
+	}
+	return true;
 }
 
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1091,7 +1116,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 		 * Don't clear RESTART here, someone else could have set it.
 		 * At most this will cost an extra queue run.
 		 */
-		return blk_mq_get_driver_tag(rq);
+		return blk_mq_get_driver_tag(rq, false);
 	}
 
 	wait = &hctx->dispatch_wait;
@@ -1117,7 +1142,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	 * allocation failure and adding the hardware queue to the wait
 	 * queue.
 	 */
-	ret = blk_mq_get_driver_tag(rq);
+	ret = blk_mq_get_driver_tag(rq, false);
 	if (!ret) {
 		spin_unlock(&hctx->dispatch_wait_lock);
 		spin_unlock_irq(&wq->lock);
@@ -1218,7 +1243,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			break;
 		}
 
-		if (!blk_mq_get_driver_tag(rq)) {
+		if (!blk_mq_get_driver_tag(rq, false)) {
 			/*
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed. The
@@ -1250,7 +1275,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			bd.last = true;
 		else {
 			nxt = list_first_entry(list, struct request, queuelist);
-			bd.last = !blk_mq_get_driver_tag(nxt);
+			bd.last = !blk_mq_get_driver_tag(nxt, false);
 		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1864,7 +1889,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (!blk_mq_get_dispatch_budget(hctx))
 		goto insert;
 
-	if (!blk_mq_get_driver_tag(rq)) {
+	if (!blk_mq_get_driver_tag(rq, true)) {
 		blk_mq_put_dispatch_budget(hctx);
 		goto insert;
 	}
@@ -2273,13 +2298,87 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
-static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+struct count_inflight_data {
+	unsigned count;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+		bool reserved)
 {
-	return 0;
+	struct count_inflight_data *count_data = data;
+
+	/*
+	 * We can't check rq's state because it is updated to MQ_RQ_IN_FLIGHT
+	 * in blk_mq_start_request(), and by then this rq can no longer be
+	 * prevented from being issued.
+	 *
+	 * So check whether a driver tag is assigned; if yes, count this rq
+	 * as inflight.
+	 */
+	if (rq->tag >= 0 && rq->mq_hctx == count_data->hctx)
+		count_data->count++;
+
+	return true;
+}
+
+static bool blk_mq_inflight_rq(struct request *rq, void *data,
+		bool reserved)
+{
+	return rq->tag >= 0;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct count_inflight_data count_data = {
+		.hctx = hctx,
+	};
+
+	blk_mq_all_tag_busy_iter(hctx->tags, blk_mq_count_inflight_rq,
+			blk_mq_inflight_rq, &count_data);
+	return count_data.count;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+		struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
 }
 
 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
 {
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
+	if (!blk_mq_last_cpu_in_hctx(cpu, hctx))
+		return 0;
+
+	/*
+	 * Order setting BLK_MQ_S_INACTIVE against checking rq->tag and
+	 * rqs[tag] in blk_mq_tags_inflight_rqs(). This pairs with the
+	 * smp_mb() in blk_mq_get_driver_tag().
+	 */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+	smp_mb__after_atomic();
+	while (blk_mq_tags_inflight_rqs(hctx))
+		msleep(5);
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
 	return 0;
 }
 
@@ -2290,12 +2389,15 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_dead);
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 	enum hctx_type type;
 
-	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3763207d88eb..77bf861d72ec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -403,6 +403,9 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
-- 
2.25.2
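
The drain above is only safe because of the barrier pairing called out in the
two comments in the patch: the offline path stores BLK_MQ_S_INACTIVE and then
reads the tags, while the submission path stores the tag and then reads
BLK_MQ_S_INACTIVE, with a full barrier between the store and the load on each
side. The following stand-alone sketch illustrates that store-buffering
argument with C11 atomics; all names in it (tag_assigned, hctx_inactive,
submit_path, offline_path) are invented for the illustration, and it is not
part of the patch.

/*
 * Stand-alone illustration (not kernel code) of the store-buffering
 * pattern behind the smp_mb()/smp_mb__after_atomic() pairing above.
 * All names here are made up for the sketch.
 *
 * Build: gcc -O2 -pthread sb_sketch.c -o sb_sketch
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool tag_assigned;	/* stands in for "rq->tag >= 0" */
static atomic_bool hctx_inactive;	/* stands in for BLK_MQ_S_INACTIVE */

static bool submit_saw_inactive;	/* what blk_mq_get_driver_tag() saw */
static bool offline_saw_tag;		/* what the drain loop saw */

/* Mirrors blk_mq_get_driver_tag(): assign the tag, full barrier, check state. */
static void *submit_path(void *arg)
{
	atomic_store_explicit(&tag_assigned, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
	submit_saw_inactive =
		atomic_load_explicit(&hctx_inactive, memory_order_relaxed);
	return NULL;
}

/* Mirrors blk_mq_hctx_notify_offline(): set INACTIVE, barrier, count tags. */
static void *offline_path(void *arg)
{
	atomic_store_explicit(&hctx_inactive, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	offline_saw_tag =
		atomic_load_explicit(&tag_assigned, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 10000; i++) {
		pthread_t a, b;

		atomic_store(&tag_assigned, false);
		atomic_store(&hctx_inactive, false);

		pthread_create(&a, NULL, submit_path, NULL);
		pthread_create(&b, NULL, offline_path, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);

		/*
		 * Forbidden outcome: the drain sees no tag (believes nothing
		 * is in flight) while the submitter sees the hctx as still
		 * active (goes on to issue the IO). The two fences make this
		 * combination impossible.
		 */
		assert(offline_saw_tag || submit_saw_inactive);
	}
	printf("no lost in-flight IO observed\n");
	return 0;
}

With both fences in place the assert can never fire: at least one side must
observe the other's store, which is exactly why the drain loop cannot miss a
request that will still be issued after the hctx is marked inactive.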