Re: [PATCH V8 07/11] blk-mq: stop to handle IO and drain IO before hctx becomes inactive

Hannes Reinecke <hare@xxxxxxx> · Fri, 24 Apr 2020 15:27:45 +0200

On 4/24/20 12:23 PM, Ming Lei wrote:
Before a CPU goes offline, check whether it is the last online CPU of the
hctx. If so, mark this hctx as inactive and wait for completion of all
in-flight IOs originating from this hctx. Also check in
blk_mq_get_driver_tag() whether this hctx has become inactive; if so,
release the allocated tag.

This guarantees that there is no in-flight IO before shutting down
the managed IRQ line when all CPUs of this IRQ line are offline.

Cc: John Garry <john.garry@xxxxxxxxxx>
Cc: Bart Van Assche <bvanassche@xxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
  block/blk-mq-debugfs.c |   1 +
  block/blk-mq.c         | 124 +++++++++++++++++++++++++++++++++++++----
  include/linux/blk-mq.h |   3 +
  3 files changed, 117 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 8e745826eb86..b62390918ca5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
  	HCTX_STATE_NAME(STOPPED),
  	HCTX_STATE_NAME(TAG_ACTIVE),
  	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
  };
  #undef HCTX_STATE_NAME
  
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d432cc74ef78..4d0c271d9f6f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1050,11 +1050,31 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
  	return true;
  }
  
-static bool blk_mq_get_driver_tag(struct request *rq)
+static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
  {
  	if (rq->tag != -1)
  		return true;
-	return __blk_mq_get_driver_tag(rq);
+
+	if (!__blk_mq_get_driver_tag(rq))
+		return false;
+	/*
+	 * Add one memory barrier in case that direct issue IO process is
+	 * migrated to other CPU which may not belong to this hctx, so we can
+	 * order driver tag assignment and checking BLK_MQ_S_INACTIVE.
+	 * Otherwise, barrier() is enough given both setting BLK_MQ_S_INACTIVE
+	 * and driver tag assignment are run on the same CPU in case that
+	 * BLK_MQ_S_INACTIVE is set.
+	 */
+	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
+		smp_mb();
+	else
+		barrier();
+
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {
+		blk_mq_put_driver_tag(rq);
+		return false;
+	}
+	return true;
  }
  
  static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1103,7 +1123,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
  		 * Don't clear RESTART here, someone else could have set it.
  		 * At most this will cost an extra queue run.
  		 */
-		return blk_mq_get_driver_tag(rq);
+		return blk_mq_get_driver_tag(rq, false);
  	}
  
  	wait = &hctx->dispatch_wait;
@@ -1129,7 +1149,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
  	 * allocation failure and adding the hardware queue to the wait
  	 * queue.
  	 */
-	ret = blk_mq_get_driver_tag(rq);
+	ret = blk_mq_get_driver_tag(rq, false);
  	if (!ret) {
  		spin_unlock(&hctx->dispatch_wait_lock);
  		spin_unlock_irq(&wq->lock);
@@ -1228,7 +1248,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  			break;
  		}
  
-		if (!blk_mq_get_driver_tag(rq)) {
+		if (!blk_mq_get_driver_tag(rq, false)) {
  			/*
  			 * The initial allocation attempt failed, so we need to
  			 * rerun the hardware queue when a tag is freed. The
@@ -1260,7 +1280,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  			bd.last = true;
  		else {
  			nxt = list_first_entry(list, struct request, queuelist);
-			bd.last = !blk_mq_get_driver_tag(nxt);
+			bd.last = !blk_mq_get_driver_tag(nxt, false);
  		}
  
  		ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1853,7 +1873,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
  	if (!blk_mq_get_dispatch_budget(hctx))
  		goto insert;
  
-	if (!blk_mq_get_driver_tag(rq)) {
+	if (!blk_mq_get_driver_tag(rq, true)) {
  		blk_mq_put_dispatch_budget(hctx);
  		goto insert;
  	}
@@ -2261,13 +2281,92 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
  	return -ENOMEM;
  }
  
-static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+struct count_inflight_data {
+	unsigned count;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+				     bool reserved)
  {
-	return 0;
+	struct count_inflight_data *count_data = data;
+
+	/*
+	 * Can't check rq's state because it is updated to MQ_RQ_IN_FLIGHT
+	 * in blk_mq_start_request(), at that time we can't prevent this rq
+	 * from being issued.
+	 *
+	 * So check if driver tag is assigned, if yes, count this rq as
+	 * inflight.
+	 */
+	if (rq->tag >= 0 && rq->mq_hctx == count_data->hctx)
+		count_data->count++;
+
+	return true;
+}
+
+static bool blk_mq_inflight_rq(struct request *rq, void *data,
+			       bool reserved)
+{
+	return rq->tag >= 0;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct count_inflight_data count_data = {
+		.count	= 0,
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_busy_iter(hctx->tags, blk_mq_count_inflight_rq,
+			blk_mq_inflight_rq, &count_data);
+
+	return count_data.count;
+}
+

Remind me again: Why do we need the 'filter' function here?
Can't we just move the filter function into the main iterator and
stay with the original implementation?

+static void blk_mq_hctx_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	while (1) {
+		if (!blk_mq_tags_inflight_rqs(hctx))
+			break;
+		msleep(5);
+	}
  }
  
  static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
  {
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
+	if ((cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) ||
+	    (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids))
+		return 0;
+
+	/*
+	 * The current CPU is the last one in this hctx, S_INACTIVE
+	 * can be observed in dispatch path without any barrier needed,
+	 * cause both are run on one same CPU.
+	 */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+	/*
+	 * Order setting BLK_MQ_S_INACTIVE and checking rq->tag & rqs[tag],
+	 * and its pair is the smp_mb() in blk_mq_get_driver_tag
+	 */
+	smp_mb__after_atomic();
+	blk_mq_hctx_drain_inflight_rqs(hctx);
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
  	return 0;
  }
  
@@ -2278,12 +2377,15 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
   */
  static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
  {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_dead);
  	struct blk_mq_ctx *ctx;
  	LIST_HEAD(tmp);
  	enum hctx_type type;
  
-	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
  	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
  	type = hctx->type;
  
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f550b5274b8b..b4812c455807 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -403,6 +403,9 @@ enum {
  	BLK_MQ_S_TAG_ACTIVE	= 1,
  	BLK_MQ_S_SCHED_RESTART	= 2,
  
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
  	BLK_MQ_MAX_DEPTH	= 10240,
  
  	BLK_MQ_CPU_WORK_BATCH	= 8,

Otherwise this looks good, and is exactly what we need.
Thanks for doing the work!

Cheers,

Hannes
--
Dr. Hannes Reinecke            Teamlead Storage & Networking
hare@xxxxxxx                               +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer