Re: [PATCH v2 09/12] md/raid5: Keep a reference to last stripe_head for batch

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 4/21/22 3:54 AM, Logan Gunthorpe wrote:
When batching, every stripe head has to find the previous stripe head to
add to the batch list. This involves taking the hash lock which is
highly contended during IO.

Instead of finding the previous stripe_head each time, store a
reference to the previous stripe_head in a pointer so that it doesn't
require taking the contended lock another time.

The reference to the previous stripe must be released before scheduling
and waiting for work to get done. Otherwise, it can hold up
raid5_activate_delayed() and deadlock.

Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx>
---
  drivers/md/raid5.c | 51 +++++++++++++++++++++++++++++++++++-----------
  1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0c250cc3bfff..28ea7b9b6ab6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -843,7 +843,8 @@ static bool stripe_can_batch(struct stripe_head *sh)
  }
/* we only do back search */
-static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+static void stripe_add_to_batch_list(struct r5conf *conf,
+		struct stripe_head *sh, struct stripe_head *last_sh)

Nit: from stripe_add_to_batch_list's point of view, I think "head_sh" makes
more sense than "last_sh".

  {
  	struct stripe_head *head;
  	sector_t head_sector, tmp_sec;
@@ -856,15 +857,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
  		return;
  	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
-	hash = stripe_hash_locks_hash(conf, head_sector);
-	spin_lock_irq(conf->hash_locks + hash);
-	head = find_get_stripe(conf, head_sector, conf->generation, hash);
-	spin_unlock_irq(conf->hash_locks + hash);
-
-	if (!head)
-		return;
-	if (!stripe_can_batch(head))
-		goto out;
+	if (last_sh && head_sector == last_sh->sector) {
+		head = last_sh;
+		atomic_inc(&head->count);
+	} else {
+		hash = stripe_hash_locks_hash(conf, head_sector);
+		spin_lock_irq(conf->hash_locks + hash);
+		head = find_get_stripe(conf, head_sector, conf->generation,
+				       hash);
+		spin_unlock_irq(conf->hash_locks + hash);
+		if (!head)
+			return;
+		if (!stripe_can_batch(head))
+			goto out;
+	}
lock_two_stripes(head, sh);
  	/* clear_batch_ready clear the flag */
@@ -5800,6 +5806,7 @@ enum stripe_result {
struct stripe_request_ctx {
  	bool do_flush;
+	struct stripe_head *batch_last;
  };
static enum stripe_result make_stripe_request(struct mddev *mddev,
@@ -5889,8 +5896,13 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
  		return STRIPE_SCHEDULE_AND_RETRY;
  	}
-	if (stripe_can_batch(sh))
-		stripe_add_to_batch_list(conf, sh);
+	if (stripe_can_batch(sh)) {
+		stripe_add_to_batch_list(conf, sh, ctx->batch_last);
+		if (ctx->batch_last)
+			raid5_release_stripe(ctx->batch_last);
+		atomic_inc(&sh->count);
+		ctx->batch_last = sh;
+	}
if (ctx->do_flush) {
  		set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
@@ -5979,6 +5991,18 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
  		} else if (res == STRIPE_RETRY) {
  			continue;
  		} else if (res == STRIPE_SCHEDULE_AND_RETRY) {
+			/*
+			 * Must release the reference to batch_last before
+			 * scheduling and waiting for work to be done,
+			 * otherwise the batch_last stripe head could prevent
+			 * raid5_activate_delayed() from making progress
+			 * and thus deadlocking.
+			 */
+			if (ctx.batch_last) {
+				raid5_release_stripe(ctx.batch_last);
+				ctx.batch_last = NULL;
+			}
+
  			schedule();
  			prepare_to_wait(&conf->wait_for_overlap, &w,
  					TASK_UNINTERRUPTIBLE);
@@ -5990,6 +6014,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
  	finish_wait(&conf->wait_for_overlap, &w);

+	if (ctx.batch_last)
+		raid5_release_stripe(ctx.batch_last);
+
  	if (rw == WRITE)
  		md_write_end(mddev);
  	bio_endio(bi);

Otherwise looks good, Acked-by: Guoqing Jiang <guoqing.jiang@xxxxxxxxx>

Thanks,
Guoqing



[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux