First, there's this race condition, caused by the following code:

	if (conf->quiesce == 0) { ... }
	if (smp_load_acquire(&conf->quiesce) != 0) { ... }

cpu-0 [raid5_quiesce()]        | cpu-1 [read_one_chunk()]
----------------------------------------------------------
conf->quiesce = 2              |
----------------------------------------------------------
                               | if (conf->quiesce == 0)
                               |         /* false */
----------------------------------------------------------
conf->quiesce = 1              |
----------------------------------------------------------
conf->quiesce = 0              |
/* resuming from quiesce. */   |
----------------------------------------------------------
                               | if (smp_load_acquire(&conf->quiesce) != 0)
                               |         /* false */
----------------------------------------------------------

So read_one_chunk() skips both conditions and never increments
active_aligned_reads, which would result in an error down the road
(endio/retry/next-quiesce/etc.). It's really unlikely, but it is possible.

So I fixed it by changing:
-	if (smp_load_acquire(&conf->quiesce) != 0) {
+	if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {

Now, with this fix in place, there's still the possibility that
active_aligned_reads would be > 0 while quiesce == 1 (before
read_one_chunk() reaches the second condition). I can't find a bug caused
by that, but IMO it's quite counter-intuitive given the
wait_event(active_aligned_reads == 0) in raid5_quiesce().
Should I add a comment for that point?

I'm adding the new code below for reference; please let me know what you
think about that point before I re-submit the patch.

thanks,
Gal Ofri

--

@@ -5396,6 +5396,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
+	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
 		pr_debug("%s: non aligned\n", __func__);
@@ -5443,11 +5444,23 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
-	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+	did_inc = false;
+	if (conf->quiesce == 0) {
+		atomic_inc(&conf->active_aligned_reads);
+		did_inc = true;
+	}
+	/* need a memory barrier to detect the race with raid5_quiesce() */
+	if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
+		/* quiesce is in progress, so we need to undo io activation and wait
+		 * for it to finish */
+		if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
+			wake_up(&conf->wait_for_quiescent);
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
+				    conf->device_lock);
+		atomic_inc(&conf->active_aligned_reads);
+		spin_unlock_irq(&conf->device_lock);
+	}
 
 	if (mddev->gendisk)
 		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -8334,7 +8347,9 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		 * active stripes can drain
 		 */
 		r5c_flush_cache(conf, INT_MAX);
-		conf->quiesce = 2;
+		/* need a memory barrier to make sure read_one_chunk() sees
+		 * quiesce started and reverts to slow (locked) path. */
+		smp_store_release(&conf->quiesce, 2);
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
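
(For reference only: below is a minimal userspace sketch of the same
fast-path/slow-path idea, using C11 atomics as a stand-in for the kernel's
smp_load_acquire()/smp_store_release(). All names in it, such as demo_conf,
demo_read_fast_path and demo_start_quiesce, are invented for illustration;
it is not the md/raid5 code, and the busy-wait stands in for the real
waitqueue. It mirrors the structure of the patch, not a proof of the
memory-ordering argument.)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_conf {
	atomic_int quiesce;      /* 0 = running, nonzero = quiescing */
	atomic_int active_reads; /* stands in for active_aligned_reads */
};

static bool demo_read_fast_path(struct demo_conf *conf)
{
	bool did_inc = false;

	/* optimistic fast path: account the read if no quiesce is visible */
	if (atomic_load_explicit(&conf->quiesce, memory_order_relaxed) == 0) {
		atomic_fetch_add(&conf->active_reads, 1);
		did_inc = true;
	}

	/* re-check with acquire; pairs with the release store in
	 * demo_start_quiesce() */
	if (!did_inc ||
	    atomic_load_explicit(&conf->quiesce, memory_order_acquire) != 0) {
		/* quiesce may be in progress: undo the increment and let the
		 * caller fall back to the locked slow path */
		if (did_inc)
			atomic_fetch_sub(&conf->active_reads, 1);
		return false;
	}
	return true; /* fast path taken, read is accounted for */
}

static void demo_start_quiesce(struct demo_conf *conf)
{
	/* release store; pairs with the acquire load in the reader */
	atomic_store_explicit(&conf->quiesce, 2, memory_order_release);

	/* wait for in-flight fast-path readers to drain (the kernel sleeps
	 * on a waitqueue here instead of spinning) */
	while (atomic_load(&conf->active_reads) != 0)
		;
}

int main(void)
{
	struct demo_conf conf = { 0, 0 };

	printf("fast path before quiesce: %d\n", demo_read_fast_path(&conf));
	atomic_fetch_sub(&conf.active_reads, 1);  /* simulate read completion */

	demo_start_quiesce(&conf);
	printf("fast path during quiesce: %d\n", demo_read_fast_path(&conf));
	return 0;
}

Compiled with something like "gcc -std=c11", it prints that the fast path
succeeds before quiesce and backs out once quiesce has been published.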