First, there's this race condition, caused by the following code:

	if (conf->quiesce == 0) { ... }
	if (smp_load_acquire(&conf->quiesce) != 0) { ... }

cpu-0 [raid5_quiesce()]        | cpu-1 [read_one_chunk()]
----------------------------------------------------------
conf->quiesce = 2              |
----------------------------------------------------------
                               | if (conf->quiesce == 0)
                               |         /* false */
----------------------------------------------------------
conf->quiesce = 1              |
----------------------------------------------------------
conf->quiesce = 0              |
/* resuming from quiesce. */   |
----------------------------------------------------------
                               | if (smp_load_acquire(&conf->quiesce) != 0)
                               |         /* false */
----------------------------------------------------------

So read_one_chunk() skips both conditions and never increments
active_aligned_reads, which would result in an error down the road
(endio/retry/next-quiesce/etc.). It's really unlikely, but it is possible.

So I fixed it by changing:
-	if (smp_load_acquire(&conf->quiesce) != 0) {
+	if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {

Now, with this fix in place, there's still the possibility that
active_aligned_reads would be > 0 while quiesce == 1 (before
read_one_chunk() reaches the second condition). I can't find a bug caused
by that, but IMO it's quite counter-intuitive given the
wait_event(active_aligned_reads == 0) in raid5_quiesce().
Should I add a comment for that point?

I'm adding the new code below for reference; please let me know what you
think about that point before I re-submit the patch.

thanks,
Gal Ofri

--

@@ -5396,6 +5396,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
+	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
 		pr_debug("%s: non aligned\n", __func__);
@@ -5443,11 +5444,23 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
-	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+	did_inc = false;
+	if (conf->quiesce == 0) {
+		atomic_inc(&conf->active_aligned_reads);
+		did_inc = true;
+	}
+	/* need a memory barrier to detect the race with raid5_quiesce() */
+	if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
+		/* quiesce is in progress, so we need to undo io activation and wait
+		 * for it to finish */
+		if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
+			wake_up(&conf->wait_for_quiescent);
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
+				    conf->device_lock);
+		atomic_inc(&conf->active_aligned_reads);
+		spin_unlock_irq(&conf->device_lock);
+	}
 
 	if (mddev->gendisk)
 		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -8334,7 +8347,9 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		 * active stripes can drain
 		 */
 		r5c_flush_cache(conf, INT_MAX);
-		conf->quiesce = 2;
+		/* need a memory barrier to make sure read_one_chunk() sees
+		 * quiesce started and reverts to slow (locked) path. */
+		smp_store_release(&conf->quiesce, 2);
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
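
(For reference only: below is a minimal userspace sketch of the same
fast-path/slow-path idea, using C11 atomics as a stand-in for the kernel's
smp_load_acquire()/smp_store_release(). All names in it, such as demo_conf,
demo_read_fast_path and demo_start_quiesce, are invented for illustration;
it is not the md/raid5 code, and the busy-wait stands in for the real
waitqueue. It mirrors the structure of the patch, not a proof of the
memory-ordering argument.)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_conf {
	atomic_int quiesce;      /* 0 = running, nonzero = quiescing */
	atomic_int active_reads; /* stands in for active_aligned_reads */
};

static bool demo_read_fast_path(struct demo_conf *conf)
{
	bool did_inc = false;

	/* optimistic fast path: account the read if no quiesce is visible */
	if (atomic_load_explicit(&conf->quiesce, memory_order_relaxed) == 0) {
		atomic_fetch_add(&conf->active_reads, 1);
		did_inc = true;
	}

	/* re-check with acquire; pairs with the release store in
	 * demo_start_quiesce() */
	if (!did_inc ||
	    atomic_load_explicit(&conf->quiesce, memory_order_acquire) != 0) {
		/* quiesce may be in progress: undo the increment and let the
		 * caller fall back to the locked slow path */
		if (did_inc)
			atomic_fetch_sub(&conf->active_reads, 1);
		return false;
	}
	return true; /* fast path taken, read is accounted for */
}

static void demo_start_quiesce(struct demo_conf *conf)
{
	/* release store; pairs with the acquire load in the reader */
	atomic_store_explicit(&conf->quiesce, 2, memory_order_release);

	/* wait for in-flight fast-path readers to drain (the kernel sleeps
	 * on a waitqueue here instead of spinning) */
	while (atomic_load(&conf->active_reads) != 0)
		;
}

int main(void)
{
	struct demo_conf conf = { 0, 0 };

	printf("fast path before quiesce: %d\n", demo_read_fast_path(&conf));
	atomic_fetch_sub(&conf.active_reads, 1);  /* simulate read completion */

	demo_start_quiesce(&conf);
	printf("fast path during quiesce: %d\n", demo_read_fast_path(&conf));
	return 0;
}

Compiled with something like "gcc -std=c11", it prints that the fast path
succeeds before quiesce and backs out once quiesce has been published.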