[PATCH] md/raid1: fix a race between removing rdev and access conf->mirrors[i].rdev

Yufen Yu <yuyufen@xxxxxxxxxx> · Fri, 26 Jul 2019 14:00:51 +0800

We get a NULL pointer dereference oops when test raid1 as follow:

mdadm -CR /dev/md1 -l 1 -n 2 /dev/sd[ab]

mdadm /dev/md1 -f /dev/sda
mdadm /dev/md1 -r /dev/sda
mdadm /dev/md1 -a /dev/sda
sleep 5
mdadm /dev/md1 -f /dev/sdb
mdadm /dev/md1 -r /dev/sdb
mdadm /dev/md1 -a /dev/sdb

After a disk(/dev/sda) has been removed, we add the disk to raid
array again, which would trigger recovery action. Since the rdev
current state is 'spare', read/write bio can be issued to the disk.

Then we set the other disk (/dev/sdb) faulty. Since the raid array
is now in degraded state and /dev/sdb is the only 'In_sync' disk,
raid1_error() will return but without set faulty success.

However, that can interrupt the recovery action and
md_check_recovery will try to call remove_and_add_spares() to
remove the spare disk. And the race condition between
remove_and_add_spares() and raid1_write_request() in follow
can cause NULL pointer dereference for conf->mirrors[i].rdev:

raid1_write_request()   md_check_recovery    raid1_error()
rcu_read_lock()
rdev != NULL
!test_bit(Faulty, &rdev->flags)

                                           conf->recovery_disabled=
                                             mddev->recovery_disabled;
                                            return busy

                        remove_and_add_spares
                        raid1_remove_disk
                        rdev->nr_pending == 0

atomic_inc(&rdev->nr_pending);
rcu_read_unlock()

                        p->rdev=NULL

conf->mirrors[i].rdev->data_offset
NULL pointer deref!!!

                        if (!test_bit(RemoveSynchronized,
                          &rdev->flags))
                         synchronize_rcu();
                         p->rdev=rdev

To fix the race condition, we add a new flag 'WantRemove' for rdev,
which means rdev will be removed from the raid array.
Before access conf->mirrors[i].rdev, we need to ensure the rdev
without 'WantRemove' bit.

Reported-by: Zou Wei <zou_wei@xxxxxxxxxx>
Signed-off-by: Yufen Yu <yuyufen@xxxxxxxxxx>
---
 drivers/md/md.h    |  4 ++++
 drivers/md/raid1.c | 28 ++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 10f98200e2f8..ebab9340af11 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -205,6 +205,10 @@ enum flag_bits {
 				 * multiqueue device should check if there
 				 * is collision between write behind bios.
 				 */
+	WantRemove,		/* Before set conf->mirrors[i] as NULL,
+				 * we set the bit first, avoiding access the
+				 * conf->mirrors[i] after it set NULL.
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34e26834ad28..838e619cd2d8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -622,7 +622,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || test_bit(Faulty, &rdev->flags))
+		    || test_bit(Faulty, &rdev->flags)
+			|| test_bit(WantRemove, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
 		    rdev->recovery_offset < this_sector + sectors)
@@ -751,7 +752,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
-		if (!rdev)
+		if (!rdev || test_bit(Faulty, &rdev->flags)
+				|| test_bit(WantRemove, &rdev->flags))
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
 		sectors = best_good_sectors;
@@ -1389,7 +1391,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			break;
 		}
 		r1_bio->bios[i] = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+		if (!rdev || test_bit(Faulty, &rdev->flags)
+				|| test_bit(WantRemove, &rdev->flags)) {
 			if (i < conf->raid_disks)
 				set_bit(R1BIO_Degraded, &r1_bio->state);
 			continue;
@@ -1772,6 +1775,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
+			clear_bit(WantRemove, &rdev->flags);
 			err = 0;
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
@@ -1786,6 +1790,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			/* Add this device as a replacement */
 			clear_bit(In_sync, &rdev->flags);
 			set_bit(Replacement, &rdev->flags);
+			clear_bit(WantRemove, &rdev->flags);
 			rdev->raid_disk = mirror;
 			err = 0;
 			conf->fullsync = 1;
@@ -1825,16 +1830,26 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			err = -EBUSY;
 			goto abort;
 		}
-		p->rdev = NULL;
+
+		/*
+		 * Before set p->rdev = NULL, we set WantRemove bit avoiding
+		 * race between rdev remove and issue bio, which can cause
+		 * NULL pointer deference of rdev by conf->mirrors[i].rdev.
+		 */
+		set_bit(WantRemove, &rdev->flags);
+
 		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
 			synchronize_rcu();
 			if (atomic_read(&rdev->nr_pending)) {
 				/* lost the race, try later */
 				err = -EBUSY;
-				p->rdev = rdev;
+				clear_bit(WantRemove, &rdev->flags);
 				goto abort;
 			}
 		}
+
+		p->rdev = NULL;
+
 		if (conf->mirrors[conf->raid_disks + number].rdev) {
 			/* We just removed a device that is being replaced.
 			 * Move down the replacement.  We drain all IO before
@@ -2732,7 +2747,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
-		    test_bit(Faulty, &rdev->flags)) {
+		    test_bit(Faulty, &rdev->flags) ||
+			test_bit(WantRemove, &rdev->flags)) {
 			if (i < conf->raid_disks)
 				still_degraded = 1;
 		} else if (!test_bit(In_sync, &rdev->flags)) {
-- 
2.16.2.dirty