Re: [PATCH v2 08/11] md: add atomic mode switching in RAID 1/10

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

在 2024/04/18 13:44, tada keisuke 写道:
This patch depends on patch 07.

All rdevs running in RAID 1/10 switch nr_pending to atomic mode.
The value of nr_pending is read in a normal operation (choose_best_rdev()).
Therefore, nr_pending must always be consistent.

Signed-off-by: Keisuke TADA <keisuke1.tada@xxxxxxxxxx>
Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@xxxxxxxxxx>
---
  drivers/md/md.h     | 14 ++++++++++++++
  drivers/md/raid1.c  |  7 +++++++
  drivers/md/raid10.c |  4 ++++
  3 files changed, 25 insertions(+)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index ab09e312c9bb..57b09b567ffa 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -236,6 +236,20 @@ static inline unsigned long nr_pending_read(struct md_rdev *rdev)
  	return atomic_long_read(&rdev->nr_pending.data->count);
  }
+static inline bool nr_pending_is_percpu_mode(struct md_rdev *rdev)
+{
+	unsigned long __percpu *percpu_count;
+
+	return __ref_is_percpu(&rdev->nr_pending, &percpu_count);
+}
+
+static inline bool nr_pending_is_atomic_mode(struct md_rdev *rdev)
+{
+	unsigned long __percpu *percpu_count;
+
+	return !__ref_is_percpu(&rdev->nr_pending, &percpu_count);
+}
+
  static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
  			      sector_t *first_bad, int *bad_sectors)
  {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 12318fb15a88..c38ae13aadab 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
  		if (ctl.readable_disks++ == 1)
  			set_bit(R1BIO_FailFast, &r1_bio->state);
+ WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
  		pending = nr_pending_read(rdev);
  		dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
@@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
  			if (err)
  				return err;
+ percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
  			raid1_add_conf(conf, rdev, mirror, false);
  			/* As all devices are equivalent, we don't need a full recovery
  			 * if this was recently any drive of the array
@@ -1949,6 +1951,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
  		set_bit(Replacement, &rdev->flags);
  		raid1_add_conf(conf, rdev, repl_slot, true);
  		err = 0;
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);

I don't understand the point here. 'nr_pending' is used while the rdev
is issuing IO, and it is always used in atomic mode, so there is no
difference.

Considering that 'nr_pending' must be read in the IO fast path, using it
in atomic mode is something we must accept. Unless someone comes up with
a plan to avoid reading the 'inflight' counter in the fast path, like the
generic block layer does, I'm not OK with switching to percpu_ref for now.

+CC Paul

Hi Paul, perhaps your RR mode doesn't need such an 'inflight' counter
anymore?

Thanks,
Kuai

  		conf->fullsync = 1;
  	}
@@ -3208,6 +3211,7 @@ static void raid1_free(struct mddev *mddev, void *priv);
  static int raid1_run(struct mddev *mddev)
  {
  	struct r1conf *conf;
+	struct md_rdev *rdev;
  	int i;
  	int ret;
@@ -3269,6 +3273,9 @@ static int raid1_run(struct mddev *mddev)
  	/*
  	 * Ok, everything is just fine now
  	 */
+	rdev_for_each(rdev, mddev) {
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
+	}
  	rcu_assign_pointer(mddev->thread, conf->thread);
  	rcu_assign_pointer(conf->thread, NULL);
  	mddev->private = conf;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b91dd6c0be5a..66896a1076e1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -808,6 +808,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
nonrot = bdev_nonrot(rdev->bdev);
  		has_nonrot_disk |= nonrot;
+		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
  		pending = nr_pending_read(rdev);
  		if (min_pending > pending && nonrot) {
  			min_pending = pending;
@@ -2113,6 +2114,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
  		p->recovery_disabled = mddev->recovery_disabled - 1;
  		rdev->raid_disk = mirror;
  		err = 0;
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
  		if (rdev->saved_raid_disk != mirror)
  			conf->fullsync = 1;
  		WRITE_ONCE(p->rdev, rdev);
@@ -2127,6 +2129,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
  		err = mddev_stack_new_rdev(mddev, rdev);
  		if (err)
  			return err;
+		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
  		conf->fullsync = 1;
  		WRITE_ONCE(p->replacement, rdev);
  	}
@@ -4028,6 +4031,7 @@ static int raid10_run(struct mddev *mddev)
  	rdev_for_each(rdev, mddev) {
  		long long diff;
+ percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
  		disk_idx = rdev->raid_disk;
  		if (disk_idx < 0)
  			continue;






[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux