On Thu, 18 Apr 2024 14:39:53 +0800
Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:

> Hi,
> 
> On 2024/04/18 13:44, tada keisuke wrote:
> > This patch depends on patch 07.
> >
> > All rdevs running in RAID 1/10 switch nr_pending to atomic mode.
> > The value of nr_pending is read in a normal operation
> > (choose_best_rdev()). Therefore, nr_pending must always be
> > consistent.
> >
> > Signed-off-by: Keisuke TADA <keisuke1.tada@xxxxxxxxxx>
> > Signed-off-by: Toshifumi OHTAKE <toshifumi.ootake@xxxxxxxxxx>
> > ---
> >  drivers/md/md.h     | 14 ++++++++++++++
> >  drivers/md/raid1.c  |  7 +++++++
> >  drivers/md/raid10.c |  4 ++++
> >  3 files changed, 25 insertions(+)
> >
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index ab09e312c9bb..57b09b567ffa 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -236,6 +236,20 @@ static inline unsigned long nr_pending_read(struct md_rdev *rdev)
> >  	return atomic_long_read(&rdev->nr_pending.data->count);
> >  }
> >  
> > +static inline bool nr_pending_is_percpu_mode(struct md_rdev *rdev)
> > +{
> > +	unsigned long __percpu *percpu_count;
> > +
> > +	return __ref_is_percpu(&rdev->nr_pending, &percpu_count);
> > +}
> > +
> > +static inline bool nr_pending_is_atomic_mode(struct md_rdev *rdev)
> > +{
> > +	unsigned long __percpu *percpu_count;
> > +
> > +	return !__ref_is_percpu(&rdev->nr_pending, &percpu_count);
> > +}
> > +
> >  static inline int is_badblock(struct md_rdev *rdev, sector_t s,
> >  			      int sectors, sector_t *first_bad, int *bad_sectors)
> >  {
> > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > index 12318fb15a88..c38ae13aadab 100644
> > --- a/drivers/md/raid1.c
> > +++ b/drivers/md/raid1.c
> > @@ -784,6 +784,7 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
> >  		if (ctl.readable_disks++ == 1)
> >  			set_bit(R1BIO_FailFast, &r1_bio->state);
> >  
> > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> >  		pending = nr_pending_read(rdev);
> >  		dist = abs(r1_bio->sector -
> >  			   conf->mirrors[disk].head_position);
> > @@ -1930,6 +1931,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> >  			if (err)
> >  				return err;
> >  
> > +			percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> >  			raid1_add_conf(conf, rdev, mirror, false);
> >  			/* As all devices are equivalent, we don't need a full recovery
> >  			 * if this was recently any drive of the array
> > @@ -1949,6 +1951,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> >  		set_bit(Replacement, &rdev->flags);
> >  		raid1_add_conf(conf, rdev, repl_slot, true);
> >  		err = 0;
> > +		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> 
> I don't understand what's the point here: 'nr_pending' will be used
> when the rdev is issuing IO, and there it's always used in atomic
> mode, so there is no difference.
> 
> Consider that 'nr_pending' must be read from the IO fast path; using
> it as an atomic is something we must accept. Unless someone comes up
> with a plan to avoid reading the 'inflight' counter from the fast
> path, like the generic block layer does, it's not OK to me to switch
> to percpu_ref for now.
> 
> +CC Paul
> 
> Hi Paul, perhaps your RR mode doesn't need such an 'inflight'
> counter anymore?
> 

I too am struggling to see the benefit, but I am curious enough that I
will run some tests on my own, as I have fresh baseline RAID1
large-sweep performance data ready right now.

My WIP round-robin patch won't need nr_pending for SSDs, but I think it
will for HDDs. I also need a new atomic counter for round robin on SSDs
anyway, and I see no performance impact so far from adding it.
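To make sure we are all talking about the same consistency issue, here
is a minimal sketch as I understand it (hypothetical demo code, not
from the patch; it only uses the stock percpu_ref API from
include/linux/percpu-refcount.h):

#include <linux/percpu-refcount.h>
#include <linux/gfp.h>

static void demo_release(struct percpu_ref *ref)
{
	/* last reference dropped */
}

static int demo(void)
{
	struct percpu_ref ref;
	int err;

	/* starts in percpu mode (no PERCPU_REF_INIT_ATOMIC flag) */
	err = percpu_ref_init(&ref, demo_release, 0, GFP_KERNEL);
	if (err)
		return err;

	percpu_ref_get(&ref);	/* only bumps this CPU's counter */

	/*
	 * Here atomic_long_read(&ref.data->count) would NOT include the
	 * get() above: the per-CPU deltas have not been folded in yet,
	 * so a reader like nr_pending_read() could see a stale value.
	 */

	percpu_ref_switch_to_atomic_sync(&ref);

	/*
	 * Now the per-CPU counters have been summed into ref.data->count,
	 * and every later get()/put() hits that atomic directly -- which
	 * is what the patch relies on in choose_best_rdev()/read_balance().
	 */

	percpu_ref_put(&ref);	/* drops the get() above */
	percpu_ref_put(&ref);	/* drops the init ref; release fires */
	percpu_ref_exit(&ref);
	return 0;
}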
-Paul

> Thanks,
> Kuai
> 
> >  		conf->fullsync = 1;
> >  	}
> >  
> > @@ -3208,6 +3211,7 @@ static void raid1_free(struct mddev *mddev, void *priv);
> >  static int raid1_run(struct mddev *mddev)
> >  {
> >  	struct r1conf *conf;
> > +	struct md_rdev *rdev;
> >  	int i;
> >  	int ret;
> >  
> > @@ -3269,6 +3273,9 @@ static int raid1_run(struct mddev *mddev)
> >  	/*
> >  	 * Ok, everything is just fine now
> >  	 */
> > +	rdev_for_each(rdev, mddev) {
> > +		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> > +	}
> >  	rcu_assign_pointer(mddev->thread, conf->thread);
> >  	rcu_assign_pointer(conf->thread, NULL);
> >  	mddev->private = conf;
> > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > index b91dd6c0be5a..66896a1076e1 100644
> > --- a/drivers/md/raid10.c
> > +++ b/drivers/md/raid10.c
> > @@ -808,6 +808,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
> >  		nonrot = bdev_nonrot(rdev->bdev);
> >  		has_nonrot_disk |= nonrot;
> > +		WARN_ON_ONCE(nr_pending_is_percpu_mode(rdev));
> >  		pending = nr_pending_read(rdev);
> >  		if (min_pending > pending && nonrot) {
> >  			min_pending = pending;
> > @@ -2113,6 +2114,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> >  			p->recovery_disabled = mddev->recovery_disabled - 1;
> >  			rdev->raid_disk = mirror;
> >  			err = 0;
> > +			percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> >  			if (rdev->saved_raid_disk != mirror)
> >  				conf->fullsync = 1;
> >  			WRITE_ONCE(p->rdev, rdev);
> > @@ -2127,6 +2129,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> >  		err = mddev_stack_new_rdev(mddev, rdev);
> >  		if (err)
> >  			return err;
> > +		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> >  		conf->fullsync = 1;
> >  		WRITE_ONCE(p->replacement, rdev);
> >  	}
> > @@ -4028,6 +4031,7 @@ static int raid10_run(struct mddev *mddev)
> >  	rdev_for_each(rdev, mddev) {
> >  		long long diff;
> >  
> > +		percpu_ref_switch_to_atomic_sync(&rdev->nr_pending);
> >  		disk_idx = rdev->raid_disk;
> >  		if (disk_idx < 0)
> >  			continue;
> > 
> > 
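P.S. On Kuai's point about the generic block layer: as far as I
understand it, it keeps in-flight counts in per-CPU counters and only
sums them on the (slow) stats read side, never on the submit fast path.
A rough sketch of that pattern (hypothetical names, for illustration
only -- not the actual part_stat code):

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct inflight_counter {
	long __percpu *cnt;
};

static int inflight_init(struct inflight_counter *c)
{
	c->cnt = alloc_percpu(long);
	return c->cnt ? 0 : -ENOMEM;
}

static void inflight_exit(struct inflight_counter *c)
{
	free_percpu(c->cnt);
}

/* fast path: touches only this CPU's counter, no shared cacheline */
static inline void inflight_inc(struct inflight_counter *c)
{
	this_cpu_inc(*c->cnt);
}

static inline void inflight_dec(struct inflight_counter *c)
{
	this_cpu_dec(*c->cnt);
}

/* slow path: O(nr_cpus) sum -- fine for stats, too heavy per bio */
static long inflight_read(struct inflight_counter *c)
{
	long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(c->cnt, cpu);
	return sum;
}

The catch for raid1/raid10 is exactly what Kuai said: read balancing
wants inflight_read() on every read bio, so the cheap-write /
expensive-read split doesn't help unless the balancing algorithm stops
needing the counter.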