Hi,
On 2024/10/04 22:05, Paul Luse wrote:
While working on read_balance() earlier this year I realized that
for nonrot media the primary disk selection criterion for RAID1 reads
is to choose the disk with the fewest pending IOs. At the same time I
noticed that this rarely works out to a 50/50 split between the disks.
Initially I looked at a round-robin scheme, but that proved too complex
once concurrency was taken into account, which led to this patch.
Writes typically take longer than reads on nonrot media, so choosing
the disk with the fewest pending IOs without knowing the read/write mix
of that outstanding IO is not optimal.
This patch takes a very simple implementation approach to place more
weight on writes than on reads when selecting a disk to read from. Based
on empirical testing of NVMe and SATA drives from multiple vendors (some
data included below) I found that weighting writes by 1.25x, rounding
down, gave the best overall performance improvement. The boost varies by
drive, as do most drive-dependent performance optimizations: Kioxia
gets the least benefit while Samsung gets the most. I also confirmed
there is no impact on pure read cases (or pure writes, of course). I left
the total pending counter in place and simply added one specific to
reads; there was no reason to count reads and writes in fully separate
counters, especially given the additional complexity that would add to
the write path for tracking pending IO.
The fewer writes that are outstanding, the smaller the benefit from
this patch, so the math works out well. Below are the unweighted and
weighted values for outstanding writes; the first column is the
unweighted count and the second is the value used by this patch. Until
there are at least 4 writes pending there is no change, and the more
writes that are pending the more the value is weighted, which matches
how the drives behave. (A small sketch of the weighting math follows
the table.)
1 1
2 2
3 3
4 5
5 6
6 7
7 8
8 10
9 11
10 12
11 13
12 15
13 16
14 17
15 18
16 20
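
To make the table above concrete, here is an illustrative, untested
sketch of the 1.25x calculation as integer math; the helper name
weighted_pending() is just a placeholder for this example and is not
part of the patch, but the shift amount matches WRITE_WEIGHT below:

/* Not part of the patch: weight outstanding writes by ~1.25x. */
static inline unsigned int weighted_pending(unsigned int total_pending,
					    unsigned int reads_pending)
{
	unsigned int writes = total_pending - reads_pending;

	writes += writes >> 2;		/* writes * 1.25, rounded down */
	return reads_pending + writes;	/* e.g. 8 writes -> 10, 16 -> 20 */
}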
Below is performance data for the patch with three different NVMe
drives and one SATA drive.
WD SATA: https://photos.app.goo.gl/1smadpDEzgLaa5G48
WD NVMe: https://photos.app.goo.gl/YkTTcYfU8Yc8XWA58
Samsung NVMe: https://photos.app.goo.gl/F6MvEfmbGtRyPUFz6
Kioxia NVMe: https://photos.app.goo.gl/BAEhi8hUwsdTyj9y5
Signed-off-by: Paul Luse <paulluselinux@xxxxxxxxx>
---
drivers/md/md.h | 9 +++++++++
drivers/md/raid1.c | 34 +++++++++++++++++++++++++---------
2 files changed, 34 insertions(+), 9 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5d2e6bd58e4d..1a1040ec3c4a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -162,6 +162,9 @@ struct md_rdev {
*/
};
+ atomic_t nr_reads_pending; /* tracks only mirrored reads pending
+ * to support a performance optimization
+ */
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
* support hot removal
@@ -923,6 +926,12 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
}
}
+static inline void mirror_rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+{
+ atomic_dec(&rdev->nr_reads_pending);
+ rdev_dec_pending(rdev, mddev);
I know that this will cause more code changes, but will it make more
sense to just split writes and reads, with the following APIs?
rdev_inc_pending(rdev, rw);
rdev_dec_pending(rdev, rw);
rdev_pending(rdev);
rdev_rw_pending(rdev, rw);
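
Something along these lines, as an untested sketch just to illustrate
the shape (turning nr_pending into a two-element array indexed by
READ/WRITE; all names here are placeholders, not a concrete proposal):

static inline void rdev_inc_pending(struct md_rdev *rdev, int rw)
{
	atomic_inc(&rdev->nr_pending[rw]);
}

static inline int rdev_rw_pending(struct md_rdev *rdev, int rw)
{
	return atomic_read(&rdev->nr_pending[rw]);
}

static inline int rdev_pending(struct md_rdev *rdev)
{
	return rdev_rw_pending(rdev, READ) + rdev_rw_pending(rdev, WRITE);
}

rdev_dec_pending() would of course still need to keep the existing
faulty/recovery handling for when the total drops to zero.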
Thanks,
Kuai
+}
+
extern const struct md_cluster_operations *md_cluster_ops;
static inline int mddev_is_clustered(struct mddev *mddev)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f55c8e67d059..5315b46d2cca 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -394,7 +394,7 @@ static void raid1_end_read_request(struct bio *bio)
if (uptodate) {
raid_end_bio_io(r1_bio);
- rdev_dec_pending(rdev, conf->mddev);
+ mirror_rdev_dec_pending(rdev, conf->mddev);
} else {
/*
* oops, read error:
@@ -584,6 +584,7 @@ static void update_read_sectors(struct r1conf *conf, int disk,
struct raid1_info *info = &conf->mirrors[disk];
atomic_inc(&info->rdev->nr_pending);
+ atomic_inc(&info->rdev->nr_reads_pending);
if (info->next_seq_sect != this_sector)
info->seq_start = this_sector;
info->next_seq_sect = this_sector + len;
@@ -760,9 +761,11 @@ struct read_balance_ctl {
int readable_disks;
};
+#define WRITE_WEIGHT 2
static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
{
int disk;
+ int nonrot = READ_ONCE(conf->nonrot_disks);
struct read_balance_ctl ctl = {
.closest_dist_disk = -1,
.closest_dist = MaxSector,
@@ -774,7 +777,7 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
sector_t dist;
- unsigned int pending;
+ unsigned int total_pending, reads_pending;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
@@ -787,7 +790,21 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state);
- pending = atomic_read(&rdev->nr_pending);
+ total_pending = atomic_read(&rdev->nr_pending);
+ if (nonrot) {
+ /* for nonrot we weigh writes slightly heavier than
+ * reads when deciding disk based on pending IOs as
+ * writes typically take longer
+ */
+ reads_pending = atomic_read(&rdev->nr_reads_pending);
+ if (total_pending > reads_pending) {
+ int writes;
+
+ writes = total_pending - reads_pending;
+ writes += (writes >> WRITE_WEIGHT);
+ total_pending = writes + reads_pending;
+ }
+ }
dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
/* Don't change to another disk for sequential reads */
@@ -799,7 +816,7 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
* Add 'pending' to avoid choosing this disk if
* there is other idle disk.
*/
- pending++;
+ total_pending++;
/*
* If there is no other idle disk, this disk
* will be chosen.
@@ -807,8 +824,8 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
ctl.sequential_disk = disk;
}
- if (ctl.min_pending > pending) {
- ctl.min_pending = pending;
+ if (ctl.min_pending > total_pending) {
+ ctl.min_pending = total_pending;
ctl.min_pending_disk = disk;
}
@@ -831,8 +848,7 @@ static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
* disk is rotational, which might/might not be optimal for raids with
* mixed ratation/non-rotational disks depending on workload.
*/
- if (ctl.min_pending_disk != -1 &&
- (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+ if (ctl.min_pending_disk != -1 && (nonrot || ctl.min_pending == 0))
return ctl.min_pending_disk;
else
return ctl.closest_dist_disk;
@@ -2622,7 +2638,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
}
- rdev_dec_pending(rdev, conf->mddev);
+ mirror_rdev_dec_pending(rdev, conf->mddev);
sector = r1_bio->sector;
bio = r1_bio->master_bio;