From: Gal Ofri <gal.ofri@xxxxxxxxxx>

There is lock contention on device_lock in read_one_chunk().
device_lock is taken to sync conf->active_aligned_reads and
conf->quiesce.
read_one_chunk() takes the lock, then waits for quiesce=0 (resumed)
before incrementing active_aligned_reads.
raid5_quiesce() takes the lock, sets quiesce=2 (in-progress), then
waits for active_aligned_reads to be zero before setting quiesce=1
(suspended).

Introduce a new rwlock for read_one_chunk() and raid5_quiesce().
device_lock is not needed to protect concurrent access to
active_aligned_reads (it is already atomic), so replace it with a read
lock. In order to retain the sync of conf->active_aligned_reads with
conf->quiesce, take the write lock in raid5_quiesce(). This way we
still drain active io before quiescing, and prevent new io from being
activated while quiescent.

raid5_quiesce() uses un/lock_all_device_hash_locks_irq() for locking.
We cannot remove device_lock from there, so rename
un/lock_all_device_hash_locks_irq() to un/lock_all_quiesce_locks_irq().

My setups:
1. 8 local nvme drives (each up to 250k iops).
2. 8 ram disks (brd).
Each setup runs raid6 (6+2) with group_thread_cnt=8 and 1024 io
threads on a 96-core system (48 cores per socket).
I recorded both iops and the cpu spent on this contention with
rand-read-4k, and bw with sequential-read-128k.
Note: in most cases the cpu is still busy, but due to "new" bottlenecks.

nvme:
              | iops           | cpu  | bw
-----------------------------------------------
without patch | 1.6M           | ~50% | 5.5GB/s
with patch    | 2M (throttled) | <10% | 5.5GB/s

ram (brd):
              | iops | cpu  | bw
-----------------------------------------------
without patch | 2M   | ~80% | 24GB/s
with patch    | 3.9M | <10% | 50GB/s

CC: Song Liu <song@xxxxxxxxxx>
CC: Neil Brown <neilb@xxxxxxx>
Signed-off-by: Gal Ofri <gal.ofri@xxxxxxxxxx>
---
* Should I break the patch into two commits (one renaming the function
  and one for the rest of the change)?
* Note: I tried to use a simple spinlock rather than an rwlock, but the
  contention remained that way.
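* For reviewers: below is a minimal standalone sketch of the
  synchronization pattern the patch relies on. It is an illustration
  only, not the raid5 code itself, and all demo_* names are made up
  for the example. Aligned reads take the lock for read, so they no
  longer contend with each other; the quiescer takes it for write,
  which both drains in-flight readers and blocks new ones while
  quiesce != 0.

#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/wait.h>

static DEFINE_RWLOCK(demo_aligned_reads_lock);	/* illustrative only */
static atomic_t demo_active_reads = ATOMIC_INIT(0);
static int demo_quiesce;
static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

/* Hot path: many concurrent callers, read lock only. */
static void demo_read_start(void)
{
	read_lock_irq(&demo_aligned_reads_lock);
	/* drop the lock while sleeping, retake it before rechecking */
	wait_event_cmd(demo_wait, demo_quiesce == 0,
		       read_unlock_irq(&demo_aligned_reads_lock),
		       read_lock_irq(&demo_aligned_reads_lock));
	atomic_inc(&demo_active_reads);
	read_unlock_irq(&demo_aligned_reads_lock);
}

static void demo_read_done(void)
{
	if (atomic_dec_and_test(&demo_active_reads))
		wake_up(&demo_wait);
}

/* Slow path: the write lock excludes all readers while draining. */
static void demo_quiesce_begin(void)
{
	write_lock_irq(&demo_aligned_reads_lock);
	demo_quiesce = 2;		/* drain in progress */
	wait_event_cmd(demo_wait, atomic_read(&demo_active_reads) == 0,
		       write_unlock_irq(&demo_aligned_reads_lock),
		       write_lock_irq(&demo_aligned_reads_lock));
	demo_quiesce = 1;		/* fully quiesced */
	write_unlock_irq(&demo_aligned_reads_lock);
}

static void demo_quiesce_end(void)
{
	write_lock_irq(&demo_aligned_reads_lock);
	demo_quiesce = 0;
	wake_up(&demo_wait);
	write_unlock_irq(&demo_aligned_reads_lock);
}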
---
 drivers/md/raid5.c | 31 ++++++++++++++++++-------------
 drivers/md/raid5.h |  1 +
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d4ff8a5c55e..afc32350a3f8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -90,18 +90,20 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 	spin_unlock_irq(conf->hash_locks + hash);
 }
 
-static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+static inline void lock_all_quiesce_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
+	write_lock(&conf->aligned_reads_lock);
 }
 
-static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+static inline void unlock_all_quiesce_locks_irq(struct r5conf *conf)
 {
 	int i;
+	write_unlock(&conf->aligned_reads_lock);
 	spin_unlock(&conf->device_lock);
 	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
 		spin_unlock(conf->hash_locks + i);
@@ -5443,11 +5445,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
+	/* Ensure that active_aligned_reads and quiesce are synced */
+	read_lock_irq(&conf->aligned_reads_lock);
+	wait_event_cmd(conf->wait_for_quiescent, conf->quiesce == 0,
+		       read_unlock_irq(&conf->aligned_reads_lock),
+		       read_lock_irq(&conf->aligned_reads_lock));
 	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+	read_unlock_irq(&conf->aligned_reads_lock);
 
 	if (mddev->gendisk)
 		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -7198,6 +7202,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	} else
 		goto abort;
 	spin_lock_init(&conf->device_lock);
+	rwlock_init(&conf->aligned_reads_lock);
 	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
 	mutex_init(&conf->cache_size_mutex);
 	init_waitqueue_head(&conf->wait_for_quiescent);
@@ -7255,7 +7260,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	/* We init hash_locks[0] separately to that it can be used
 	 * as the reference lock in the spin_lock_nest_lock() call
-	 * in lock_all_device_hash_locks_irq in order to convince
+	 * in lock_all_quiesce_locks_irq in order to convince
 	 * lockdep that we know what we are doing.
 	 */
 	spin_lock_init(conf->hash_locks);
@@ -8329,7 +8334,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 
 	if (quiesce) {
 		/* stop all writes */
-		lock_all_device_hash_locks_irq(conf);
+		lock_all_quiesce_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
@@ -8338,19 +8343,19 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
-				    unlock_all_device_hash_locks_irq(conf),
-				    lock_all_device_hash_locks_irq(conf));
+				    unlock_all_quiesce_locks_irq(conf),
+				    lock_all_quiesce_locks_irq(conf));
 		conf->quiesce = 1;
-		unlock_all_device_hash_locks_irq(conf);
+		unlock_all_quiesce_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
 	} else {
 		/* re-enable writes */
-		lock_all_device_hash_locks_irq(conf);
+		lock_all_quiesce_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
-		unlock_all_device_hash_locks_irq(conf);
+		unlock_all_quiesce_locks_irq(conf);
 	}
 	log_quiesce(conf, quiesce);
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 5c05acf20e1f..16ccd9e64e6a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -610,6 +610,7 @@ struct r5conf {
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;
+	rwlock_t		aligned_reads_lock; /* protect active_aligned_reads from quiesce */
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
-- 
2.25.1