From: Gal Ofri <gal.ofri@xxxxxxxxxx>

There is lock contention on device_lock in read_one_chunk().
device_lock is taken to sync conf->active_aligned_reads and
conf->quiesce.
read_one_chunk() takes the lock, then waits for quiesce=0 (resumed)
before incrementing active_aligned_reads.
raid5_quiesce() takes the lock, sets quiesce=2 (in-progress), then
waits for active_aligned_reads to be zero before setting quiesce=1
(suspended).

Introduce a new rwlock for read_one_chunk() and raid5_quiesce().
device_lock is not needed to protect concurrent access to
active_aligned_reads (it is already atomic), so replace it with a read
lock. In order to retain the sync of conf->active_aligned_reads with
conf->quiesce, take the write lock in raid5_quiesce(). This way we
still drain active io before quiescing, and prevent new io from being
activated while quiescent.

raid5_quiesce() uses un/lock_all_device_hash_locks_irq() for locking.
We cannot remove device_lock from there, so rename
un/lock_all_device_hash_locks_irq() to un/lock_all_quiesce_locks_irq().

My setups:
1. 8 local nvme drives (each up to 250k iops).
2. 8 ram disks (brd).
Each setup runs raid6 (6+2) with group_thread_cnt=8 and 1024 io
threads on a 96-core system (48 cores per socket).
I recorded both iops and the cpu spent on this contention with
rand-read-4k, and bw with sequential-read-128k.
Note: in most cases the cpu is still busy, but due to "new" bottlenecks.

nvme:
              | iops           | cpu  | bw
-----------------------------------------------
without patch | 1.6M           | ~50% | 5.5GB/s
with patch    | 2M (throttled) | <10% | 5.5GB/s

ram (brd):
              | iops | cpu  | bw
-----------------------------------------------
without patch | 2M   | ~80% | 24GB/s
with patch    | 3.9M | <10% | 50GB/s

CC: Song Liu <song@xxxxxxxxxx>
CC: Neil Brown <neilb@xxxxxxx>
Signed-off-by: Gal Ofri <gal.ofri@xxxxxxxxxx>
---
* Should I break the patch into two commits (one renaming the function
  and one for the rest of the change)?
* Note: I tried to use a simple spinlock rather than an rwlock, but the
  contention remained that way.
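* For reviewers: below is a minimal standalone sketch of the
  synchronization pattern the patch relies on. It is an illustration
  only, not the raid5 code itself, and all demo_* names are made up
  for the example. Aligned reads take the lock for read, so they no
  longer contend with each other; the quiescer takes it for write,
  which both drains in-flight readers and blocks new ones while
  quiesce != 0.

#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/wait.h>

static DEFINE_RWLOCK(demo_aligned_reads_lock);	/* illustrative only */
static atomic_t demo_active_reads = ATOMIC_INIT(0);
static int demo_quiesce;
static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

/* Hot path: many concurrent callers, read lock only. */
static void demo_read_start(void)
{
	read_lock_irq(&demo_aligned_reads_lock);
	/* drop the lock while sleeping, retake it before rechecking */
	wait_event_cmd(demo_wait, demo_quiesce == 0,
		       read_unlock_irq(&demo_aligned_reads_lock),
		       read_lock_irq(&demo_aligned_reads_lock));
	atomic_inc(&demo_active_reads);
	read_unlock_irq(&demo_aligned_reads_lock);
}

static void demo_read_done(void)
{
	if (atomic_dec_and_test(&demo_active_reads))
		wake_up(&demo_wait);
}

/* Slow path: the write lock excludes all readers while draining. */
static void demo_quiesce_begin(void)
{
	write_lock_irq(&demo_aligned_reads_lock);
	demo_quiesce = 2;		/* drain in progress */
	wait_event_cmd(demo_wait, atomic_read(&demo_active_reads) == 0,
		       write_unlock_irq(&demo_aligned_reads_lock),
		       write_lock_irq(&demo_aligned_reads_lock));
	demo_quiesce = 1;		/* fully quiesced */
	write_unlock_irq(&demo_aligned_reads_lock);
}

static void demo_quiesce_end(void)
{
	write_lock_irq(&demo_aligned_reads_lock);
	demo_quiesce = 0;
	wake_up(&demo_wait);
	write_unlock_irq(&demo_aligned_reads_lock);
}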
---
 drivers/md/raid5.c | 31 ++++++++++++++++++-------------
 drivers/md/raid5.h |  1 +
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d4ff8a5c55e..afc32350a3f8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -90,18 +90,20 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 	spin_unlock_irq(conf->hash_locks + hash);
 }
 
-static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+static inline void lock_all_quiesce_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
+	write_lock(&conf->aligned_reads_lock);
 }
 
-static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+static inline void unlock_all_quiesce_locks_irq(struct r5conf *conf)
 {
 	int i;
+	write_unlock(&conf->aligned_reads_lock);
 	spin_unlock(&conf->device_lock);
 	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
 		spin_unlock(conf->hash_locks + i);
@@ -5443,11 +5445,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
 
-	spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
-			    conf->device_lock);
+	/* Ensure that active_aligned_reads and quiesce are synced */
+	read_lock_irq(&conf->aligned_reads_lock);
+	wait_event_cmd(conf->wait_for_quiescent, conf->quiesce == 0,
+		       read_unlock_irq(&conf->aligned_reads_lock),
+		       read_lock_irq(&conf->aligned_reads_lock));
 	atomic_inc(&conf->active_aligned_reads);
-	spin_unlock_irq(&conf->device_lock);
+	read_unlock_irq(&conf->aligned_reads_lock);
 
 	if (mddev->gendisk)
 		trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
@@ -7198,6 +7202,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	} else
 		goto abort;
 	spin_lock_init(&conf->device_lock);
+	rwlock_init(&conf->aligned_reads_lock);
 	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
 	mutex_init(&conf->cache_size_mutex);
 	init_waitqueue_head(&conf->wait_for_quiescent);
@@ -7255,7 +7260,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	/* We init hash_locks[0] separately to that it can be used
 	 * as the reference lock in the spin_lock_nest_lock() call
-	 * in lock_all_device_hash_locks_irq in order to convince
+	 * in lock_all_quiesce_locks_irq in order to convince
 	 * lockdep that we know what we are doing.
 	 */
 	spin_lock_init(conf->hash_locks);
@@ -8329,7 +8334,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 
 	if (quiesce) {
 		/* stop all writes */
-		lock_all_device_hash_locks_irq(conf);
+		lock_all_quiesce_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
@@ -8338,19 +8343,19 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
-				    unlock_all_device_hash_locks_irq(conf),
-				    lock_all_device_hash_locks_irq(conf));
+				    unlock_all_quiesce_locks_irq(conf),
+				    lock_all_quiesce_locks_irq(conf));
 		conf->quiesce = 1;
-		unlock_all_device_hash_locks_irq(conf);
+		unlock_all_quiesce_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
 	} else {
 		/* re-enable writes */
-		lock_all_device_hash_locks_irq(conf);
+		lock_all_quiesce_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
-		unlock_all_device_hash_locks_irq(conf);
+		unlock_all_quiesce_locks_irq(conf);
 	}
 	log_quiesce(conf, quiesce);
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 5c05acf20e1f..16ccd9e64e6a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -610,6 +610,7 @@ struct r5conf {
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;
+	rwlock_t		aligned_reads_lock; /* protect active_aligned_reads from quiesce */
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
-- 
2.25.1