Under aarch64, __this_cpu_inc() is neither IRQ-safe nor atomic, so when
percpu_up_read() is invoked in IRQ context (e.g. aio completion) and it
interrupts a process on the same CPU which is invoking percpu_down_read(),
the decrement of read_count may be lost and the final value of read_count
on that CPU will be unexpected, as shown below:

CPU 0                           CPU 0 (in IRQ)
io_submit_one
  __sb_start_write
    percpu_down_read
      __this_cpu_inc
        // there is already an inflight IO, so
        // reading *raw_cpu_ptr(&pcp) returns 1
        // half complete, then interrupted
        *raw_cpu_ptr(&pcp) += 1
                                nvme_irq
                                  nvme_complete_cqes
                                    blk_mq_complete_request
                                      nvme_pci_complete_rq
                                        nvme_complete_rq
                                          blk_mq_end_request
                                            blk_update_request
                                              bio_endio
                                                dio_bio_end_aio
                                                  aio_complete_rw
                                                    __sb_end_write
                                                      percpu_up_read
                                                        *raw_cpu_ptr(&pcp) -= 1
                                                        // *raw_cpu_ptr(&pcp) is 0
        // the decrement is overwritten by the increment
        *raw_cpu_ptr(&pcp) += 1
        // the final value is 1 + 1 = 2 instead of 1

Fix it by using the IRQ-safe helpers this_cpu_inc()/this_cpu_dec() for
operations on read_count.

Another plausible fix is to state that percpu-rwsem can NOT be used in
IRQ context and to convert all users which may use it in IRQ context.

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
---
 include/linux/percpu-rwsem.h  | 8 ++++----
 kernel/locking/percpu-rwsem.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5e033fe1ff4e9..5fda40f97fe91 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -60,7 +60,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
	 * anything we did within this RCU-sched read-size critical section.
	 */
	if (likely(rcu_sync_is_idle(&sem->rss)))
-		__this_cpu_inc(*sem->read_count);
+		this_cpu_inc(*sem->read_count);
	else
		__percpu_down_read(sem, false); /* Unconditional memory barrier */
	/*
@@ -79,7 +79,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
	 * Same as in percpu_down_read().
	 */
	if (likely(rcu_sync_is_idle(&sem->rss)))
-		__this_cpu_inc(*sem->read_count);
+		this_cpu_inc(*sem->read_count);
	else
		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
	preempt_enable();
@@ -103,7 +103,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
	 * Same as in percpu_down_read().
	 */
	if (likely(rcu_sync_is_idle(&sem->rss))) {
-		__this_cpu_dec(*sem->read_count);
+		this_cpu_dec(*sem->read_count);
	} else {
		/*
		 * slowpath; reader will only ever wake a single blocked
@@ -115,7 +115,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
		 * aggregate zero, as that is the only time it matters) they
		 * will also see our critical section.
		 */
-		__this_cpu_dec(*sem->read_count);
+		this_cpu_dec(*sem->read_count);
		rcuwait_wake_up(&sem->writer);
	}
	preempt_enable();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3e5203d..70a32a576f3f2 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);

 static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	__this_cpu_inc(*sem->read_count);
+	this_cpu_inc(*sem->read_count);

	/*
	 * Due to having preemption disabled the decrement happens on
@@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
	if (likely(!atomic_read_acquire(&sem->block)))
		return true;

-	__this_cpu_dec(*sem->read_count);
+	this_cpu_dec(*sem->read_count);

	/* Prod writer to re-evaluate readers_active_check() */
	rcuwait_wake_up(&sem->writer);
--
2.25.0.4.g0ad7144999
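
Note: as a rough illustration only (this is not the kernel's actual macro
expansion, and the sketch_*() helper names below are made up), on an
architecture without IRQ-safe per-cpu instructions the difference between
the two helpers boils down to whether the read-modify-write can be split
by an interrupt:

#include <linux/irqflags.h>

/* What the raw per-cpu increment degrades to: a plain load/add/store
 * read-modify-write that an interrupt can slip into the middle of. */
static inline void sketch_raw_cpu_inc(unsigned int *pcp)
{
	unsigned int v = *pcp;	/* IRQ may fire after this load ...          */
	*pcp = v + 1;		/* ... and its decrement is then overwritten */
}

/* What the generic IRQ-safe variant does instead: the same
 * read-modify-write, but with local interrupts disabled around it. */
static inline void sketch_this_cpu_inc(unsigned int *pcp)
{
	unsigned long flags;

	local_irq_save(flags);
	*pcp += 1;
	local_irq_restore(flags);
}

With interrupts disabled across the read-modify-write, the aio-completion
decrement shown in the changelog is applied either entirely before the load
or entirely after the store, so read_count ends up at the expected value.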