To minimize overhead, we call down_{read,write}_trylock first. On success, we record the acquisition with "0" latency. On failure, we call e.g. down_{read,write} and record the actual measured latency. The reason for recording uncontended acquisitions is so we have an accurate count of total acquisition attempts. This allows us to compute meaningful percentiles over the histogram data - e.g., "x% of lock acquisitions succeeded in <= y ms". Note that in the killable lock case, for example, we record the latency even if the acquisition doesn't actually succeed. This is so we can see cases where we waited a long time, even if acquisition eventually ended up failing. Nested locks are a weird case, in that we can't do the "only compute latency in the contended case" optimization without reaching into rwsem.c's internal API. The approach this commit uses (removing "static inline" from the couple of functions we need, and forward declaring them in mmap_lock.c) was chosen instead of: - Adding mmap_lock specific code to rwsem.c directly, as this would pollute a generic file with a specific use case's implementation. - Moving the API internals we need up into rwsem.h, as this would expose them more widely than arguably we'd like. - Just computing latency in all cases, as this would add nontrivial overhead to the uncontended lock case. 
Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx> --- include/linux/mmap_lock.h | 87 +++++++++++++++++++++++++++++++++++---- kernel/locking/rwsem.c | 4 +- mm/Makefile | 1 + mm/mmap_lock.c | 34 +++++++++++++++ 4 files changed, 117 insertions(+), 9 deletions(-) create mode 100644 mm/mmap_lock.c diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9dc632add390..1c212a403911 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,11 +1,80 @@ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +#include <linux/histogram.h> +#include <linux/mm_types.h> #include <linux/mmdebug.h> +#include <linux/types.h> +#include <linux/sched/clock.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER(name.mmap_lock), +#ifdef CONFIG_MMAP_LOCK_HISTOGRAMS +static inline void __mmap_lock_histogram_record_duration(struct mm_struct *mm, + u64 duration) +{ + if (likely(mm->mmap_lock_contention)) + histogram_record_rcu(mm->mmap_lock_contention, duration, 1); +} + +static inline void mmap_lock_histogram_record(struct mm_struct *mm, + u64 start_time_ns) +{ + __mmap_lock_histogram_record_duration(mm, + sched_clock() - start_time_ns); +} +#endif + +static inline bool __mmap_trylock(struct mm_struct *mm, + int (*trylock)(struct rw_semaphore *)) +{ + bool ret = trylock(&mm->mmap_lock) != 0; + +#ifdef CONFIG_MMAP_LOCK_HISTOGRAMS + if (ret) + __mmap_lock_histogram_record_duration(mm, 0); +#endif + return ret; +} + +static inline void __mmap_lock(struct mm_struct *mm, + int (*trylock)(struct rw_semaphore *), + void (*lock)(struct rw_semaphore *)) +{ +#ifdef CONFIG_MMAP_LOCK_HISTOGRAMS + u64 start_time_ns; + + if (!__mmap_trylock(mm, trylock)) { + start_time_ns = sched_clock(); + lock(&mm->mmap_lock); + mmap_lock_histogram_record(mm, start_time_ns); + } +#else + lock(&mm->mmap_lock); +#endif +} + +static inline int __mmap_lock_return(struct mm_struct *mm, + int (*trylock)(struct rw_semaphore *), + int (*lock)(struct rw_semaphore *)) 
+{ +#ifdef CONFIG_MMAP_LOCK_HISTOGRAMS + u64 start_time_ns; + int ret; + + if (!__mmap_trylock(mm, trylock)) { + start_time_ns = sched_clock(); + ret = lock(&mm->mmap_lock); + mmap_lock_histogram_record(mm, start_time_ns); + return ret; + } + return 0; +#else + return lock(&mm->mmap_lock); +#endif +} + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); @@ -13,22 +82,26 @@ static inline void mmap_init_lock(struct mm_struct *mm) static inline void mmap_write_lock(struct mm_struct *mm) { - down_write(&mm->mmap_lock); + __mmap_lock(mm, down_write_trylock, down_write); } +#ifdef CONFIG_MMAP_LOCK_HISTOGRAMS +void mmap_write_lock_nested(struct mm_struct *mm, int subclass); +#else static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { down_write_nested(&mm->mmap_lock, subclass); } +#endif static inline int mmap_write_lock_killable(struct mm_struct *mm) { - return down_write_killable(&mm->mmap_lock); + return __mmap_lock_return(mm, down_write_trylock, down_write_killable); } static inline bool mmap_write_trylock(struct mm_struct *mm) { - return down_write_trylock(&mm->mmap_lock) != 0; + return __mmap_trylock(mm, down_write_trylock); } static inline void mmap_write_unlock(struct mm_struct *mm) @@ -43,17 +116,17 @@ static inline void mmap_write_downgrade(struct mm_struct *mm) static inline void mmap_read_lock(struct mm_struct *mm) { - down_read(&mm->mmap_lock); + __mmap_lock(mm, down_read_trylock, down_read); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { - return down_read_killable(&mm->mmap_lock); + return __mmap_lock_return(mm, down_read_trylock, down_read_killable); } static inline bool mmap_read_trylock(struct mm_struct *mm) { - return down_read_trylock(&mm->mmap_lock) != 0; + return __mmap_trylock(mm, down_read_trylock); } static inline void mmap_read_unlock(struct mm_struct *mm) @@ -63,7 +136,7 @@ static inline void mmap_read_unlock(struct mm_struct *mm) static inline bool 
mmap_read_trylock_non_owner(struct mm_struct *mm) { - if (down_read_trylock(&mm->mmap_lock)) { + if (__mmap_trylock(mm, down_read_trylock)) { rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); return true; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f11b9bd3431d..d041c5ae8b4d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1380,7 +1380,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) +void __down_write(struct rw_semaphore *sem) { long tmp = RWSEM_UNLOCKED_VALUE; @@ -1405,7 +1405,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem) return 0; } -static inline int __down_write_trylock(struct rw_semaphore *sem) +int __down_write_trylock(struct rw_semaphore *sem) { long tmp; diff --git a/mm/Makefile b/mm/Makefile index fccd3756b25f..2a54af3fc715 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -112,3 +112,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_MMAP_LOCK_HISTOGRAMS) += mmap_lock.o diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c new file mode 100644 index 000000000000..f166cf40c60a --- /dev/null +++ b/mm/mmap_lock.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/lockdep.h> +#include <linux/mmap_lock.h> + +#define MMAP_LOCK_CONTENDED(_mm, try, lock) \ + do { \ + if (!try(&(_mm)->mmap_lock)) { \ + u64 start_time_ns; \ + lock_contended(&(_mm)->mmap_lock.dep_map, _RET_IP_); \ + start_time_ns = sched_clock(); \ + lock(&(_mm)->mmap_lock); \ + mmap_lock_histogram_record(_mm, start_time_ns); \ + } else { \ + __mmap_lock_histogram_record_duration(_mm, 0); \ + } \ + lock_acquired(&(_mm)->mmap_lock.dep_map, _RET_IP_); \ + } while (0) + +/* Defined in kernel/locking/rwsem.c. 
*/ +extern int __down_write_trylock(struct rw_semaphore *sem); +extern void __down_write(struct rw_semaphore *sem); + +void mmap_write_lock_nested(struct mm_struct *mm, int subclass) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + might_sleep(); + rwsem_acquire(&mm->mmap_lock.dep_map, subclass, 0, _RET_IP_); + MMAP_LOCK_CONTENDED(mm, __down_write_trylock, __down_write); +#else + mmap_write_lock(mm); +#endif +} +EXPORT_SYMBOL(mmap_write_lock_nested); -- 2.27.0.rc0.183.gde8f92d652-goog