This change extends the MM locking API to pass ranges. The ranges will be used to implement range locking, but for now we only check that the passed ranges match between lock and unlock calls. Add a new CONFIG_MM_LOCK_RWSEM_CHECKED config option to verify that ranges are correctly paired across lock/unlock function calls. To ensure an easy transition, the existing coarse MM locking calls are using a default range, which is represented by a per-task structure. This allows a task's paired coarse lock/unlock calls to be translated into correctly paired struct mm_lock_range locks and unlocks. Add some small additional changes to kernel/fork.c (dup_mmap had a single task locking two MM's at once, so it has to explicitly manage the corresponding struct mm_lock_range) and kernel/bpf/stackmap.c (dumping user stacks from interrupt context requires explicit tracking of struct mm_lock_range). Signed-off-by: Michel Lespinasse <walken@xxxxxxxxxx> --- arch/um/include/asm/mmu_context.h | 4 +- include/linux/mm_lock.h | 148 ++++++++++++++++++++++++++++-- include/linux/mm_types_task.h | 6 ++ include/linux/sched.h | 2 + init/init_task.c | 1 + kernel/bpf/stackmap.c | 26 ++++-- kernel/fork.c | 7 +- mm/Kconfig | 18 ++++ mm/Makefile | 1 + mm/mm_lock_rwsem_checked.c | 131 ++++++++++++++++++++++++++ 10 files changed, 322 insertions(+), 22 deletions(-) create mode 100644 mm/mm_lock_rwsem_checked.c diff --git arch/um/include/asm/mmu_context.h arch/um/include/asm/mmu_context.h index 7bd591231e2d..2e84e7d98141 100644 --- arch/um/include/asm/mmu_context.h +++ arch/um/include/asm/mmu_context.h @@ -47,12 +47,14 @@ extern void force_flush_all(void); static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) { + struct mm_lock_range mm_range = MM_COARSE_LOCK_RANGE_INITIALIZER; + /* * This is called by fs/exec.c and sys_unshare() * when the new ->mm is used for the first time. 
*/ __switch_mm(&new->context.id); - down_write_nested(&new->mmap_sem, 1); + mm_write_range_lock_nested(new, &mm_range, 1); uml_setup_stubs(new); mm_write_unlock(new); } diff --git include/linux/mm_lock.h include/linux/mm_lock.h index b5f134285e53..8ed92ebe58a1 100644 --- include/linux/mm_lock.h +++ include/linux/mm_lock.h @@ -1,56 +1,186 @@ #ifndef _LINUX_MM_LOCK_H #define _LINUX_MM_LOCK_H +#include <linux/sched.h> + static inline void mm_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_sem); } -static inline void mm_write_lock(struct mm_struct *mm) +#ifdef CONFIG_MM_LOCK_RWSEM_INLINE + +#define MM_COARSE_LOCK_RANGE_INITIALIZER {} + +static inline void mm_init_coarse_lock_range(struct mm_lock_range *range) {} + +static inline void mm_write_range_lock(struct mm_struct *mm, + struct mm_lock_range *range) { down_write(&mm->mmap_sem); } -static inline int mm_write_lock_killable(struct mm_struct *mm) +static inline void mm_write_range_lock_nested(struct mm_struct *mm, + struct mm_lock_range *range, + int subclass) +{ + down_write_nested(&mm->mmap_sem, subclass); +} + +static inline int mm_write_range_lock_killable(struct mm_struct *mm, + struct mm_lock_range *range) { return down_write_killable(&mm->mmap_sem); } -static inline bool mm_write_trylock(struct mm_struct *mm) +static inline bool mm_write_range_trylock(struct mm_struct *mm, + struct mm_lock_range *range) { return down_write_trylock(&mm->mmap_sem) != 0; } -static inline void mm_write_unlock(struct mm_struct *mm) +static inline void mm_write_range_unlock(struct mm_struct *mm, + struct mm_lock_range *range) { up_write(&mm->mmap_sem); } -static inline void mm_downgrade_write_lock(struct mm_struct *mm) +static inline void mm_downgrade_write_range_lock(struct mm_struct *mm, + struct mm_lock_range *range) { downgrade_write(&mm->mmap_sem); } -static inline void mm_read_lock(struct mm_struct *mm) +static inline void mm_read_range_lock(struct mm_struct *mm, + struct mm_lock_range *range) { 
down_read(&mm->mmap_sem); } -static inline int mm_read_lock_killable(struct mm_struct *mm) +static inline int mm_read_range_lock_killable(struct mm_struct *mm, + struct mm_lock_range *range) { return down_read_killable(&mm->mmap_sem); } -static inline bool mm_read_trylock(struct mm_struct *mm) +static inline bool mm_read_range_trylock(struct mm_struct *mm, + struct mm_lock_range *range) { return down_read_trylock(&mm->mmap_sem) != 0; } -static inline void mm_read_unlock(struct mm_struct *mm) +static inline void mm_read_range_unlock(struct mm_struct *mm, + struct mm_lock_range *range) { up_read(&mm->mmap_sem); } +static inline void mm_read_range_unlock_non_owner(struct mm_struct *mm, + struct mm_lock_range *range) +{ + up_read_non_owner(&mm->mmap_sem); +} + +static inline struct mm_lock_range *mm_coarse_lock_range(void) +{ + return NULL; +} + +#else /* CONFIG_MM_LOCK_RWSEM_CHECKED */ + +#define MM_COARSE_LOCK_RANGE_INITIALIZER { .mm = NULL } + +static inline void mm_init_coarse_lock_range(struct mm_lock_range *range) +{ + range->mm = NULL; +} + +extern void mm_write_range_lock(struct mm_struct *mm, + struct mm_lock_range *range); +#ifdef CONFIG_LOCKDEP +extern void mm_write_range_lock_nested(struct mm_struct *mm, + struct mm_lock_range *range, + int subclass); +#else +#define mm_write_range_lock_nested(mm, range, subclass) \ + mm_write_range_lock(mm, range) +#endif +extern int mm_write_range_lock_killable(struct mm_struct *mm, + struct mm_lock_range *range); +extern bool mm_write_range_trylock(struct mm_struct *mm, + struct mm_lock_range *range); +extern void mm_write_range_unlock(struct mm_struct *mm, + struct mm_lock_range *range); +extern void mm_downgrade_write_range_lock(struct mm_struct *mm, + struct mm_lock_range *range); +extern void mm_read_range_lock(struct mm_struct *mm, + struct mm_lock_range *range); +extern int mm_read_range_lock_killable(struct mm_struct *mm, + struct mm_lock_range *range); +extern bool mm_read_range_trylock(struct mm_struct *mm, + 
struct mm_lock_range *range); +extern void mm_read_range_unlock(struct mm_struct *mm, + struct mm_lock_range *range); +extern void mm_read_range_unlock_non_owner(struct mm_struct *mm, + struct mm_lock_range *range); + +static inline struct mm_lock_range *mm_coarse_lock_range(void) +{ + return &current->mm_coarse_lock_range; +} + +#endif + +static inline void mm_read_release(struct mm_struct *mm, unsigned long ip) +{ + rwsem_release(&mm->mmap_sem.dep_map, ip); +} + +static inline void mm_write_lock(struct mm_struct *mm) +{ + mm_write_range_lock(mm, mm_coarse_lock_range()); +} + +static inline int mm_write_lock_killable(struct mm_struct *mm) +{ + return mm_write_range_lock_killable(mm, mm_coarse_lock_range()); +} + +static inline bool mm_write_trylock(struct mm_struct *mm) +{ + return mm_write_range_trylock(mm, mm_coarse_lock_range()); +} + +static inline void mm_write_unlock(struct mm_struct *mm) +{ + mm_write_range_unlock(mm, mm_coarse_lock_range()); +} + +static inline void mm_downgrade_write_lock(struct mm_struct *mm) +{ + mm_downgrade_write_range_lock(mm, mm_coarse_lock_range()); +} + +static inline void mm_read_lock(struct mm_struct *mm) +{ + mm_read_range_lock(mm, mm_coarse_lock_range()); +} + +static inline int mm_read_lock_killable(struct mm_struct *mm) +{ + return mm_read_range_lock_killable(mm, mm_coarse_lock_range()); +} + +static inline bool mm_read_trylock(struct mm_struct *mm) +{ + return mm_read_range_trylock(mm, mm_coarse_lock_range()); +} + +static inline void mm_read_unlock(struct mm_struct *mm) +{ + mm_read_range_unlock(mm, mm_coarse_lock_range()); +} + static inline bool mm_is_locked(struct mm_struct *mm) { return rwsem_is_locked(&mm->mmap_sem) != 0; diff --git include/linux/mm_types_task.h include/linux/mm_types_task.h index c1bc6731125c..d98c2a2293c1 100644 --- include/linux/mm_types_task.h +++ include/linux/mm_types_task.h @@ -96,4 +96,10 @@ struct tlbflush_unmap_batch { #endif }; +struct mm_lock_range { +#ifdef CONFIG_MM_LOCK_RWSEM_CHECKED + struct 
mm_struct *mm; +#endif +}; + #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git include/linux/sched.h include/linux/sched.h index 716ad1d8d95e..c573590076e1 100644 --- include/linux/sched.h +++ include/linux/sched.h @@ -1281,6 +1281,8 @@ struct task_struct { unsigned long prev_lowest_stack; #endif + struct mm_lock_range mm_coarse_lock_range; + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git init/init_task.c init/init_task.c index 9e5cbe5eab7b..ae54f69092a2 100644 --- init/init_task.c +++ init/init_task.c @@ -181,6 +181,7 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif + .mm_coarse_lock_range = MM_COARSE_LOCK_RANGE_INITIALIZER, }; EXPORT_SYMBOL(init_task); diff --git kernel/bpf/stackmap.c kernel/bpf/stackmap.c index 8087d31b6471..ba2399ce00e4 100644 --- kernel/bpf/stackmap.c +++ kernel/bpf/stackmap.c @@ -33,7 +33,8 @@ struct bpf_stack_map { /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; - struct rw_semaphore *sem; + struct mm_struct *mm; + struct mm_lock_range mm_range; }; static void do_up_read(struct irq_work *entry) @@ -41,8 +42,7 @@ static void do_up_read(struct irq_work *entry) struct stack_map_irq_work *work; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read_non_owner(work->sem); - work->sem = NULL; + mm_read_range_unlock_non_owner(work->mm, &work->mm_range); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); @@ -286,12 +286,17 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, struct vm_area_struct *vma; bool irq_work_busy = false; struct stack_map_irq_work *work = NULL; + struct mm_lock_range mm_range = MM_COARSE_LOCK_RANGE_INITIALIZER; + struct mm_lock_range *mm_range_ptr = &mm_range; if (irqs_disabled()) { work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & 
IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { /* cannot queue more up_read, fallback */ irq_work_busy = true; + } else { + mm_range_ptr = &work->mm_range; + } } /* @@ -305,7 +310,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - mm_read_trylock(current->mm) == 0) { + !mm_read_range_trylock(current->mm, mm_range_ptr)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -330,16 +335,16 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mm_read_unlock(current->mm); + mm_read_range_unlock(current->mm, mm_range_ptr); } else { - work->sem = &current->mm->mmap_sem; + work->mm = current->mm; irq_work_queue(&work->irq_work); /* * The irq_work will release the mmap_sem with - * up_read_non_owner(). The rwsem_release() is called - * here to release the lock from lockdep's perspective. + * mm_read_range_unlock_non_owner(). mm_read_release() is + * called here to release the lock from lockdep's perspective. 
*/ - rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_); + mm_read_release(current->mm, _RET_IP_); } } @@ -626,6 +631,7 @@ static int __init stack_map_init(void) for_each_possible_cpu(cpu) { work = per_cpu_ptr(&up_read_work, cpu); init_irq_work(&work->irq_work, do_up_read); + mm_init_coarse_lock_range(&work->mm_range); } return 0; } diff --git kernel/fork.c kernel/fork.c index d598f56e4b1e..3db694381ef5 100644 --- kernel/fork.c +++ kernel/fork.c @@ -486,6 +486,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge; LIST_HEAD(uf); + struct mm_lock_range mm_range = MM_COARSE_LOCK_RANGE_INITIALIZER; uprobe_start_dup_mmap(); if (mm_write_lock_killable(oldmm)) { @@ -497,7 +498,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* * Not linked in yet - no deadlock potential: */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mm_write_range_lock_nested(mm, &mm_range, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. 
*/ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); @@ -612,7 +613,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); out: - mm_write_unlock(mm); + mm_write_range_unlock(mm, &mm_range); flush_tlb_mm(oldmm); mm_write_unlock(oldmm); dup_userfaultfd_complete(&uf); @@ -947,6 +948,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + + mm_init_coarse_lock_range(&tsk->mm_coarse_lock_range); return tsk; free_stack: diff --git mm/Kconfig mm/Kconfig index ab80933be65f..574fb51789a5 100644 --- mm/Kconfig +++ mm/Kconfig @@ -739,4 +739,22 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +choice + prompt "MM lock implementation (mmap_sem)" + default MM_LOCK_RWSEM_CHECKED + +config MM_LOCK_RWSEM_INLINE + bool "rwsem, inline" + help + This option preserves the traditional MM lock implementation as + inline read-write semaphore operations. + +config MM_LOCK_RWSEM_CHECKED + bool "rwsem, checked" + help + This option implements the MM lock using a read-write semaphore, + ignoring the passed address range but checking its validity. 
+ +endchoice + endmenu diff --git mm/Makefile mm/Makefile index 1937cc251883..9f46376c6407 100644 --- mm/Makefile +++ mm/Makefile @@ -108,3 +108,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o +obj-$(CONFIG_MM_LOCK_RWSEM_CHECKED) += mm_lock_rwsem_checked.o diff --git mm/mm_lock_rwsem_checked.c mm/mm_lock_rwsem_checked.c new file mode 100644 index 000000000000..3551deb85e3d --- /dev/null +++ mm/mm_lock_rwsem_checked.c @@ -0,0 +1,131 @@ +#include <linux/mm_lock.h> +#include <linux/printk.h> + +static int mm_lock_debug = 1; + +static void mm_lock_dump(char *msg) { + if (!mm_lock_debug) { + return; + } + mm_lock_debug = 0; + pr_err("mm_lock_dump: %s\n", msg); + dump_stack(); + pr_err("mm_lock_dump: done\n"); +} + +void mm_write_range_lock(struct mm_struct *mm, struct mm_lock_range *range) +{ + if (range->mm != NULL) + mm_lock_dump("mm_write_range_lock"); + down_write(&mm->mmap_sem); + range->mm = mm; +} +EXPORT_SYMBOL(mm_write_range_lock); + +#ifdef CONFIG_LOCKDEP +void mm_write_range_lock_nested(struct mm_struct *mm, + struct mm_lock_range *range, int subclass) +{ + if (range->mm != NULL) + mm_lock_dump("mm_write_range_lock_nested"); + down_write_nested(&mm->mmap_sem, subclass); + range->mm = mm; +} +EXPORT_SYMBOL(mm_write_range_lock_nested); +#endif + +int mm_write_range_lock_killable(struct mm_struct *mm, + struct mm_lock_range *range) +{ + int ret; + if (range->mm != NULL) + mm_lock_dump("mm_write_range_lock_killable"); + ret = down_write_killable(&mm->mmap_sem); + if (!ret) + range->mm = mm; + return ret; +} +EXPORT_SYMBOL(mm_write_range_lock_killable); + +bool mm_write_range_trylock(struct mm_struct *mm, struct mm_lock_range *range) +{ + bool ret = down_write_trylock(&mm->mmap_sem) != 0; + if (ret) { + if (range->mm != NULL) + mm_lock_dump("mm_write_range_trylock"); + range->mm = mm; + } + return ret; +} 
+EXPORT_SYMBOL(mm_write_range_trylock); + +void mm_write_range_unlock(struct mm_struct *mm, struct mm_lock_range *range) +{ + if (range->mm != mm) + mm_lock_dump("mm_write_range_unlock"); + range->mm = NULL; + up_write(&mm->mmap_sem); +} +EXPORT_SYMBOL(mm_write_range_unlock); + +void mm_downgrade_write_range_lock(struct mm_struct *mm, + struct mm_lock_range *range) +{ + if (range->mm != mm) + mm_lock_dump("mm_downgrade_write_range_lock"); + downgrade_write(&mm->mmap_sem); +} +EXPORT_SYMBOL(mm_downgrade_write_range_lock); + +void mm_read_range_lock(struct mm_struct *mm, struct mm_lock_range *range) +{ + if (range->mm != NULL) + mm_lock_dump("mm_read_range_lock"); + down_read(&mm->mmap_sem); + range->mm = mm; +} +EXPORT_SYMBOL(mm_read_range_lock); + +int mm_read_range_lock_killable(struct mm_struct *mm, + struct mm_lock_range *range) +{ + int ret; + if (range->mm != NULL) + mm_lock_dump("mm_read_range_lock_killable"); + ret = down_read_killable(&mm->mmap_sem); + if (!ret) + range->mm = mm; + return ret; +} +EXPORT_SYMBOL(mm_read_range_lock_killable); + +bool mm_read_range_trylock(struct mm_struct *mm, struct mm_lock_range *range) +{ + bool ret; + if (range->mm != NULL) + mm_lock_dump("mm_read_range_trylock"); + ret = down_read_trylock(&mm->mmap_sem) != 0; + if (ret) + range->mm = mm; + return ret; +} +EXPORT_SYMBOL(mm_read_range_trylock); + +void mm_read_range_unlock(struct mm_struct *mm, struct mm_lock_range *range) +{ + if (range->mm != mm) + mm_lock_dump("mm_read_range_unlock"); + range->mm = NULL; + up_read(&mm->mmap_sem); +} +EXPORT_SYMBOL(mm_read_range_unlock); + +void mm_read_range_unlock_non_owner(struct mm_struct *mm, + struct mm_lock_range *range) +{ + if (range->mm != mm) + mm_lock_dump("mm_read_range_unlock_non_owner"); + range->mm = NULL; + up_read_non_owner(&mm->mmap_sem); +} +EXPORT_SYMBOL(mm_read_range_unlock_non_owner); -- 2.25.0.341.g760bfbb309-goog