From: Davidlohr Bueso <dave@xxxxxxxxxxxx>

In handle_mm_fault() we need to remember the range lock that was specified when mmap_sem was first taken, as page fault paths can drop and retake the lock. Although this patch may seem far too big at first, its size is a consequence of keeping the series bisectable, and it makes the later conversion patches quite easy to follow. Furthermore, most of what this patch does is pass a pointer to a stack-allocated 'mmrange' parameter that is later used by the vm_fault structure.

The new interfaces are pretty much all in the following areas:

- vma handling (vma_merge(), vma_adjust(), split_vma(), copy_vma())
- gup family (all except get_user_pages_unlocked(), which internally passes the mmrange)
- mm walking (walk_page_vma())
- mmap/unmap (do_mmap(), do_munmap())
- handle_mm_fault(), fixup_user_fault()

Most of the pain of the patch is updating all the callers in the kernel for this. While tedious, it is not that hard to review, I hope.

The idea is to use a local variable (no concurrency) whenever the mmap_sem is taken and we later end up in page fault paths that retake the lock, i.e.:

    DEFINE_RANGE_LOCK_FULL(mmrange);
    down_write(&mm->mmap_sem);
    some_fn(a, b, c, &mmrange);
    ....
    ....
    ...
    handle_mm_fault(vma, addr, flags, &mmrange);
    ...
    up_write(&mm->mmap_sem);

Semantically nothing changes at all, and 'mmrange' ends up being unused for now. Later patches will make use of the variable once the mmap_sem wrappers replace the straightforward down/up calls.

Compile-tested defconfigs on various non-x86 architectures without breakage.

Signed-off-by: Davidlohr Bueso <dbueso@xxxxxxx>
---
 arch/alpha/mm/fault.c | 3 +- arch/arc/mm/fault.c | 3 +- arch/arm/mm/fault.c | 8 ++- arch/arm/probes/uprobes/core.c | 5 +- arch/arm64/mm/fault.c | 7 ++- arch/cris/mm/fault.c | 3 +- arch/frv/mm/fault.c | 3 +- arch/hexagon/mm/vm_fault.c | 3 +- arch/ia64/mm/fault.c | 3 +- arch/m32r/mm/fault.c | 3 +- arch/m68k/mm/fault.c | 3 +- arch/metag/mm/fault.c | 3 +- arch/microblaze/mm/fault.c | 3 +- arch/mips/kernel/vdso.c | 3 +- arch/mips/mm/fault.c | 3 +- arch/mn10300/mm/fault.c | 3 +- arch/nios2/mm/fault.c | 3 +- arch/openrisc/mm/fault.c | 3 +- arch/parisc/mm/fault.c | 3 +- arch/powerpc/include/asm/mmu_context.h | 3 +- arch/powerpc/include/asm/powernv.h | 5 +- arch/powerpc/mm/copro_fault.c | 4 +- arch/powerpc/mm/fault.c | 3 +- arch/powerpc/platforms/powernv/npu-dma.c | 5 +- arch/riscv/mm/fault.c | 3 +- arch/s390/include/asm/gmap.h | 14 +++-- arch/s390/kvm/gaccess.c | 31 ++++++---- arch/s390/mm/fault.c | 3 +- arch/s390/mm/gmap.c | 80 +++++++++++++++--------- arch/score/mm/fault.c | 3 +- arch/sh/mm/fault.c | 3 +- arch/sparc/mm/fault_32.c | 6 +- arch/sparc/mm/fault_64.c | 3 +- arch/tile/mm/fault.c | 3 +- arch/um/include/asm/mmu_context.h | 3 +- arch/um/kernel/trap.c | 3 +- arch/unicore32/mm/fault.c | 8 ++- arch/x86/entry/vdso/vma.c | 3 +- arch/x86/include/asm/mmu_context.h | 5 +- arch/x86/include/asm/mpx.h | 6 +- arch/x86/mm/fault.c | 3 +- arch/x86/mm/mpx.c | 41 ++++++----- arch/xtensa/mm/fault.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 4 +- drivers/gpu/drm/radeon/radeon_ttm.c | 4 +- drivers/infiniband/core/umem.c | 3 +- drivers/infiniband/core/umem_odp.c | 3 +- drivers/infiniband/hw/qib/qib_user_pages.c | 7 ++- drivers/infiniband/hw/usnic/usnic_uiom.c | 3 +- drivers/iommu/amd_iommu_v2.c | 5 +- drivers/iommu/intel-svm.c | 5 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 18 ++++-- drivers/misc/mic/scif/scif_rma.c | 3 +- drivers/misc/sgi-gru/grufault.c | 43 ++++++----- drivers/vfio/vfio_iommu_type1.c | 3 +- fs/aio.c | 3 +-
fs/binfmt_elf.c | 3 +- fs/exec.c | 20 ++++-- fs/proc/internal.h | 3 + fs/proc/task_mmu.c | 29 ++++++--- fs/proc/vmcore.c | 14 ++++- fs/userfaultfd.c | 18 +++--- include/asm-generic/mm_hooks.h | 3 +- include/linux/hmm.h | 4 +- include/linux/ksm.h | 6 +- include/linux/migrate.h | 4 +- include/linux/mm.h | 73 +++++++++++++--------- include/linux/uprobes.h | 15 +++-- ipc/shm.c | 14 +++-- kernel/events/uprobes.c | 49 +++++++++------ kernel/futex.c | 3 +- mm/frame_vector.c | 4 +- mm/gup.c | 60 ++++++++++-------- mm/hmm.c | 37 ++++++----- mm/internal.h | 3 +- mm/ksm.c | 24 +++++--- mm/madvise.c | 58 ++++++++++------- mm/memcontrol.c | 13 ++-- mm/memory.c | 10 +-- mm/mempolicy.c | 35 ++++++----- mm/migrate.c | 20 +++--- mm/mincore.c | 24 +++++--- mm/mlock.c | 33 ++++++---- mm/mmap.c | 99 +++++++++++++++++------------- mm/mprotect.c | 14 +++-- mm/mremap.c | 30 +++++---- mm/nommu.c | 32 ++++++---- mm/pagewalk.c | 56 +++++++++-------- mm/process_vm_access.c | 4 +- mm/util.c | 3 +- security/tomoyo/domain.c | 3 +- virt/kvm/async_pf.c | 3 +- virt/kvm/kvm_main.c | 16 +++-- 94 files changed, 784 insertions(+), 474 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index cd3c572ee912..690d86a00a20 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -90,6 +90,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, int fault, si_code = SEGV_MAPERR; siginfo_t info; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); /* As of EV6, a load into $31/$f31 is a prefetch, and never faults (or is suppressed by the PALcode). Support that for older CPUs @@ -148,7 +149,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index a0b7bd6d030d..e423f764f159 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -69,6 +69,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) int fault, ret; int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); /* * We fault-in kernel-space virtual memory on-demand. The @@ -137,7 +138,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ if (unlikely(fatal_signal_pending(current))) { diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index b75eada23d0a..99ae40b5851a 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -221,7 +221,8 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) static int __kprobes __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, - unsigned int flags, struct task_struct *tsk) + unsigned int flags, struct task_struct *tsk, + struct range_lock *mmrange) { struct vm_area_struct *vma; int fault; @@ -243,7 +244,7 @@ __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr, goto out; } - return handle_mm_fault(vma, addr & PAGE_MASK, flags); + return handle_mm_fault(vma, addr & PAGE_MASK, flags, mmrange); check_stack: /* Don't allow expansion below FIRST_USER_ADDRESS */ @@ -261,6 +262,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct mm_struct *mm; int fault, sig, code; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); if (notify_page_fault(regs, fsr)) return 0; @@ -308,7 +310,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) #endif } - fault = __do_page_fault(mm, addr, fsr, flags, tsk); + fault = __do_page_fault(mm, addr, fsr, flags, tsk, &mmrange); /* If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_sem because diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index d1329f1ba4e4..e8b893eaebcf 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,10 +30,11 @@ bool is_swbp_insn(uprobe_opcode_t *insn) } int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr) + unsigned long vaddr, struct range_lock *mmrange) { return uprobe_write_opcode(mm, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), + mmrange); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index ce441d29e7f6..1f3ad9e4f214 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -342,7 +342,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re static int __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, - struct task_struct *tsk) + struct task_struct *tsk, struct range_lock *mmrange) { struct vm_area_struct *vma; int fault; @@ -368,7 +368,7 @@ static int __do_page_fault(struct mm_struct *mm, unsigned long addr, goto out; } - return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); + return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, mmrange); check_stack: if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) @@ -390,6 +390,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, int fault, sig, code, major = 0; unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); if (notify_page_fault(regs, esr)) return 0; @@ -450,7 +451,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, #endif } - fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk); + fault 
= __do_page_fault(mm, addr, mm_flags, vm_flags, tsk, &mmrange); major |= fault & VM_FAULT_MAJOR; if (fault & VM_FAULT_RETRY) { diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 29cc58038b98..16af16d77269 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -61,6 +61,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, siginfo_t info; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); D(printk(KERN_DEBUG "Page fault for %lX on %X at %lX, prot %d write %d\n", @@ -170,7 +171,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index cbe7aec863e3..494d33b628fc 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -41,6 +41,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear pud_t *pue; pte_t *pte; int fault; + DEFINE_RANGE_LOCK_FULL(mmrange); #if 0 const char *atxc[16] = { @@ -165,7 +166,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, ear0, flags); + fault = handle_mm_fault(vma, ear0, flags, &mmrange); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 3eec33c5cfd7..7d6ada2c2230 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -55,6 +55,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) int fault; const struct exception_table_entry *fixup; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); /* * If we're in an interrupt or have no user context, @@ -102,7 +103,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) break; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index dfdc152d6737..44f0ec5f77c2 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -89,6 +89,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re unsigned long mask; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); @@ -162,7 +163,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re * sure we exit gracefully rather than endlessly redo the * fault. 
*/ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 46d9a5ca0e3a..0129aea46729 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -82,6 +82,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long flags = 0; int fault; siginfo_t info; + DEFINE_RANGE_LOCK_FULL(mmrange); /* * If BPSW IE bit enable --> set PSW IE bit @@ -197,7 +198,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags, &mmrange); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 03253c4f8e6a..ec32a193726f 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -75,6 +75,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, struct vm_area_struct * vma; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); pr_debug("do page fault:\nregs->sr=%#x, regs->pc=%#lx, address=%#lx, %ld, %p\n", regs->sr, regs->pc, address, error_code, mm ? mm->pgd : NULL); @@ -138,7 +139,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); pr_debug("handle_mm_fault returns %d\n", fault); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index de54fe686080..e16ba0ea7ea1 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -56,6 +56,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, siginfo_t info; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); tsk = current; @@ -135,7 +136,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index f91b30f8aaa8..fd49efbdfbf4 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -93,6 +93,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = error_code & ESR_S; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); regs->ear = address; regs->esr = error_code; @@ -216,7 +217,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 019035d7225c..56b7c29991db 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -102,6 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) unsigned long gic_size, vvar_size, size, base, data_addr, vdso_addr, gic_pfn; struct vm_area_struct *vma; int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -110,7 +111,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ|VM_WRITE|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - 0, NULL); + 0, NULL, &mmrange); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 4f8f5bf46977..1433edd01d09 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -47,6 +47,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + DEFINE_RANGE_LOCK_FULL(mmrange); #if 0 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), @@ -152,7 +153,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index f0bfa1448744..71c38f0c8702 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -125,6 +125,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, siginfo_t info; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); #ifdef CONFIG_GDBSTUB /* handle GDB stub causing a fault */ @@ -254,7 +255,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index b804dd06ea1c..768678b685af 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -49,6 +49,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, int code = SEGV_MAPERR; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); cause >>= 2; @@ -132,7 +133,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index d0021dfae20a..75ddb1e8e7e7 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -55,6 +55,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, siginfo_t info; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); tsk = current; @@ -163,7 +164,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index e247edbca68e..79db33a0cb0c 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -264,6 +264,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long acc_type; int fault = 0; unsigned int flags; + DEFINE_RANGE_LOCK_FULL(mmrange); if (faulthandler_disabled()) goto no_context; @@ -301,7 +302,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, * fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 051b3d63afe3..089b3cf948eb 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -176,7 +176,8 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { if (start <= mm->context.vdso_base && mm->context.vdso_base < end) mm->context.vdso_base = 0; diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index dc5f6a5d4575..805ff3ba94e1 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -21,7 +21,7 @@ extern void pnv_npu2_destroy_context(struct npu_context *context, struct pci_dev *gpdev); extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, - int count); + int count, struct range_lock *mmrange); void pnv_tm_init(void); #else @@ -35,7 +35,8 @@ static inline void pnv_npu2_destroy_context(struct npu_context *context, static inline int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, - unsigned long *status, int count) { + unsigned long *status, int count, + struct range_lock *mmrange) { return -ENODEV; } diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 697b70ad1195..8f5e604828a1 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -39,6 +39,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, struct vm_area_struct *vma; unsigned long is_write; int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); if (mm == NULL) return -EFAULT; @@ -77,7 +78,8 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0); + *flt = handle_mm_fault(vma, ea, is_write ?
FAULT_FLAG_WRITE : 0, + &mmrange); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 866446cf2d9a..d562dc88687d 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -399,6 +399,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = page_fault_is_write(error_code); int fault, major = 0; bool store_update_sp = false; + DEFINE_RANGE_LOCK_FULL(mmrange); if (notify_page_fault(regs)) return 0; @@ -514,7 +515,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); #ifdef CONFIG_PPC_MEM_KEYS /* diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 0a253b64ac5f..759e9a4c7479 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -789,7 +789,8 @@ EXPORT_SYMBOL(pnv_npu2_destroy_context); * Assumes mmap_sem is held for the contexts associated mm. */ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, - unsigned long *flags, unsigned long *status, int count) + unsigned long *flags, unsigned long *status, + int count, struct range_lock *mmrange) { u64 rc = 0, result = 0; int i, is_write; @@ -807,7 +808,7 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, is_write = flags[i] & NPU2_WRITE; rc = get_user_pages_remote(NULL, mm, ea[i], 1, is_write ? FOLL_WRITE : 0, - page, NULL, NULL); + page, NULL, NULL, mmrange); /* * To support virtualised environments we will have to do an diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 148c98ca9b45..75d15e73ba39 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -42,6 +42,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) unsigned long addr, cause; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; int fault, code = SEGV_MAPERR; + DEFINE_RANGE_LOCK_FULL(mmrange); cause = regs->scause; addr = regs->sbadaddr; @@ -119,7 +120,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags, &mmrange); /* * If we need to retry but a fatal signal is pending, handle the diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index e07cce88dfb0..117c19a947c9 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -107,22 +107,24 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val, + struct range_lock *mmrange); struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake); + int fake, struct range_lock *mmrange); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake); + int fake, struct range_lock *mmrange); int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake); + int fake, struct range_lock *mmrange); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake); + int fake, struct range_lock *mmrange); int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake); -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte, + struct range_lock *mmrange); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index c24bfa72baf7..ff739b86df36 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -978,10 +978,11 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * @saddr: faulting address in the shadow gmap * @pgt: pointer to the page table address result * @fake: pgt references contiguous guest memory block, not a pgtable + * @mmrange: address space range locking */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, - int *fake) + int *fake, struct range_lock *mmrange) { struct gmap *parent; union asce asce; @@ -1034,7 +1035,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rfte.val = ptr; goto shadow_r2t; } - rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val, + mmrange); if (rc) return rc; if (rfte.i) @@ -1047,7 +1049,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, *dat_protection |= rfte.p; ptr = rfte.rto * PAGE_SIZE; shadow_r2t: - rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake, mmrange); if (rc) return rc; /* fallthrough */ @@ -1060,7 +1062,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rste.val = ptr; goto shadow_r3t; } - rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val, + mmrange); if (rc) return rc; if (rste.i) @@ -1074,7 +1077,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, ptr 
= rste.rto * PAGE_SIZE; shadow_r3t: rste.p |= *dat_protection; - rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake, mmrange); if (rc) return rc; /* fallthrough */ @@ -1087,7 +1090,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rtte.val = ptr; goto shadow_sgt; } - rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val, + mmrange); if (rc) return rc; if (rtte.i) @@ -1110,7 +1114,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, ptr = rtte.fc0.sto * PAGE_SIZE; shadow_sgt: rtte.fc0.p |= *dat_protection; - rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake, mmrange); if (rc) return rc; /* fallthrough */ @@ -1123,7 +1127,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, ste.val = ptr; goto shadow_pgt; } - rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val, + mmrange); if (rc) return rc; if (ste.i) @@ -1142,7 +1147,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, ptr = ste.fc0.pto * (PAGE_SIZE / 2); shadow_pgt: ste.fc0.p |= *dat_protection; - rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake, mmrange); if (rc) return rc; } @@ -1172,6 +1177,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, unsigned long pgt; int dat_protection, fake; int rc; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&sg->mm->mmap_sem); /* @@ -1184,7 +1190,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, - &fake); + &fake, &mmrange); vaddr.addr = saddr; if (fake) { @@ -1192,7 +1198,8 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, goto shadow_page; } if (!rc) - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, + &pte.val, &mmrange); if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; if (!rc && pte.z) @@ -1200,7 +1207,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, shadow_page: pte.p |= dat_protection; if (!rc) - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + rc = gmap_shadow_page(sg, saddr, __pte(pte.val), &mmrange); ipte_unlock(vcpu); up_read(&sg->mm->mmap_sem); return rc; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 93faeca52284..17ba3c402f9d 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -421,6 +421,7 @@ static inline int do_exception(struct pt_regs *regs, int access) unsigned long address; unsigned int flags; int fault; + DEFINE_RANGE_LOCK_FULL(mmrange); tsk = current; /* @@ -507,7 +508,7 @@ static inline int do_exception(struct pt_regs *regs, int access) * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); /* No reason to continue if interrupted by SIGKILL. 
*/ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 2c55a2b9d6c6..b12a44813022 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -621,6 +621,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr; int rc; bool unlocked; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&gmap->mm->mmap_sem); @@ -632,7 +633,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, goto out_up; } if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, - &unlocked)) { + &unlocked, &mmrange)) { rc = -EFAULT; goto out_up; } @@ -835,13 +836,15 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, * @gaddr: virtual address in the guest address space * @vmaddr: address in the host process address space * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @mmrange: address space range locking * * Returns 0 if the caller can retry __gmap_translate (might fail again), * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing * up or connecting the gmap page table. */ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, - unsigned long vmaddr, int prot) + unsigned long vmaddr, int prot, + struct range_lock *mmrange) { struct mm_struct *mm = gmap->mm; unsigned int fault_flags; @@ -849,7 +852,8 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked, + mmrange)) return -EFAULT; if (unlocked) /* lost mmap_sem, caller has to retry __gmap_translate */ @@ -874,6 +878,7 @@ static void gmap_pte_op_end(spinlock_t *ptl) * @len: size of area * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * @bits: pgste notification bits to set + * @mmrange: address space range locking * * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EFAULT if gaddr is invalid (or mapping for shadows is missing). @@ -881,7 +886,8 @@ static void gmap_pte_op_end(spinlock_t *ptl) * Called with sg->mm->mmap_sem in read. 
*/ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) + unsigned long len, int prot, unsigned long bits, + struct range_lock *mmrange) { unsigned long vmaddr; spinlock_t *ptl; @@ -900,7 +906,8 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, vmaddr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot, + mmrange); if (rc) return rc; continue; @@ -929,13 +936,14 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot) { int rc; + DEFINE_RANGE_LOCK_FULL(mmrange); if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) return -EINVAL; if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT, &mmrange); up_read(&gmap->mm->mmap_sem); return rc; } @@ -947,6 +955,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @val: pointer to the unsigned long value to return + * @mmrange: address space range locking * * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT * if reading using the virtual address failed. -EINVAL if called on a gmap @@ -954,7 +963,8 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * * Called with gmap->mm->mmap_sem in read. */ -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val, + struct range_lock *mmrange) { unsigned long address, vmaddr; spinlock_t *ptl; @@ -986,7 +996,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) rc = vmaddr; break; } - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ, mmrange); if (rc) break; } @@ -1026,12 +1036,14 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, * @raddr: rmap address in the shadow gmap * @paddr: address in the parent guest address space * @len: length of the memory area to protect + * @mmrange: address space range locking * * Returns 0 if successfully protected and the rmap was created, -ENOMEM * if out of memory and -EFAULT if paddr is invalid. 
*/ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, - unsigned long paddr, unsigned long len) + unsigned long paddr, unsigned long len, + struct range_lock *mmrange) { struct gmap *parent; struct gmap_rmap *rmap; @@ -1069,7 +1081,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, radix_tree_preload_end(); if (rc) { kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ, mmrange); if (rc) return rc; continue; @@ -1473,6 +1485,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, struct gmap *sg, *new; unsigned long limit; int rc; + DEFINE_RANGE_LOCK_FULL(mmrange); BUG_ON(gmap_is_shadow(parent)); spin_lock(&parent->shadow_lock); @@ -1526,7 +1539,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, PGSTE_VSIE_BIT); + PROT_READ, PGSTE_VSIE_BIT, &mmrange); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); new->initialized = true; @@ -1546,6 +1559,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * @saddr: faulting address in the shadow gmap * @r2t: parent gmap address of the region 2 table to get shadowed * @fake: r2t references contiguous guest memory block, not a r2t + * @mmrange: address space range locking * * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap @@ -1559,7 +1573,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * Called with sg->mm->mmap_sem in read. */ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake) + int fake, struct range_lock *mmrange) { unsigned long raddr, origin, offset, len; unsigned long *s_r2t, *table; @@ -1608,7 +1622,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, origin = r2t & _REGION_ENTRY_ORIGIN; offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, mmrange); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); @@ -1635,6 +1649,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * @saddr: faulting address in the shadow gmap * @r3t: parent gmap address of the region 3 table to get shadowed * @fake: r3t references contiguous guest memory block, not a r3t + * @mmrange: address space range locking * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1643,7 +1658,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * Called with sg->mm->mmap_sem in read. 
*/ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake) + int fake, struct range_lock *mmrange) { unsigned long raddr, origin, offset, len; unsigned long *s_r3t, *table; @@ -1691,7 +1706,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, origin = r3t & _REGION_ENTRY_ORIGIN; offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, mmrange); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); @@ -1718,6 +1733,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * @saddr: faulting address in the shadow gmap * @sgt: parent gmap address of the segment table to get shadowed * @fake: sgt references contiguous guest memory block, not a sgt + * @mmrange: address space range locking * * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1726,7 +1742,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * Called with sg->mm->mmap_sem in read. */ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake) + int fake, struct range_lock *mmrange) { unsigned long raddr, origin, offset, len; unsigned long *s_sgt, *table; @@ -1775,7 +1791,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, origin = sgt & _REGION_ENTRY_ORIGIN; offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, mmrange); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); @@ -1842,6 +1858,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * @saddr: faulting address in the shadow gmap * @pgt: parent gmap address of the page table to get shadowed * @fake: pgt references contiguous guest memory block, not a pgtable + * @mmrange: address space range locking * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory, @@ -1850,7 +1867,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * Called with gmap->mm->mmap_sem in read */ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake) + int fake, struct range_lock *mmrange) { unsigned long raddr, origin; unsigned long *s_pgt, *table; @@ -1894,7 +1911,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* Make pgt read-only in parent gmap page table (not the pgste) */ raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; - rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, mmrange); spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); @@ -1921,6 +1938,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @pte: pte in parent gmap address space to get shadowed + * @mmrange: address space range locking * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1928,7 +1946,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * * Called with 
sg->mm->mmap_sem in read. */ -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte, + struct range_lock *mmrange) { struct gmap *parent; struct gmap_rmap *rmap; @@ -1982,7 +2001,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) radix_tree_preload_end(); if (!rc) break; - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot, mmrange); if (rc) break; } @@ -2117,7 +2136,8 @@ static inline void thp_split_mm(struct mm_struct *mm) * - This must be called after THP was enabled */ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + struct range_lock *mmrange) { unsigned long addr; @@ -2133,12 +2153,13 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, return 0; } -static inline void zap_zero_pages(struct mm_struct *mm) +static inline void zap_zero_pages(struct mm_struct *mm, + struct range_lock *mmrange) { struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(0, TASK_SIZE, &walk, mmrange); } /* @@ -2147,6 +2168,7 @@ static inline void zap_zero_pages(struct mm_struct *mm) int s390_enable_sie(void) { struct mm_struct *mm = current->mm; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Do we have pgstes? if yes, we are done */ if (mm_has_pgste(mm)) @@ -2158,7 +2180,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - zap_zero_pages(mm); + zap_zero_pages(mm, &mmrange); up_write(&mm->mmap_sem); return 0; } @@ -2182,6 +2204,7 @@ int s390_enable_skey(void) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); down_write(&mm->mmap_sem); if (mm_use_skey(mm)) @@ -2190,7 +2213,7 @@ int s390_enable_skey(void) mm->context.use_skey = 1; for (vma = mm->mmap; vma; vma = vma->vm_next) { if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { + MADV_UNMERGEABLE, &vma->vm_flags, &mmrange)) { mm->context.use_skey = 0; rc = -ENOMEM; goto out_up; @@ -2199,7 +2222,7 @@ int s390_enable_skey(void) mm->def_flags &= ~VM_MERGEABLE; walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(0, TASK_SIZE, &walk, &mmrange); out_up: up_write(&mm->mmap_sem); @@ -2220,10 +2243,11 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, void s390_reset_cmma(struct mm_struct *mm) { struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; + DEFINE_RANGE_LOCK_FULL(mmrange); down_write(&mm->mmap_sem); walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(0, TASK_SIZE, &walk, &mmrange); up_write(&mm->mmap_sem); } EXPORT_SYMBOL_GPL(s390_reset_cmma); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index b85fad4f0874..07a8637ad142 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -51,6 +51,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long flags = 0; siginfo_t info; int fault; + DEFINE_RANGE_LOCK_FULL(mmrange); info.si_code = SEGV_MAPERR; @@ -111,7 +112,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 6fd1bf7481c7..d36106564728 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -405,6 +405,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); tsk = current; mm = tsk->mm; @@ -488,7 +489,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) if (mm_fault_error(regs, error_code, address, fault)) diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index a8103a84b4ac..ebb2406dbe7c 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -176,6 +176,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, int from_user = !(regs->psr & PSR_PS); int fault, code; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); if (text_fault) address = regs->pc; @@ -242,7 +243,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; @@ -389,6 +390,7 @@ static void force_user_fault(unsigned long address, int write) struct mm_struct *mm = tsk->mm; unsigned int flags = FAULT_FLAG_USER; int code; + DEFINE_RANGE_LOCK_FULL(mmrange); code = SEGV_MAPERR; @@ -412,7 +414,7 @@ static void force_user_fault(unsigned long address, int write) if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(vma, address, flags)) { + switch (handle_mm_fault(vma, address, flags, &mmrange)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 41363f46797b..e0a3c36b0fa1 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -287,6 +287,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) int si_code, fault_code, fault; unsigned long address, mm_rss; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); fault_code = get_thread_fault_code(); @@ -438,7 +439,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) goto bad_area; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto exit_exception; diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index f58fa06a2214..09f053eb146f 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -275,6 +275,7 @@ static int handle_page_fault(struct pt_regs *regs, int is_kernel_mode; pgd_t *pgd; unsigned int flags; + DEFINE_RANGE_LOCK_FULL(mmrange); /* on TILE, protection faults are always writes */ if (!is_page_fault) @@ -437,7 +438,7 @@ static int handle_page_fault(struct pt_regs *regs, * make sure we exit gracefully rather than endlessly
redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index fca34b2177e2..98cc3e36385a 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -23,7 +23,8 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { } static inline void arch_bprm_mm_init(struct mm_struct *mm, diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index b2b02df9896e..e632a14e896e 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -33,6 +33,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, pte_t *pte; int err = -EFAULT; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); *code_out = SEGV_MAPERR; @@ -74,7 +75,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, do { int fault; - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index bbefcc46a45e..dd35b6191798 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -168,7 +168,8 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) } static int __do_pf(struct mm_struct *mm, unsigned long addr, unsigned int fsr, - unsigned int flags, struct task_struct *tsk) + unsigned int flags, struct task_struct *tsk, + struct range_lock *mmrange) { struct vm_area_struct *vma; int fault; @@ -194,7 +195,7 @@ static int __do_pf(struct mm_struct *mm, unsigned long addr, unsigned int fsr, * If for any reason at all we couldn't handle the fault, make * sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(vma, addr & PAGE_MASK, flags); + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, mmrange); return fault; check_stack: @@ -210,6 +211,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct mm_struct *mm; int fault, sig, code; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); tsk = current; mm = tsk->mm; @@ -251,7 +253,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) #endif } - fault = __do_pf(mm, addr, fsr, flags, tsk); + fault = __do_pf(mm, addr, fsr, flags, tsk, &mmrange); /* If we need to retry but a fatal signal is pending, handle the * signal first. 
We do not need to release the mmap_sem because diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b8b556dbb12..2e0bdf6a3aaf 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -155,6 +155,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) struct vm_area_struct *vma; unsigned long text_start; int ret = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -192,7 +193,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - do_munmap(mm, text_start, image->size, NULL); + do_munmap(mm, text_start, image->size, NULL, &mmrange); } else { current->mm->context.vdso = (void __user *)text_start; current->mm->context.vdso_image = image; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c931b88982a0..31fb02ed4770 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -263,7 +263,8 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, } static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { /* * mpx_notify_unmap() goes and reads a rarely-hot @@ -283,7 +284,7 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, * consistently wrong. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX))) - mpx_notify_unmap(mm, vma, start, end); + mpx_notify_unmap(mm, vma, start, end, mmrange); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 61eb4b63c5ec..c26099224a17 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -73,7 +73,8 @@ static inline void mpx_mm_init(struct mm_struct *mm) mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + struct range_lock *mmrange); unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, unsigned long flags); @@ -95,7 +96,8 @@ static inline void mpx_mm_init(struct mm_struct *mm) } static inline void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 800de815519c..93f1b8d4c88e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1244,6 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; u32 pkey; + DEFINE_RANGE_LOCK_FULL(mmrange); tsk = current; mm = tsk->mm; @@ -1423,7 +1424,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * fault, so we read the pkey beforehand. 
*/ pkey = vma_pkey(vma); - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); major |= fault & VM_FAULT_MAJOR; /* diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e500949bae24..51c3e1f7e6be 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -47,6 +47,7 @@ static unsigned long mpx_mmap(unsigned long len) { struct mm_struct *mm = current->mm; unsigned long addr, populate; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Only bounds table can be allocated here */ if (len != mpx_bt_size_bytes(mm)) @@ -54,7 +55,8 @@ static unsigned long mpx_mmap(unsigned long len) down_write(&mm->mmap_sem); addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL, + &mmrange); up_write(&mm->mmap_sem); if (populate) mm_populate(addr, populate); @@ -427,13 +429,15 @@ int mpx_handle_bd_fault(void) * A thin wrapper around get_user_pages(). Returns 0 if the * fault was resolved or -errno if not. */ -static int mpx_resolve_fault(long __user *addr, int write) +static int mpx_resolve_fault(long __user *addr, int write, + struct range_lock *mmrange) { long gup_ret; int nr_pages = 1; gup_ret = get_user_pages((unsigned long)addr, nr_pages, - write ? FOLL_WRITE : 0, NULL, NULL); + write ? FOLL_WRITE : 0, NULL, NULL, + mmrange); /* * get_user_pages() returns number of pages gotten. * 0 means we failed to fault in and get anything, @@ -500,7 +504,8 @@ static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, */ static int get_bt_addr(struct mm_struct *mm, long __user *bd_entry_ptr, - unsigned long *bt_addr_result) + unsigned long *bt_addr_result, + struct range_lock *mmrange) { int ret; int valid_bit; @@ -519,7 +524,8 @@ static int get_bt_addr(struct mm_struct *mm, if (!ret) break; if (ret == -EFAULT) - ret = mpx_resolve_fault(bd_entry_ptr, need_write); + ret = mpx_resolve_fault(bd_entry_ptr, + need_write, mmrange); /* * If we could not resolve the fault, consider it * userspace's fault and error out. @@ -730,7 +736,8 @@ static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, } static int unmap_entire_bt(struct mm_struct *mm, - long __user *bd_entry, unsigned long bt_addr) + long __user *bd_entry, unsigned long bt_addr, + struct range_lock *mmrange) { unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; unsigned long uninitialized_var(actual_old_val); @@ -747,7 +754,7 @@ static int unmap_entire_bt(struct mm_struct *mm, if (!ret) break; if (ret == -EFAULT) - ret = mpx_resolve_fault(bd_entry, need_write); + ret = mpx_resolve_fault(bd_entry, need_write, mmrange); /* * If we could not resolve the fault, consider it * userspace's fault and error out. @@ -780,11 +787,12 @@ static int unmap_entire_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. 
*/ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL, mmrange); } static int try_unmap_single_bt(struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { struct vm_area_struct *next; struct vm_area_struct *prev; @@ -835,7 +843,7 @@ static int try_unmap_single_bt(struct mm_struct *mm, } bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); - ret = get_bt_addr(mm, bde_vaddr, &bt_addr); + ret = get_bt_addr(mm, bde_vaddr, &bt_addr, mmrange); /* * No bounds table there, so nothing to unmap. */ @@ -853,12 +861,13 @@ static int try_unmap_single_bt(struct mm_struct *mm, */ if ((start == bta_start_vaddr) && (end == bta_end_vaddr)) - return unmap_entire_bt(mm, bde_vaddr, bt_addr); + return unmap_entire_bt(mm, bde_vaddr, bt_addr, mmrange); return zap_bt_entries_mapping(mm, bt_addr, start, end); } static int mpx_unmap_tables(struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { unsigned long one_unmap_start; trace_mpx_unmap_search(start, end); @@ -876,7 +885,8 @@ static int mpx_unmap_tables(struct mm_struct *mm, */ if (one_unmap_end > next_unmap_start) one_unmap_end = next_unmap_start; - ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end); + ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end, + mmrange); if (ret) return ret; @@ -894,7 +904,8 @@ static int mpx_unmap_tables(struct mm_struct *mm, * necessary, and the 'vma' is the first vma in this range (start -> end). */ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { int ret; @@ -920,7 +931,7 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, vma = vma->vm_next; } while (vma && vma->vm_start < end); - ret = mpx_unmap_tables(mm, start, end); + ret = mpx_unmap_tables(mm, start, end, mmrange); if (ret) force_sig(SIGSEGV, current); } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 8b9b6f44bb06..6f8e3e7cccb5 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -44,6 +44,7 @@ void do_page_fault(struct pt_regs *regs) int is_write, is_exec; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + DEFINE_RANGE_LOCK_FULL(mmrange); info.si_code = SEGV_MAPERR; @@ -108,7 +109,7 @@ void do_page_fault(struct pt_regs *regs) * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, &mmrange); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index e4bb435e614b..bd464a599341 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -691,6 +691,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) unsigned int flags = 0; unsigned pinned = 0; int r; + DEFINE_RANGE_LOCK_FULL(mmrange); if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) flags |= FOLL_WRITE; @@ -721,7 +722,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) list_add(&guptask.list, &gtt->guptasks); spin_unlock(&gtt->guptasklock); - r = get_user_pages(userptr, num_pages, flags, p, NULL); + r = get_user_pages(userptr, num_pages, flags, p, NULL, &mmrange); spin_lock(&gtt->guptasklock); list_del(&guptask.list); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 382a77a1097e..881bcc7d663a 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -512,6 +512,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) ret = -EFAULT; if (mmget_not_zero(mm)) { + DEFINE_RANGE_LOCK_FULL(mmrange); + down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote @@ -519,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, - pvec + pinned, NULL, NULL); + pvec + pinned, NULL, NULL, &mmrange); if (ret < 0) break; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index a0a839bc39bf..9fc3a4f86945 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -545,6 +545,8 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) struct radeon_ttm_tt *gtt = (void *)ttm; unsigned pinned = 0, nents; int r; + // XXX: this is wrong!! + DEFINE_RANGE_LOCK_FULL(mmrange); int write = !(gtt->userflags & RADEON_GEM_USERPTR_READONLY); enum dma_data_direction direction = write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE; @@ -569,7 +571,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) struct page **pages = ttm->pages + pinned; r = get_user_pages(userptr, num_pages, write ?
FOLL_WRITE : 0, - pages, NULL); + pages, NULL, &mmrange); if (r < 0) goto release_pages; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 9a4e899d94b3..fd9601ed5b84 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -96,6 +96,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct scatterlist *sg, *sg_list_start; int need_release = 0; unsigned int gup_flags = FOLL_WRITE; + DEFINE_RANGE_LOCK_FULL(mmrange); if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -194,7 +195,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, vma_list); + gup_flags, page_list, vma_list, &mmrange); if (ret < 0) goto out; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 2aadf5813a40..0572953260e8 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -632,6 +632,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, int j, k, ret = 0, start_idx, npages = 0, page_shift; unsigned int flags = 0; phys_addr_t p = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); if (access_mask == 0) return -EINVAL; @@ -683,7 +684,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL, NULL); + flags, local_page_list, NULL, NULL, &mmrange); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index ce83ba9a12ef..6bcb4f9f9b30 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -53,7 +53,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, * Call with current->mm->mmap_sem held. 
*/ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, - struct page **p) + struct page **p, struct range_lock *mmrange) { unsigned long lock_limit; size_t got; @@ -70,7 +70,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, ret = get_user_pages(start_page + got * PAGE_SIZE, num_pages - got, FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + p + got, NULL, mmrange); if (ret < 0) goto bail_release; } @@ -134,10 +134,11 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, struct page **p) { int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); down_write(&current->mm->mmap_sem); - ret = __qib_get_user_pages(start_page, num_pages, p); + ret = __qib_get_user_pages(start_page, num_pages, p, &mmrange); up_write(&current->mm->mmap_sem); diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 4381c0a9a873..5f36c6d2e21b 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -113,6 +113,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int flags; dma_addr_t pa; unsigned int gup_flags; + DEFINE_RANGE_LOCK_FULL(mmrange); if (!can_do_mlock()) return -EPERM; @@ -146,7 +147,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + gup_flags, page_list, NULL, &mmrange); if (ret < 0) goto out; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 1d0b53a04a08..15a7103fd84c 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -512,6 +512,7 @@ static void do_fault(struct work_struct *work) unsigned int flags = 0; struct mm_struct *mm; u64 address; + DEFINE_RANGE_LOCK_FULL(mmrange); mm = fault->state->mm; address = fault->address; @@ -523,7 +524,7 @@ static void do_fault(struct work_struct *work) flags |= FAULT_FLAG_REMOTE; down_read(&mm->mmap_sem); - vma = find_extend_vma(mm, address); + vma = find_extend_vma(mm, address, &mmrange); if (!vma || address < vma->vm_start) /* failed to get a vma in the right range */ goto out; @@ -532,7 +533,7 @@ static void do_fault(struct work_struct *work) if (access_error(vma, fault)) goto out; - ret = handle_mm_fault(vma, address, flags); + ret = handle_mm_fault(vma, address, flags, &mmrange); out: up_read(&mm->mmap_sem); diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 35a408d0ae4f..6a74386ee83f 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -585,6 +585,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; int head, tail, handled = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Clear PPR bit before reading head/tail registers, to * ensure that we get a new interrupt if needed. */ @@ -643,7 +644,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto bad_req; down_read(&svm->mm->mmap_sem); - vma = find_extend_vma(svm->mm, address); + vma = find_extend_vma(svm->mm, address, &mmrange); if (!vma || address < vma->vm_start) goto invalid; @@ -651,7 +652,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto invalid; ret = handle_mm_fault(vma, address, - req->wr_req ? FAULT_FLAG_WRITE : 0); + req->wr_req ?
FAULT_FLAG_WRITE : 0, &mmrange); if (ret & VM_FAULT_ERROR) goto invalid; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f412429cf5ba..64a4cd62eeb3 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -152,7 +152,8 @@ static void videobuf_dma_init(struct videobuf_dmabuf *dma) } static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, - int direction, unsigned long data, unsigned long size) + int direction, unsigned long data, unsigned long size, + struct range_lock *mmrange) { unsigned long first, last; int err, rw = 0; @@ -186,7 +187,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, data, size, dma->nr_pages); err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + flags, dma->pages, NULL, mmrange); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; @@ -201,9 +202,10 @@ static int videobuf_dma_init_user(struct videobuf_dmabuf *dma, int direction, unsigned long data, unsigned long size) { int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&current->mm->mmap_sem); - ret = videobuf_dma_init_user_locked(dma, direction, data, size); + ret = videobuf_dma_init_user_locked(dma, direction, data, size, &mmrange); up_read(&current->mm->mmap_sem); return ret; @@ -539,9 +541,14 @@ static int __videobuf_iolock(struct videobuf_queue *q, we take current->mm->mmap_sem there, to prevent locking inversion, so don't take it here */ + /* XXX: can we use a local mmrange here? */ + DEFINE_RANGE_LOCK_FULL(mmrange); + err = videobuf_dma_init_user_locked(&mem->dma, - DMA_FROM_DEVICE, - vb->baddr, vb->bsize); + DMA_FROM_DEVICE, + vb->baddr, + vb->bsize, + &mmrange); if (0 != err) return err; } @@ -555,6 +562,7 @@ static int __videobuf_iolock(struct videobuf_queue *q, * building for PAE. Compiler doesn't like direct casting * of a 32 bit ptr to 64 bit integer. */ + bus = (dma_addr_t)(unsigned long)fbuf->base + vb->boff; pages = PAGE_ALIGN(vb->size) >> PAGE_SHIFT; err = videobuf_dma_init_overlay(&mem->dma, DMA_FROM_DEVICE, diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index c824329f7012..6ecac843e5f3 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -1332,6 +1332,7 @@ int __scif_pin_pages(void *addr, size_t len, int *out_prot, int prot = *out_prot; int ulimit = 0; struct mm_struct *mm = NULL; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Unsupported flags */ if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT)) @@ -1400,7 +1401,7 @@ int __scif_pin_pages(void *addr, size_t len, int *out_prot, nr_pages, (prot & SCIF_PROT_WRITE) ? FOLL_WRITE : 0, pinned_pages->pages, - NULL); + NULL, &mmrange); up_write(&mm->mmap_sem); if (nr_pages != pinned_pages->nr_pages) { if (try_upgrade) { diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 93be82fc338a..b35d60bb2197 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -189,7 +189,8 @@ static void get_clear_fault_map(struct gru_state *gru, */ static int non_atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, int write, - unsigned long *paddr, int *pageshift) + unsigned long *paddr, int *pageshift, + struct range_lock *mmrange) { struct page *page; @@ -198,7 +199,8 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages(vaddr, 1, write ?
FOLL_WRITE : 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, + &page, NULL, mmrange) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); @@ -263,7 +265,8 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, } static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr, - int write, int atomic, unsigned long *gpa, int *pageshift) + int write, int atomic, unsigned long *gpa, int *pageshift, + struct range_lock *mmrange) { struct mm_struct *mm = gts->ts_mm; struct vm_area_struct *vma; @@ -283,7 +286,8 @@ static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr, if (ret) { if (atomic) goto upm; - if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps)) + if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, + &ps, mmrange)) goto inval; } if (is_gru_paddr(paddr)) @@ -324,7 +328,8 @@ static void gru_preload_tlb(struct gru_state *gru, unsigned long fault_vaddr, int asid, int write, unsigned char tlb_preload_count, struct gru_tlb_fault_handle *tfh, - struct gru_control_block_extended *cbe) + struct gru_control_block_extended *cbe, + struct range_lock *mmrange) { unsigned long vaddr = 0, gpa; int ret, pageshift; @@ -342,7 +347,7 @@ static void gru_preload_tlb(struct gru_state *gru, vaddr = min(vaddr, fault_vaddr + tlb_preload_count * PAGE_SIZE); while (vaddr > fault_vaddr) { - ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift); + ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift, mmrange); if (ret || tfh_write_only(tfh, gpa, GAA_RAM, vaddr, asid, write, GRU_PAGESIZE(pageshift))) return; @@ -368,7 +373,8 @@ static void gru_preload_tlb(struct gru_state *gru, static int gru_try_dropin(struct gru_state *gru, struct gru_thread_state *gts, struct gru_tlb_fault_handle *tfh, - struct gru_instruction_bits *cbk) + struct gru_instruction_bits *cbk, + struct range_lock *mmrange) { struct gru_control_block_extended *cbe = NULL; unsigned char tlb_preload_count = gts->ts_tlb_preload_count; @@ -423,7 +429,7 @@ static int gru_try_dropin(struct gru_state *gru, if (atomic_read(&gts->ts_gms->ms_range_active)) goto failactive; - ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift); + ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift, mmrange); if (ret == VTOP_INVALID) goto failinval; if (ret == VTOP_RETRY) @@ -438,7 +444,8 @@ static int gru_try_dropin(struct gru_state *gru, } if (unlikely(cbe) && pageshift == PAGE_SHIFT) { - gru_preload_tlb(gru, gts, atomic, vaddr, asid, write, tlb_preload_count, tfh, cbe); + gru_preload_tlb(gru, gts, atomic, vaddr, asid, write, + tlb_preload_count, tfh, cbe, mmrange); gru_flush_cache_cbe(cbe); } @@ -587,10 +594,13 @@ static irqreturn_t gru_intr(int chiplet, int blade) * If it fails, retry the fault in user context.
*/ gts->ustats.fmm_tlbmiss++; - if (!gts->ts_force_cch_reload && - down_read_trylock(&gts->ts_mm->mmap_sem)) { - gru_try_dropin(gru, gts, tfh, NULL); - up_read(&gts->ts_mm->mmap_sem); + if (!gts->ts_force_cch_reload) { + DEFINE_RANGE_LOCK_FULL(mmrange); + + if (down_read_trylock(&gts->ts_mm->mmap_sem)) { + gru_try_dropin(gru, gts, tfh, NULL, &mmrange); + up_read(&gts->ts_mm->mmap_sem); + } } else { tfh_user_polling_mode(tfh); STAT(intr_mm_lock_failed); @@ -625,7 +635,7 @@ irqreturn_t gru_intr_mblade(int irq, void *dev_id) static int gru_user_dropin(struct gru_thread_state *gts, struct gru_tlb_fault_handle *tfh, - void *cb) + void *cb, struct range_lock *mmrange) { struct gru_mm_struct *gms = gts->ts_gms; int ret; @@ -635,7 +645,7 @@ static int gru_user_dropin(struct gru_thread_state *gts, wait_event(gms->ms_wait_queue, atomic_read(&gms->ms_range_active) == 0); prefetchw(tfh); /* Helps on hdw, required for emulator */ - ret = gru_try_dropin(gts->ts_gru, gts, tfh, cb); + ret = gru_try_dropin(gts->ts_gru, gts, tfh, cb, mmrange); if (ret <= 0) return ret; STAT(call_os_wait_queue); @@ -653,6 +663,7 @@ int gru_handle_user_call_os(unsigned long cb) struct gru_thread_state *gts; void *cbk; int ucbnum, cbrnum, ret = -EINVAL; + DEFINE_RANGE_LOCK_FULL(mmrange); STAT(call_os); @@ -685,7 +696,7 @@ int gru_handle_user_call_os(unsigned long cb) tfh = get_tfh_by_index(gts->ts_gru, cbrnum); cbk = get_gseg_base_address_cb(gts->ts_gru->gs_gru_base_vaddr, gts->ts_ctxnum, ucbnum); - ret = gru_user_dropin(gts, tfh, cbk); + ret = gru_user_dropin(gts, tfh, cbk, &mmrange); } exit: gru_unlock_gts(gts); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e30e29ae4819..1b3b103da637 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -345,13 +345,14 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, page); } else { unsigned int flags = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); if (prot & IOMMU_WRITE) flags |= FOLL_WRITE; down_read(&mm->mmap_sem); ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, - NULL, NULL); + NULL, NULL, &mmrange); up_read(&mm->mmap_sem); } diff --git a/fs/aio.c b/fs/aio.c index a062d75109cb..31774b75c372 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -457,6 +457,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) int nr_pages; int i; struct file *file; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ @@ -519,7 +520,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused, NULL); + MAP_SHARED, 0, &unused, NULL, &mmrange); up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2f492dfcabde..9aea808d55d7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -180,6 +180,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, int ei_index = 0; const struct cred *cred = current_cred(); struct vm_area_struct *vma; + DEFINE_RANGE_LOCK_FULL(mmrange); /* * In some cases (e.g. Hyper-Threading), we want to avoid L1 @@ -300,7 +301,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack.
*/ - vma = find_extend_vma(current->mm, bprm->p); + vma = find_extend_vma(current->mm, bprm->p, &mmrange); if (!vma) return -EFAULT; diff --git a/fs/exec.c b/fs/exec.c index e7b69e14649f..e46752874b47 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -197,6 +197,11 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page; int ret; unsigned int gup_flags = FOLL_FORCE; + /* + * No concurrency for the bprm->mm yet -- this is exec path; + * but gup needs an mmrange. + */ + DEFINE_RANGE_LOCK_FULL(mmrange); #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -214,7 +219,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL, NULL); + &page, NULL, NULL, &mmrange); if (ret <= 0) return NULL; @@ -615,7 +620,8 @@ EXPORT_SYMBOL(copy_strings_kernel); * 4) Free up any cleared pgd range. * 5) Shrink the vma to cover only the new range. */ -static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; unsigned long old_start = vma->vm_start; @@ -637,7 +643,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL, + mmrange)) return -ENOMEM; /* @@ -671,7 +678,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * Shrink the vma to just the new range. Always succeeds. */ - vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL, mmrange); return 0; } @@ -694,6 +701,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; + DEFINE_RANGE_LOCK_FULL(mmrange); #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -749,14 +757,14 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, - vm_flags); + vm_flags, &mmrange); if (ret) goto out_unlock; BUG_ON(prev != vma); /* Move stack pages down in memory. 
*/ if (stack_shift) { - ret = shift_arg_pages(vma, stack_shift); + ret = shift_arg_pages(vma, stack_shift, &mmrange); if (ret) goto out_unlock; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d697c8ab0a14..791f9f93643c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -16,6 +16,7 @@ #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> +#include <linux/range_lock.h> struct ctl_table_header; struct mempolicy; @@ -263,6 +264,8 @@ struct proc_maps_private { #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif + /* mmap_sem is held across all stages of seqfile */ + struct range_lock mmrange; } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b66fc8de7d34..7c0a79a937b5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -174,6 +174,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos) if (!mm || !mmget_not_zero(mm)) return NULL; + range_lock_init_full(&priv->mmrange); down_read(&mm->mmap_sem); hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); @@ -514,7 +515,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { struct mem_size_stats *mss = walk->private; @@ -605,7 +606,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, #endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { struct vm_area_struct *vma = walk->vma; pte_t *pte; @@ -797,7 +798,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) #endif /* mmap_sem is held in m_start */ - walk_page_vma(vma, &smaps_walk); + walk_page_vma(vma, &smaps_walk, &priv->mmrange); if (vma->vm_flags & VM_LOCKED) mss->pss_locked += mss->pss; @@ -1012,7 +1013,8 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, #endif static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + struct range_lock *mmrange) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; @@ -1103,6 +1105,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, struct mmu_gather tlb; int itype; int rv; + DEFINE_RANGE_LOCK_FULL(mmrange); memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) @@ -1166,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } mmu_notifier_invalidate_range_start(mm, 0, -1); } - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); + walk_page_range(0, mm->highest_vm_end, &clear_refs_walk, + &mmrange); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); tlb_finish_mmu(&tlb, 0, -1); @@ -1223,7 +1227,7 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { struct pagemapread *pm = walk->private; unsigned long addr = start; @@ -1301,7 +1305,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, } static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock 
*mmrange) { struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; @@ -1467,6 +1471,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + DEFINE_RANGE_LOCK_FULL(tmprange); + struct range_lock *mmrange = &tmprange; if (!mm || !mmget_not_zero(mm)) goto out; @@ -1523,7 +1529,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (end < start_vaddr || end > end_vaddr) end = end_vaddr; down_read(&mm->mmap_sem); - ret = walk_page_range(start_vaddr, end, &pagemap_walk); + ret = walk_page_range(start_vaddr, end, &pagemap_walk, + mmrange); up_read(&mm->mmap_sem); start_vaddr = end; @@ -1671,7 +1678,8 @@ static struct page *can_gather_numa_stats_pmd(pmd_t pmd, #endif static int gather_pte_stats(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + struct range_lock *mmrange) { struct numa_maps *md = walk->private; struct vm_area_struct *vma = walk->vma; @@ -1740,6 +1748,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, */ static int show_numa_map(struct seq_file *m, void *v, int is_pid) { + struct proc_maps_private *priv = m->private; struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; @@ -1785,7 +1794,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_puts(m, " huge"); /* mmap_sem is held by m_start */ - walk_page_vma(vma, &walk); + walk_page_vma(vma, &walk, &priv->mmrange); if (!md->pages) goto out; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a45f0af22a60..3768955c10bc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -350,6 +350,11 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, unsigned long pos_start, pos_end, pos; unsigned long zeropage_pfn = my_zero_pfn(0); size_t len = 0; + /* + * No concurrency on this mm yet -- this is a vmcore path, + * but do_munmap() needs an mmrange. + */ + DEFINE_RANGE_LOCK_FULL(mmrange); pos_start = pfn; pos_end = pfn + (size >> PAGE_SHIFT); @@ -388,7 +393,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, } return 0; fail: - do_munmap(vma->vm_mm, from, len, NULL); + do_munmap(vma->vm_mm, from, len, NULL, &mmrange); return -EAGAIN; } @@ -411,6 +416,11 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) size_t size = vma->vm_end - vma->vm_start; u64 start, end, len, tsz; struct vmcore *m; + /* + * No concurrency on this mm yet -- this is a vmcore path, + * but do_munmap() needs an mmrange.
+ */ + DEFINE_RANGE_LOCK_FULL(mmrange); start = (u64)vma->vm_pgoff << PAGE_SHIFT; end = start + size; @@ -481,7 +491,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) return 0; fail: - do_munmap(vma->vm_mm, vma->vm_start, len, NULL); + do_munmap(vma->vm_mm, vma->vm_start, len, NULL, &mmrange); return -EAGAIN; } #else diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 87a13a7c8270..e3089865fd52 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -851,6 +851,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + DEFINE_RANGE_LOCK_FULL(mmrange); WRITE_ONCE(ctx->released, true); @@ -880,7 +881,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, &mmrange); if (prev) vma = prev; else @@ -1276,6 +1277,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + DEFINE_RANGE_LOCK_FULL(mmrange); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1413,18 +1415,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + &mmrange); if (prev) { vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(mm, vma, start, 1, &mmrange); if (ret) break; } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(mm, vma, end, 0, &mmrange); if (ret) break; } @@ -1471,6 +1474,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + DEFINE_RANGE_LOCK_FULL(mmrange); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1571,18 +1575,18 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, &mmrange); if (prev) { vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(mm, vma, start, 1, &mmrange); if (ret) break; } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(mm, vma, end, 0, &mmrange); if (ret) break; } diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index 8ac4e68a12f0..2115deceded1 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -19,7 +19,8 @@ static inline void arch_exit_mmap(struct mm_struct *mm) static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 325017ad9311..da004594d831 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -295,7 +295,7 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, unsigned long end, - hmm_pfn_t *pfns); + hmm_pfn_t *pfns, struct range_lock *mmrange); bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range 
*range); @@ -323,7 +323,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long end, hmm_pfn_t *pfns, bool write, - bool block); + bool block, struct range_lock *mmrange); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 44368b19b27e..19667b75f73c 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -20,7 +20,8 @@ struct mem_cgroup; #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags); + unsigned long end, int advice, unsigned long *vm_flags, + struct range_lock *mmrange); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); @@ -78,7 +79,8 @@ static inline void ksm_exit(struct mm_struct *mm) #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) + unsigned long end, int advice, unsigned long *vm_flags, + struct range_lock *mmrange) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0c6fe904bc97..fa08e348a295 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -272,7 +272,7 @@ int migrate_vma(const struct migrate_vma_ops *ops, unsigned long end, unsigned long *src, unsigned long *dst, - void *private); + void *private, struct range_lock *mmrange); #else static inline int migrate_vma(const struct migrate_vma_ops *ops, struct vm_area_struct *vma, @@ -280,7 +280,7 @@ static inline int migrate_vma(const struct migrate_vma_ops *ops, unsigned long end, unsigned long *src, unsigned long *dst, - void *private) + void *private, struct range_lock *mmrange) { return -EINVAL; } diff --git a/include/linux/mm.h b/include/linux/mm.h index bcf2509d448d..fc4e7fdc3e76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1295,11 +1295,12 @@ struct mm_walk { int (*pud_entry)(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk); + unsigned long next, struct mm_walk *walk, + struct range_lock *mmrange); int (*pte_entry)(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); + struct mm_walk *walk, struct range_lock *mmrange); int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); @@ -1311,8 +1312,9 @@ struct mm_walk { }; int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); + struct mm_walk *walk, struct range_lock *mmrange); +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk, + struct range_lock *mmrange); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -1337,17 +1339,18 @@ int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags); + unsigned int flags, struct range_lock *mmrange); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, - bool *unlocked); + bool *unlocked, struct range_lock *mmrange); void unmap_mapping_pages(struct address_space *mapping, pgoff_t 
start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline int handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) + unsigned long address, unsigned int flags, + struct range_lock *mmrange) { /* should never happen if there's no MMU */ BUG(); @@ -1355,7 +1358,8 @@ static inline int handle_mm_fault(struct vm_area_struct *vma, } static inline int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, - unsigned int fault_flags, bool *unlocked) + unsigned int fault_flags, bool *unlocked, + struct range_lock *mmrange) { /* should never happen if there's no MMU */ BUG(); @@ -1383,24 +1387,28 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + struct vm_area_struct **vmas, int *locked, + struct range_lock *mmrange); long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, struct range_lock *mmrange); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); + unsigned int gup_flags, struct page **pages, + int *locked, struct range_lock *mmrange); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); #ifdef CONFIG_FS_DAX long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, + struct range_lock *mmrange); #else static inline long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + struct range_lock *mmrange) { - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); + return get_user_pages(start, nr_pages, gup_flags, pages, vmas, mmrange); } #endif /* CONFIG_FS_DAX */ @@ -1505,7 +1513,8 @@ extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); + unsigned long end, unsigned long newflags, + struct range_lock *mmrange); /* * doesn't attempt to fault and will return short. 
@@ -2149,28 +2158,30 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); + struct vm_area_struct *expand, struct range_lock *mmrange); static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, + struct range_lock *mmrange) { - return __vma_adjust(vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vma, start, end, pgoff, insert, NULL, mmrange); } extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, + struct range_lock *mmrange); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); + unsigned long addr, int new_below, struct range_lock *mmrange); extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); + unsigned long addr, int new_below, struct range_lock *mmrange); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); + bool *need_rmap_locks, struct range_lock *mmrange); extern void exit_mmap(struct mm_struct *); static inline int check_data_rlimit(unsigned long rlim, @@ -2212,21 +2223,22 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); + struct list_head *uf, struct range_lock *mmrange); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf); + struct list_head *uf, struct range_lock *mmrange); extern int do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf); + struct list_head *uf, struct range_lock *mmrange); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf) + struct list_head *uf, struct range_lock *mmrange) { - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, + uf, mmrange); } #ifdef CONFIG_MMU @@ -2405,7 +2417,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif -struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr, + struct range_lock *); int 
remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0a294e950df8..79eb735e7c95 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -34,6 +34,7 @@ struct mm_struct; struct inode; struct notifier_block; struct page; +struct range_lock; #define UPROBE_HANDLER_REMOVE 1 #define UPROBE_HANDLER_MASK 1 @@ -115,17 +116,20 @@ struct uprobes_state { struct xol_area *xol_area; }; -extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, + unsigned long vaddr, struct range_lock *mmrange); +extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, + unsigned long vaddr, struct range_lock *mmrange); extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t, struct range_lock *mmrange); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -extern int uprobe_mmap(struct vm_area_struct *vma); +extern int uprobe_mmap(struct vm_area_struct *vma, struct range_lock *mmrange);; extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); @@ -169,7 +173,8 @@ static inline void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { } -static inline int uprobe_mmap(struct vm_area_struct *vma) +static inline int uprobe_mmap(struct vm_area_struct *vma, + struct range_lock *mmrange) { return 0; } diff --git a/ipc/shm.c b/ipc/shm.c index 4643865e9171..6c29c791c7f2 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1293,6 +1293,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, struct path path; fmode_t f_mode; unsigned long populate = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); err = -EINVAL; if (shmid < 0) @@ -1411,7 +1412,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, goto invalid; } - addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL); + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL, + &mmrange); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) @@ -1487,6 +1489,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) struct file *file; struct vm_area_struct *next; #endif + DEFINE_RANGE_LOCK_FULL(mmrange); if (addr & ~PAGE_MASK) return retval; @@ -1537,7 +1540,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, + NULL, &mmrange); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next 
@@ -1564,7 +1568,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, + NULL, &mmrange); vma = next; } @@ -1573,7 +1578,8 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) * given */ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, + NULL, &mmrange); retval = 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce6848e46e94..60e12b39182c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, - uprobe_opcode_t opcode) + uprobe_opcode_t opcode, struct range_lock *mmrange) { struct page *old_page, *new_page; struct vm_area_struct *vma; @@ -309,7 +309,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL, + mmrange); if (ret <= 0) return ret; @@ -349,9 +350,10 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, struct range_lock *mmrange) { - return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN, mmrange); } /** @@ -364,9 +366,12 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned * Return 0 (success) or a negative errno. 
*/ int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, struct range_lock *mmrange) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); + return uprobe_write_opcode(mm, vaddr, + *(uprobe_opcode_t *)&auprobe->insn, + mmrange); } static struct uprobe *get_uprobe(struct uprobe *uprobe) @@ -650,7 +655,8 @@ static bool filter_chain(struct uprobe *uprobe, static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) + struct vm_area_struct *vma, unsigned long vaddr, + struct range_lock *mmrange) { bool first_uprobe; int ret; @@ -667,7 +673,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, mm, vaddr, mmrange); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -677,10 +683,11 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, + unsigned long vaddr, struct range_lock *mmrange) { set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, mm, vaddr, mmrange); } static inline bool uprobe_is_active(struct uprobe *uprobe) @@ -794,6 +801,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) bool is_register = !!new; struct map_info *info; int err = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, @@ -824,11 +832,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) /* consult only the "caller", new consumer. */ if (consumer_filter(new, UPROBE_FILTER_REGISTER, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); + err = install_breakpoint(uprobe, mm, vma, + info->vaddr, &mmrange); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, UPROBE_FILTER_UNREGISTER, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, mm, + info->vaddr, &mmrange); } unlock: @@ -972,6 +982,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { struct vm_area_struct *vma; int err = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { @@ -988,7 +999,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, mm, vaddr, &mmrange); } up_read(&mm->mmap_sem); @@ -1063,7 +1074,7 @@ static void build_probe_list(struct inode *inode, * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. 
*/ -int uprobe_mmap(struct vm_area_struct *vma) +int uprobe_mmap(struct vm_area_struct *vma, struct range_lock *mmrange) { struct list_head tmp_list; struct uprobe *uprobe, *u; @@ -1087,7 +1098,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma->vm_mm, vma, vaddr, mmrange); } put_uprobe(uprobe); } @@ -1698,7 +1709,8 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr, + struct range_lock *mmrange) { struct page *page; uprobe_opcode_t opcode; @@ -1718,7 +1730,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL, NULL); + NULL, NULL, mmrange); if (result < 0) return result; @@ -1734,6 +1746,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); @@ -1746,7 +1759,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } if (!uprobe) - *is_swbp = is_trap_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr, &mmrange); } else { *is_swbp = -EFAULT; } diff --git a/kernel/futex.c b/kernel/futex.c index 1f450e092c74..09a0d86f80a0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -725,10 +725,11 @@ static int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE, NULL); + FAULT_FLAG_WRITE, NULL, &mmrange); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c64dca6e27c2..d3dccd80c6ee 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -39,6 +39,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, int ret = 0; int err; int locked; + DEFINE_RANGE_LOCK_FULL(mmrange); if (nr_frames == 0) return 0; @@ -71,7 +72,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, vec->got_ref = true; vec->is_pfns = false; ret = get_user_pages_locked(start, nr_frames, - gup_flags, (struct page **)(vec->ptrs), &locked); + gup_flags, (struct page **)(vec->ptrs), &locked, + &mmrange); goto out; } diff --git a/mm/gup.c b/mm/gup.c index 1b46e6e74881..01983a7b3750 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -478,7 +478,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, * If it is, *@nonblocking will be set to 0 and -EBUSY returned. 
*/ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, int *nonblocking) + unsigned long address, unsigned int *flags, int *nonblocking, + struct range_lock *mmrange) { unsigned int fault_flags = 0; int ret; @@ -499,7 +500,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_TRIED; } - ret = handle_mm_fault(vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags, mmrange); if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); @@ -592,6 +593,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @nonblocking: whether waiting for disk IO or mmap_sem contention + * @mmrange: mm address space range locking * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -638,7 +640,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) + struct vm_area_struct **vmas, int *nonblocking, + struct range_lock *mmrange) { long i = 0; unsigned int page_mask; @@ -664,7 +667,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { - vma = find_extend_vma(mm, start); + vma = find_extend_vma(mm, start, mmrange); if (!vma && in_gate_area(mm, start)) { int ret; ret = get_gate_page(mm, start & PAGE_MASK, @@ -697,7 +700,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!page) { int ret; ret = faultin_page(tsk, vma, start, &foll_flags, - nonblocking); + nonblocking, mmrange); switch (ret) { case 0: goto retry; @@ -796,7 +799,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, - bool *unlocked) + bool *unlocked, struct range_lock *mmrange) { struct vm_area_struct *vma; int ret, major = 0; @@ -805,14 +808,14 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, fault_flags |= FAULT_FLAG_ALLOW_RETRY; retry: - vma = find_extend_vma(mm, address); + vma = find_extend_vma(mm, address, mmrange); if (!vma || address < vma->vm_start) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; - ret = handle_mm_fault(vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags, mmrange); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); @@ -849,7 +852,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct page **pages, struct vm_area_struct **vmas, int *locked, - unsigned int flags) + unsigned int flags, + struct range_lock *mmrange) { long ret, pages_done; bool lock_dropped; @@ -868,7 +872,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, lock_dropped = false; for (;;) { ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, - vmas, locked); + vmas, locked, mmrange); if (!locked) /* VM_FAULT_RETRY couldn't trigger, bypass */ return ret; @@ -908,7 +912,7 @@ static __always_inline 
long __get_user_pages_locked(struct task_struct *tsk, lock_dropped = true; down_read(&mm->mmap_sem); ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, - pages, NULL, NULL); + pages, NULL, NULL, mmrange); if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) @@ -956,11 +960,11 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - int *locked) + int *locked, struct range_lock *mmrange) { return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, - gup_flags | FOLL_TOUCH); + gup_flags | FOLL_TOUCH, mmrange); } EXPORT_SYMBOL(get_user_pages_locked); @@ -985,10 +989,11 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct mm_struct *mm = current->mm; int locked = 1; long ret; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + &locked, gup_flags | FOLL_TOUCH, &mmrange); if (locked) up_read(&mm->mmap_sem); return ret; @@ -1054,11 +1059,13 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + struct vm_area_struct **vmas, int *locked, + struct range_lock *mmrange) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); + gup_flags | FOLL_TOUCH | FOLL_REMOTE, + mmrange); } EXPORT_SYMBOL(get_user_pages_remote); @@ -1071,11 +1078,11 @@ EXPORT_SYMBOL(get_user_pages_remote); */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, struct range_lock *mmrange) { return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, vmas, NULL, - gup_flags | FOLL_TOUCH); + gup_flags | FOLL_TOUCH, mmrange); } EXPORT_SYMBOL(get_user_pages); @@ -1094,7 +1101,8 @@ EXPORT_SYMBOL(get_user_pages); */ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) + struct vm_area_struct **vmas_arg, + struct range_lock *mmrange) { struct vm_area_struct **vmas = vmas_arg; struct vm_area_struct *vma_prev = NULL; @@ -1110,7 +1118,7 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, return -ENOMEM; } - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas, mmrange); for (i = 0; i < rc; i++) { struct vm_area_struct *vma = vmas[i]; @@ -1149,6 +1157,7 @@ EXPORT_SYMBOL(get_user_pages_longterm); * @start: start address * @end: end address * @nonblocking: + * @mmrange: mm address space range locking * * This takes care of mlocking the pages too if VM_LOCKED is set. * @@ -1163,7 +1172,8 @@ EXPORT_SYMBOL(get_user_pages_longterm); * released. If it's released, *@nonblocking will be set to 0. 
*/ long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) + unsigned long start, unsigned long end, int *nonblocking, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; @@ -1198,7 +1208,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. */ return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + NULL, NULL, nonblocking, mmrange); } /* @@ -1215,6 +1225,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -1247,7 +1258,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ - ret = populate_vma_page_range(vma, nstart, nend, &locked); + ret = populate_vma_page_range(vma, nstart, nend, &locked, &mmrange); if (ret < 0) { if (ignore_errors) { ret = 0; @@ -1282,10 +1293,11 @@ struct page *get_dump_page(unsigned long addr) { struct vm_area_struct *vma; struct page *page; + DEFINE_RANGE_LOCK_FULL(mmrange); if (__get_user_pages(current, current->mm, addr, 1, FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) + NULL, &mmrange) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; diff --git a/mm/hmm.c b/mm/hmm.c index 320545b98ff5..b14e6869689e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -245,7 +245,8 @@ struct hmm_vma_walk { static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, - hmm_pfn_t *pfn) + hmm_pfn_t *pfn, + struct range_lock *mmrange) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; @@ -254,7 +255,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk, flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; flags |= hmm_vma_walk->write ? 
FAULT_FLAG_WRITE : 0; - r = handle_mm_fault(vma, addr, flags); + r = handle_mm_fault(vma, addr, flags, mmrange); if (r & VM_FAULT_RETRY) return -EBUSY; if (r & VM_FAULT_ERROR) { @@ -298,7 +299,9 @@ static void hmm_pfns_clear(hmm_pfn_t *pfns, static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) + { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -312,7 +315,7 @@ static int hmm_vma_walk_hole(unsigned long addr, if (hmm_vma_walk->fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, &pfns[i], mmrange); if (ret != -EAGAIN) return ret; } @@ -323,7 +326,8 @@ static int hmm_vma_walk_hole(unsigned long addr, static int hmm_vma_walk_clear(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -337,7 +341,7 @@ static int hmm_vma_walk_clear(unsigned long addr, if (hmm_vma_walk->fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, &pfns[i], mmrange); if (ret != -EAGAIN) return ret; } @@ -349,7 +353,8 @@ static int hmm_vma_walk_clear(unsigned long addr, static int hmm_vma_walk_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -366,7 +371,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, again: if (pmd_none(*pmdp)) - return hmm_vma_walk_hole(start, end, walk); + return hmm_vma_walk_hole(start, end, walk, mmrange); if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) return hmm_pfns_bad(start, end, walk); @@ -389,10 +394,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; if (pmd_protnone(pmd)) - return hmm_vma_walk_clear(start, end, walk); + return hmm_vma_walk_clear(start, end, walk, mmrange); if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_clear(start, end, walk); + return hmm_vma_walk_clear(start, end, walk, mmrange); pfn = pmd_pfn(pmd) + pte_index(addr); flag |= pmd_write(pmd) ? 
HMM_PFN_WRITE : 0; @@ -464,7 +469,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, fault: pte_unmap(ptep); /* Fault all pages in range */ - return hmm_vma_walk_clear(start, end, walk); + return hmm_vma_walk_clear(start, end, walk, mmrange); } pte_unmap(ptep - 1); @@ -495,7 +500,8 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, unsigned long end, - hmm_pfn_t *pfns) + hmm_pfn_t *pfns, + struct range_lock *mmrange) { struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; @@ -541,7 +547,7 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; - walk_page_range(start, end, &mm_walk); + walk_page_range(start, end, &mm_walk, mmrange); return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); @@ -664,7 +670,8 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long end, hmm_pfn_t *pfns, bool write, - bool block) + bool block, + struct range_lock *mmrange) { struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; @@ -717,7 +724,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(start, end, &mm_walk, mmrange); start = hmm_vma_walk.last; } while (ret == -EAGAIN); diff --git a/mm/internal.h b/mm/internal.h index 62d8c34e63d5..abf1de31e524 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -289,7 +289,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, #ifdef CONFIG_MMU extern long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking); + unsigned long start, unsigned long end, int *nonblocking, + struct range_lock *mmrange); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) diff --git a/mm/ksm.c b/mm/ksm.c index 293721f5da70..66c350cd9799 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -448,7 +448,8 @@ static inline bool ksm_test_exit(struct mm_struct *mm) * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, + struct range_lock *mmrange) { struct page *page; int ret = 0; @@ -461,7 +462,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + mmrange); else ret = VM_FAULT_WRITE; put_page(page); @@ -516,6 +518,7 @@ static void break_cow(struct rmap_item *rmap_item) struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; + DEFINE_RANGE_LOCK_FULL(mmrange); /* * It is not an accident that whenever we want to break COW @@ -526,7 +529,7 @@ static void break_cow(struct rmap_item *rmap_item) down_read(&mm->mmap_sem); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr); + break_ksm(vma, addr, &mmrange); up_read(&mm->mmap_sem); } @@ -807,7 +810,8 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, * in cmp_and_merge_page on one of the rmap_items we would be removing. 
*/ static int unmerge_ksm_pages(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { unsigned long addr; int err = 0; @@ -818,7 +822,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, if (signal_pending(current)) err = -ERESTARTSYS; else - err = break_ksm(vma, addr); + err = break_ksm(vma, addr, mmrange); } return err; } @@ -922,6 +926,7 @@ static int unmerge_and_remove_all_rmap_items(void) struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, @@ -937,8 +942,8 @@ static int unmerge_and_remove_all_rmap_items(void) break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; - err = unmerge_ksm_pages(vma, - vma->vm_start, vma->vm_end); + err = unmerge_ksm_pages(vma, vma->vm_start, + vma->vm_end, &mmrange); if (err) goto error; } @@ -2350,7 +2355,8 @@ static int ksm_scan_thread(void *nothing) } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) + unsigned long end, int advice, unsigned long *vm_flags, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; int err; @@ -2384,7 +2390,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; /* just ignore the advice */ if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end); + err = unmerge_ksm_pages(vma, start, end, mmrange); if (err) return err; } diff --git a/mm/madvise.c b/mm/madvise.c index 4d3c922ea1a1..eaec6bfc2b08 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -54,7 +54,8 @@ static int madvise_need_mmap_write(int behavior) */ static long madvise_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) + unsigned long start, unsigned long end, int behavior, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; int error = 0; @@ -104,7 +105,8 @@ static long madvise_behavior(struct vm_area_struct *vma, break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); + error = ksm_madvise(vma, start, end, behavior, + &new_flags, mmrange); if (error) { /* * madvise() returns EAGAIN if kernel resources, such as @@ -138,7 +140,7 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, mmrange); if (*prev) { vma = *prev; goto success; @@ -151,7 +153,7 @@ static long madvise_behavior(struct vm_area_struct *vma, error = -ENOMEM; goto out; } - error = __split_vma(mm, vma, start, 1); + error = __split_vma(mm, vma, start, 1, mmrange); if (error) { /* * madvise() returns EAGAIN if kernel resources, such as @@ -168,7 +170,7 @@ static long madvise_behavior(struct vm_area_struct *vma, error = -ENOMEM; goto out; } - error = __split_vma(mm, vma, end, 0); + error = __split_vma(mm, vma, end, 0, mmrange); if (error) { /* * madvise() returns EAGAIN if kernel resources, such as @@ -191,7 +193,8 @@ static long madvise_behavior(struct vm_area_struct *vma, #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + struct range_lock *mmrange) { pte_t 
*orig_pte; struct vm_area_struct *vma = walk->private; @@ -226,7 +229,8 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, } static void force_swapin_readahead(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { struct mm_walk walk = { .mm = vma->vm_mm, @@ -234,7 +238,7 @@ static void force_swapin_readahead(struct vm_area_struct *vma, .private = vma, }; - walk_page_range(start, end, &walk); + walk_page_range(start, end, &walk, mmrange); lru_add_drain(); /* Push any new pages onto the LRU now */ } @@ -272,14 +276,15 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, */ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct range_lock *mmrange) { struct file *file = vma->vm_file; *prev = vma; #ifdef CONFIG_SWAP if (!file) { - force_swapin_readahead(vma, start, end); + force_swapin_readahead(vma, start, end, mmrange); return 0; } @@ -308,7 +313,8 @@ static long madvise_willneed(struct vm_area_struct *vma, } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + struct range_lock *mmrange) { struct mmu_gather *tlb = walk->private; @@ -442,7 +448,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, static void madvise_free_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) + unsigned long addr, unsigned long end, + struct range_lock *mmrange) { struct mm_walk free_walk = { .pmd_entry = madvise_free_pte_range, @@ -451,12 +458,14 @@ static void madvise_free_page_range(struct mmu_gather *tlb, }; tlb_start_vma(tlb, vma); - walk_page_range(addr, end, &free_walk); + walk_page_range(addr, end, &free_walk, mmrange); tlb_end_vma(tlb, vma); } static int madvise_free_single_vma(struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) + unsigned long start_addr, + unsigned long end_addr, + struct range_lock *mmrange) { unsigned long start, end; struct mm_struct *mm = vma->vm_mm; @@ -478,7 +487,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - madvise_free_page_range(&tlb, vma, start, end); + madvise_free_page_range(&tlb, vma, start, end, mmrange); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); @@ -514,7 +523,7 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + int behavior, struct range_lock *mmrange) { *prev = vma; if (!can_madv_dontneed_vma(vma)) @@ -562,7 +571,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (behavior == MADV_DONTNEED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(vma, start, end, mmrange); else return -EINVAL; } @@ -676,18 +685,21 @@ static int madvise_inject_error(int behavior, static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) + unsigned long start, unsigned long end, int behavior, + struct range_lock *mmrange) { switch 
(behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); + return madvise_willneed(vma, prev, start, end, mmrange); case MADV_FREE: case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, behavior, + mmrange); default: - return madvise_behavior(vma, prev, start, end, behavior); + return madvise_behavior(vma, prev, start, end, behavior, + mmrange); } } @@ -797,7 +809,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; struct blk_plug plug; - + DEFINE_RANGE_LOCK_FULL(mmrange); if (!madvise_behavior_valid(behavior)) return error; @@ -860,7 +872,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); + error = madvise_vma(vma, &prev, start, tmp, behavior, &mmrange); if (error) goto out; start = tmp; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 88c1af32fd67..a7ac5a14b22e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4881,7 +4881,8 @@ static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { struct vm_area_struct *vma = walk->vma; pte_t *pte; @@ -4915,6 +4916,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) { unsigned long precharge; + DEFINE_RANGE_LOCK_FULL(mmrange); struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4922,7 +4924,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) }; down_read(&mm->mmap_sem); walk_page_range(0, mm->highest_vm_end, - &mem_cgroup_count_precharge_walk); + &mem_cgroup_count_precharge_walk, &mmrange); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -5081,7 +5083,8 @@ static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { int ret = 0; struct vm_area_struct *vma = walk->vma; @@ -5197,6 +5200,7 @@ static void mem_cgroup_move_charge(void) .pmd_entry = mem_cgroup_move_charge_pte_range, .mm = mc.mm, }; + DEFINE_RANGE_LOCK_FULL(mmrange); lru_add_drain_all(); /* @@ -5223,7 +5227,8 @@ static void mem_cgroup_move_charge(void) * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk, + &mmrange); up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); diff --git a/mm/memory.c b/mm/memory.c index 5ec6433d6a5c..b3561a052939 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4021,7 +4021,7 @@ static int handle_pte_fault(struct vm_fault *vmf) * return value. See filemap_fault() and __lock_page_or_retry(). 
*/ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) + unsigned int flags, struct range_lock *mmrange) { struct vm_fault vmf = { .vma = vma, @@ -4029,6 +4029,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), + .lockrange = mmrange, }; unsigned int dirty = flags & FAULT_FLAG_WRITE; struct mm_struct *mm = vma->vm_mm; @@ -4110,7 +4111,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * return value. See filemap_fault() and __lock_page_or_retry(). */ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) + unsigned int flags, struct range_lock *mmrange) { int ret; @@ -4137,7 +4138,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else - ret = __handle_mm_fault(vma, address, flags); + ret = __handle_mm_fault(vma, address, flags, mmrange); if (flags & FAULT_FLAG_USER) { mem_cgroup_oom_disable(); @@ -4425,6 +4426,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; void *old_buf = buf; int write = gup_flags & FOLL_WRITE; + DEFINE_RANGE_LOCK_FULL(mmrange); down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ @@ -4434,7 +4436,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma, NULL); + gup_flags, &page, &vma, NULL, &mmrange); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a8b7d59002e8..001dc176abc1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -467,7 +467,8 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, * and move them to the pagelist if they do. */ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + struct range_lock *mmrange) { struct vm_area_struct *vma = walk->vma; struct page *page; @@ -618,7 +619,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + struct list_head *pagelist, struct range_lock *mmrange) { struct queue_pages qp = { .pagelist = pagelist, @@ -634,7 +635,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .private = &qp, }; - return walk_page_range(start, end, &queue_pages_walk); + return walk_page_range(start, end, &queue_pages_walk, mmrange); } /* @@ -675,7 +676,8 @@ static int vma_replace_policy(struct vm_area_struct *vma, /* Step 2: apply policy to a range and do splits. 
*/ static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) + unsigned long end, struct mempolicy *new_pol, + struct range_lock *mmrange) { struct vm_area_struct *next; struct vm_area_struct *prev; @@ -705,7 +707,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, mmrange); if (prev) { vma = prev; next = vma->vm_next; @@ -715,12 +717,12 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto replace; } if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); + err = split_vma(vma->vm_mm, vma, vmstart, 1, mmrange); if (err) goto out; } if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); + err = split_vma(vma->vm_mm, vma, vmend, 0, mmrange); if (err) goto out; } @@ -797,12 +799,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(unsigned long addr) +static int lookup_node(unsigned long addr, struct range_lock *mmrange) { struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL, mmrange); if (err >= 0) { err = page_to_nid(p); put_page(p); @@ -818,6 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; + DEFINE_RANGE_LOCK_FULL(mmrange); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) @@ -857,7 +860,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(addr); + err = lookup_node(addr, &mmrange); if (err < 0) goto out; *policy = err; @@ -943,7 +946,7 @@ struct page *alloc_new_node_page(struct page *page, unsigned long node) * Returns error or the number of pages not migrated. 
*/ static int migrate_to_node(struct mm_struct *mm, int source, int dest, - int flags) + int flags, struct range_lock *mmrange) { nodemask_t nmask; LIST_HEAD(pagelist); @@ -959,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); + flags | MPOL_MF_DISCONTIG_OK, &pagelist, mmrange); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, @@ -983,6 +986,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int busy = 0; int err; nodemask_t tmp; + DEFINE_RANGE_LOCK_FULL(mmrange); err = migrate_prep(); if (err) @@ -1063,7 +1067,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, break; node_clear(source, tmp); - err = migrate_to_node(mm, source, dest, flags); + err = migrate_to_node(mm, source, dest, flags, &mmrange); if (err > 0) busy += err; if (err < 0) @@ -1143,6 +1147,7 @@ static long do_mbind(unsigned long start, unsigned long len, unsigned long end; int err; LIST_HEAD(pagelist); + DEFINE_RANGE_LOCK_FULL(mmrange); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; @@ -1204,9 +1209,9 @@ static long do_mbind(unsigned long start, unsigned long len, goto mpol_out; err = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); + flags | MPOL_MF_INVERT, &pagelist, &mmrange); if (!err) - err = mbind_range(mm, start, end, new); + err = mbind_range(mm, start, end, new, &mmrange); if (!err) { int nr_failed = 0; diff --git a/mm/migrate.c b/mm/migrate.c index 5d0dc7b85f90..7a6afc34dd54 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2105,7 +2105,8 @@ struct migrate_vma { static int migrate_vma_collect_hole(unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { struct migrate_vma *migrate = walk->private; unsigned long addr; @@ -2138,7 +2139,8 @@ static int migrate_vma_collect_skip(unsigned long start, static int migrate_vma_collect_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { struct migrate_vma *migrate = walk->private; struct vm_area_struct *vma = walk->vma; @@ -2149,7 +2151,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, again: if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, walk); + return migrate_vma_collect_hole(start, end, walk, mmrange); if (pmd_trans_huge(*pmdp)) { struct page *page; @@ -2183,7 +2185,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, walk); if (pmd_none(*pmdp)) return migrate_vma_collect_hole(start, end, - walk); + walk, mmrange); } } @@ -2309,7 +2311,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * valid page, it updates the src array and takes a reference on the page, in * order to pin the page until we lock it and unmap it. 
*/ -static void migrate_vma_collect(struct migrate_vma *migrate) +static void migrate_vma_collect(struct migrate_vma *migrate, + struct range_lock *mmrange) { struct mm_walk mm_walk; @@ -2325,7 +2328,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mmu_notifier_invalidate_range_start(mm_walk.mm, migrate->start, migrate->end); - walk_page_range(migrate->start, migrate->end, &mm_walk); + walk_page_range(migrate->start, migrate->end, &mm_walk, mmrange); mmu_notifier_invalidate_range_end(mm_walk.mm, migrate->start, migrate->end); @@ -2891,7 +2894,8 @@ int migrate_vma(const struct migrate_vma_ops *ops, unsigned long end, unsigned long *src, unsigned long *dst, - void *private) + void *private, + struct range_lock *mmrange) { struct migrate_vma migrate; @@ -2917,7 +2921,7 @@ int migrate_vma(const struct migrate_vma_ops *ops, migrate.vma = vma; /* Collect, and try to unmap source pages */ - migrate_vma_collect(&migrate); + migrate_vma_collect(&migrate, mmrange); if (!migrate.cpages) return 0; diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..a6875a34aac0 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -85,7 +85,9 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) } static int __mincore_unmapped_range(unsigned long addr, unsigned long end, - struct vm_area_struct *vma, unsigned char *vec) + struct vm_area_struct *vma, + unsigned char *vec, + struct range_lock *mmrange) { unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; @@ -104,15 +106,17 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end, } static int mincore_unmapped_range(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, + struct range_lock *mmrange) { walk->private += __mincore_unmapped_range(addr, end, - walk->vma, walk->private); + walk->vma, + walk->private, mmrange); return 0; } static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { spinlock_t *ptl; struct vm_area_struct *vma = walk->vma; @@ -128,7 +132,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } if (pmd_trans_unstable(pmd)) { - __mincore_unmapped_range(addr, end, vma, vec); + __mincore_unmapped_range(addr, end, vma, vec, mmrange); goto out; } @@ -138,7 +142,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pte_none(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, - vma, vec); + vma, vec, mmrange); else if (pte_present(pte)) *vec = 1; else { /* pte is a swap entry */ @@ -174,7 +178,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * all the arguments, we hold the mmap semaphore: we should * just return the amount of info we're asked for. 
*/ -static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) +static long do_mincore(unsigned long addr, unsigned long pages, + unsigned char *vec, struct range_lock *mmrange) { struct vm_area_struct *vma; unsigned long end; @@ -191,7 +196,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v return -ENOMEM; mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); - err = walk_page_range(addr, end, &mincore_walk); + err = walk_page_range(addr, end, &mincore_walk, mmrange); if (err < 0) return err; return (end - addr) >> PAGE_SHIFT; @@ -227,6 +232,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, long retval; unsigned long pages; unsigned char *tmp; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Check the start address: needs to be page-aligned.. */ if (start & ~PAGE_MASK) @@ -254,7 +260,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, * the temporary buffer size. */ down_read(&current->mm->mmap_sem); - retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp, &mmrange); up_read(&current->mm->mmap_sem); if (retval <= 0) diff --git a/mm/mlock.c b/mm/mlock.c index 74e5a6547c3d..3f6bd953e8b0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -517,7 +517,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, vm_flags_t newflags, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; @@ -534,20 +535,20 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
|| capable(CAP_IPC_LOCK)) - error = apply_vma_lock_flags(start, len, flags); + error = apply_vma_lock_flags(start, len, flags, &mmrange); up_write(&current->mm->mmap_sem); if (error) @@ -733,13 +736,14 @@ SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (down_write_killable(&current->mm->mmap_sem)) return -EINTR; - ret = apply_vma_lock_flags(start, len, 0); + ret = apply_vma_lock_flags(start, len, 0, &mmrange); up_write(&current->mm->mmap_sem); return ret; @@ -755,7 +759,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. */ -static int apply_mlockall_flags(int flags) +static int apply_mlockall_flags(int flags, struct range_lock *mmrange) { struct vm_area_struct * vma, * prev = NULL; vm_flags_t to_add = 0; @@ -784,7 +788,8 @@ static int apply_mlockall_flags(int flags) newflags |= to_add; /* Ignore errors */ - mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags, + mmrange); cond_resched(); } out: @@ -795,6 +800,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) return -EINVAL; @@ -811,7 +817,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) - ret = apply_mlockall_flags(flags); + ret = apply_mlockall_flags(flags, &mmrange); up_write(&current->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); @@ -822,10 +828,11 @@ SYSCALL_DEFINE1(mlockall, int, flags) SYSCALL_DEFINE0(munlockall) { int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); if (down_write_killable(&current->mm->mmap_sem)) return -EINTR; - ret = apply_mlockall_flags(0); + ret = apply_mlockall_flags(0, &mmrange); up_write(&current->mm->mmap_sem); return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index 4bb038e7984b..f61d49cb791e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -177,7 +177,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf, + struct range_lock *mmrange); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -188,6 +189,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long min_brk; bool populate; LIST_HEAD(uf); + DEFINE_RANGE_LOCK_FULL(mmrange); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -225,7 +227,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Always allow shrinking brk. */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) + if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf, &mmrange)) goto set_brk; goto out; } @@ -236,7 +238,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip.
*/ - if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) + if (do_brk(oldbrk, newbrk-oldbrk, &uf, &mmrange) < 0) goto out; set_brk: @@ -680,7 +682,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm, */ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) + struct vm_area_struct *expand, struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; @@ -887,10 +889,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, i_mmap_unlock_write(mapping); if (root) { - uprobe_mmap(vma); + uprobe_mmap(vma, mmrange); if (adjust_next) - uprobe_mmap(next); + uprobe_mmap(next, mmrange); } if (remove_next) { @@ -960,7 +962,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } if (insert && file) - uprobe_mmap(insert); + uprobe_mmap(insert, mmrange); validate_mm(mm); @@ -1101,7 +1103,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct range_lock *mmrange) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1149,10 +1152,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, /* cases 1, 6 */ err = __vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL, - prev); + prev, mmrange); } else /* cases 2, 5, 7 */ err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); + end, prev->vm_pgoff, NULL, + prev, mmrange); if (err) return NULL; khugepaged_enter_vma_merge(prev, vm_flags); @@ -1169,10 +1173,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); + addr, prev->vm_pgoff, NULL, + next, mmrange); else { /* cases 3, 8 */ err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); + next->vm_pgoff - pglen, NULL, + next, mmrange); /* * In case 3 area is already equal to next and * this is a noop, but in case 8 "area" has @@ -1322,7 +1328,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf) + struct list_head *uf, struct range_lock *mmrange) { struct mm_struct *mm = current->mm; int pkey = 0; @@ -1491,7 +1497,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, mmrange); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1628,7 +1634,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + struct list_head *uf, struct range_lock *mmrange) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1654,7 +1660,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ while (find_vma_links(mm, addr, addr + len, 
&prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) + if (do_munmap(mm, addr, len, uf, mmrange)) return -ENOMEM; } @@ -1672,7 +1678,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, mmrange); if (vma) goto out; @@ -1756,7 +1762,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } if (file) - uprobe_mmap(vma); + uprobe_mmap(vma, mmrange); /* * New (or expanded) vma always get soft dirty status. @@ -2435,7 +2441,8 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) } struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +find_extend_vma(struct mm_struct *mm, unsigned long addr, + struct range_lock *mmrange) { struct vm_area_struct *vma, *prev; @@ -2446,7 +2453,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) - populate_vma_page_range(prev, addr, prev->vm_end, NULL); + populate_vma_page_range(prev, addr, prev->vm_end, + NULL, mmrange); return prev; } #else @@ -2456,7 +2464,8 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) } struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +find_extend_vma(struct mm_struct *mm, unsigned long addr, + struct range_lock *mmrange) { struct vm_area_struct *vma; unsigned long start; @@ -2473,7 +2482,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) - populate_vma_page_range(vma, addr, start, NULL); + populate_vma_page_range(vma, addr, start, NULL, mmrange); return vma; } #endif @@ -2561,7 +2570,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, * has already been checked or doesn't make sense to fail. */ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) + unsigned long addr, int new_below, struct range_lock *mmrange) { struct vm_area_struct *new; int err; @@ -2604,9 +2613,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new_below) err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + - ((addr - new->vm_start) >> PAGE_SHIFT), new); + ((addr - new->vm_start) >> PAGE_SHIFT), new, + mmrange); else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new, + mmrange); /* Success. */ if (!err) @@ -2630,12 +2641,12 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * either for the first part or the tail. 
*/ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) + unsigned long addr, int new_below, struct range_lock *mmrange) { if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; - return __split_vma(mm, vma, addr, new_below); + return __split_vma(mm, vma, addr, new_below, mmrange); } /* Munmap is split into 2 main parts -- this part which finds @@ -2644,7 +2655,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * Jeremy Fitzhardinge <jeremy@xxxxxxxx> */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf) + struct list_head *uf, struct range_lock *mmrange) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2686,7 +2697,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) return -ENOMEM; - error = __split_vma(mm, vma, start, 0); + error = __split_vma(mm, vma, start, 0, mmrange); if (error) return error; prev = vma; @@ -2695,7 +2706,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* Does it split the last one? */ last = find_vma(mm, end); if (last && end > last->vm_start) { - int error = __split_vma(mm, last, end, 1); + int error = __split_vma(mm, last, end, 1, mmrange); if (error) return error; } @@ -2736,7 +2747,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, detach_vmas_to_be_unmapped(mm, vma, prev, end); unmap_region(mm, vma, prev, start, end); - arch_unmap(mm, vma, start, end); + arch_unmap(mm, vma, start, end, mmrange); /* Fix up all other VM information */ remove_vma_list(mm, vma); @@ -2749,11 +2760,12 @@ int vm_munmap(unsigned long start, size_t len) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); + DEFINE_RANGE_LOCK_FULL(mmrange); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len, &uf); + ret = do_munmap(mm, start, len, &uf, &mmrange); up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); return ret; @@ -2779,6 +2791,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; + DEFINE_RANGE_LOCK_FULL(mmrange); pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", current->comm, current->pid); @@ -2855,7 +2868,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap_pgoff(vma->vm_file, start, size, - prot, flags, pgoff, &populate, NULL); + prot, flags, pgoff, &populate, NULL, &mmrange); fput(file); out: up_write(&mm->mmap_sem); @@ -2881,7 +2894,9 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. 
*/ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) +static int do_brk_flags(unsigned long addr, unsigned long request, + unsigned long flags, struct list_head *uf, + struct range_lock *mmrange) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2920,7 +2935,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) + if (do_munmap(mm, addr, len, uf, mmrange)) return -ENOMEM; } @@ -2936,7 +2951,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, mmrange); if (vma) goto out; @@ -2967,9 +2982,10 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long return 0; } -static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf, + struct range_lock *mmrange) { - return do_brk_flags(addr, len, 0, uf); + return do_brk_flags(addr, len, 0, uf, mmrange); } int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) @@ -2978,11 +2994,12 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) int ret; bool populate; LIST_HEAD(uf); + DEFINE_RANGE_LOCK_FULL(mmrange); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + ret = do_brk_flags(addr, len, flags, &uf, &mmrange); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); @@ -3105,7 +3122,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks) + bool *need_rmap_locks, struct range_lock *mmrange) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; @@ -3127,7 +3144,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, mmrange); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e3309fcf586b..b84a70720319 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -299,7 +299,8 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) + unsigned long start, unsigned long end, unsigned long newflags, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; @@ -340,7 +341,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, mmrange); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & 
~VM_SOFTDIRTY); @@ -350,13 +351,13 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, *pprev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); + error = split_vma(mm, vma, start, 1, mmrange); if (error) goto fail; } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); + error = split_vma(mm, vma, end, 0, mmrange); if (error) goto fail; } @@ -379,7 +380,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && (newflags & VM_WRITE)) { - populate_vma_page_range(vma, start, end, NULL); + populate_vma_page_range(vma, start, end, NULL, mmrange); } vm_stat_account(mm, oldflags, -nrpages); @@ -404,6 +405,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + DEFINE_RANGE_LOCK_FULL(mmrange); prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ @@ -494,7 +496,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags, &mmrange); if (error) goto out; nstart = tmp; diff --git a/mm/mremap.c b/mm/mremap.c index 049470aa1e3e..21a9e2a2baa2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -264,7 +264,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr, bool *locked, struct vm_userfaultfd_ctx *uf, - struct list_head *uf_unmap) + struct list_head *uf_unmap, + struct range_lock *mmrange) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -292,13 +293,13 @@ static unsigned long move_vma(struct vm_area_struct *vma, * so KSM can come around to merge on vma and new_vma afterwards. 
*/ err = ksm_madvise(vma, old_addr, old_addr + old_len, - MADV_UNMERGEABLE, &vm_flags); + MADV_UNMERGEABLE, &vm_flags, mmrange); if (err) return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, - &need_rmap_locks); + &need_rmap_locks, mmrange); if (!new_vma) return -ENOMEM; @@ -353,7 +354,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); - if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { + if (do_munmap(mm, old_addr, old_len, uf_unmap, mmrange) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; @@ -444,7 +445,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap_early, - struct list_head *uf_unmap) + struct list_head *uf_unmap, + struct range_lock *mmrange) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -462,12 +464,13 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early, mmrange); if (ret) goto out; if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); + ret = do_munmap(mm, addr+new_len, old_len - new_len, + uf_unmap, mmrange); if (ret && old_len != new_len) goto out; old_len = new_len; @@ -490,7 +493,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, - uf_unmap); + uf_unmap, mmrange); if (!(offset_in_page(ret))) goto out; out1: @@ -532,6 +535,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); + DEFINE_RANGE_LOCK_FULL(mmrange); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -558,7 +562,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf, &uf_unmap_early, &uf_unmap); + &locked, &uf, &uf_unmap_early, + &uf_unmap, &mmrange); goto out; } @@ -568,7 +573,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); + ret = do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, &mmrange); if (ret && old_len != new_len) goto out; ret = addr; @@ -592,7 +598,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, int pages = (new_len - old_len) >> PAGE_SHIFT; if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + vma->vm_pgoff, NULL, &mmrange)) { ret = -ENOMEM; goto out; } @@ -628,7 +634,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, &uf, &uf_unmap); + &locked, &uf, &uf_unmap, &mmrange); } out: if (offset_in_page(ret)) { diff --git a/mm/nommu.c b/mm/nommu.c index ebb6e618dade..1805f0a788b3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -113,7 +113,8 @@ unsigned int kobjsize(const void *objp) static long __get_user_pages(struct 
task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) + struct vm_area_struct **vmas, int *nonblocking, + struct range_lock *mmrange) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -162,18 +163,19 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + struct range_lock *mmrange) { return __get_user_pages(current, current->mm, start, nr_pages, - gup_flags, pages, vmas, NULL); + gup_flags, pages, vmas, NULL, mmrange); } EXPORT_SYMBOL(get_user_pages); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - int *locked) + int *locked, struct range_lock *mmrange) { - return get_user_pages(start, nr_pages, gup_flags, pages, NULL); + return get_user_pages(start, nr_pages, gup_flags, pages, NULL, mmrange); } EXPORT_SYMBOL(get_user_pages_locked); @@ -183,9 +185,11 @@ static long __get_user_pages_unlocked(struct task_struct *tsk, unsigned int gup_flags) { long ret; + DEFINE_RANGE_LOCK_FULL(mmrange); + down_read(&mm->mmap_sem); ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, - NULL, NULL); + NULL, NULL, &mmrange); up_read(&mm->mmap_sem); return ret; } @@ -836,7 +840,8 @@ EXPORT_SYMBOL(find_vma); * find a VMA * - we don't extend stack VMAs under NOMMU conditions */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr, + struct range_lock *mmrange) { return find_vma(mm, addr); } @@ -1206,7 +1211,8 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf) + struct list_head *uf, + struct range_lock *mmrange) { struct vm_area_struct *vma; struct vm_region *region; @@ -1476,7 +1482,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * for the first part or the tail. 
*/ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) + unsigned long addr, int new_below, struct range_lock *mmrange) { struct vm_area_struct *new; struct vm_region *region; @@ -1578,7 +1584,8 @@ static int shrink_vma(struct mm_struct *mm, * - under NOMMU conditions the chunk to be unmapped must be backed by a single * VMA, though it need not cover the whole VMA */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, struct range_lock *mmrange) { struct vm_area_struct *vma; unsigned long end; @@ -1624,7 +1631,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(mm, vma, start, 1, mmrange); if (ret < 0) return ret; } @@ -1642,9 +1649,10 @@ int vm_munmap(unsigned long addr, size_t len) { struct mm_struct *mm = current->mm; int ret; + DEFINE_RANGE_LOCK_FULL(mmrange); down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len, NULL); + ret = do_munmap(mm, addr, len, NULL, &mmrange); up_write(&mm->mmap_sem); return ret; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8d2da5dec1e0..44a2507c94fd 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -26,7 +26,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { pmd_t *pmd; unsigned long next; @@ -38,7 +38,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none(*pmd) || !walk->vma) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = walk->pte_hole(addr, next, walk, mmrange); if (err) break; continue; @@ -48,7 +48,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, * needs to know about pmd_trans_huge() pmds */ if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, walk); + err = walk->pmd_entry(pmd, addr, next, walk, mmrange); if (err) break; @@ -71,7 +71,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, } static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { pud_t *pud; unsigned long next; @@ -83,7 +83,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none(*pud) || !walk->vma) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = walk->pte_hole(addr, next, walk, mmrange); if (err) break; continue; @@ -106,7 +106,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, goto again; if (walk->pmd_entry || walk->pte_entry) - err = walk_pmd_range(pud, addr, next, walk); + err = walk_pmd_range(pud, addr, next, walk, mmrange); if (err) break; } while (pud++, addr = next, addr != end); @@ -115,7 +115,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, } static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { p4d_t *p4d; unsigned long next; @@ -126,13 +126,13 @@ static int walk_p4d_range(pgd_t *pgd, 
unsigned long addr, unsigned long end, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = walk->pte_hole(addr, next, walk, mmrange); if (err) break; continue; } if (walk->pmd_entry || walk->pte_entry) - err = walk_pud_range(p4d, addr, next, walk); + err = walk_pud_range(p4d, addr, next, walk, mmrange); if (err) break; } while (p4d++, addr = next, addr != end); @@ -141,7 +141,7 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } static int walk_pgd_range(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { pgd_t *pgd; unsigned long next; @@ -152,13 +152,13 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = walk->pte_hole(addr, next, walk, mmrange); if (err) break; continue; } if (walk->pmd_entry || walk->pte_entry) - err = walk_p4d_range(pgd, addr, next, walk); + err = walk_p4d_range(pgd, addr, next, walk, mmrange); if (err) break; } while (pgd++, addr = next, addr != end); @@ -175,7 +175,7 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, } static int walk_hugetlb_range(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); @@ -192,7 +192,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, if (pte) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); else if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = walk->pte_hole(addr, next, walk, mmrange); if (err) break; @@ -203,7 +203,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, #else /* CONFIG_HUGETLB_PAGE */ static int walk_hugetlb_range(unsigned long addr, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { return 0; } @@ -217,7 +217,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, * error, where we abort the current walk. */ static int walk_page_test(unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { struct vm_area_struct *vma = walk->vma; @@ -235,23 +235,23 @@ static int walk_page_test(unsigned long start, unsigned long end, if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (walk->pte_hole) - err = walk->pte_hole(start, end, walk); + err = walk->pte_hole(start, end, walk, mmrange); return err ? err : 1; } return 0; } static int __walk_page_range(unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { int err = 0; struct vm_area_struct *vma = walk->vma; if (vma && is_vm_hugetlb_page(vma)) { if (walk->hugetlb_entry) - err = walk_hugetlb_range(start, end, walk); + err = walk_hugetlb_range(start, end, walk, mmrange); } else - err = walk_pgd_range(start, end, walk); + err = walk_pgd_range(start, end, walk, mmrange); return err; } @@ -285,10 +285,11 @@ static int __walk_page_range(unsigned long start, unsigned long end, * Locking: * Callers of walk_page_range() and walk_page_vma() should hold * @walk->mm->mmap_sem, because these function traverse vma list and/or - * access to vma's data. + * access to vma's data. As such, the @mmrange will represent the + * address space range. 
*/ int walk_page_range(unsigned long start, unsigned long end, - struct mm_walk *walk) + struct mm_walk *walk, struct range_lock *mmrange) { int err = 0; unsigned long next; @@ -315,7 +316,7 @@ int walk_page_range(unsigned long start, unsigned long end, next = min(end, vma->vm_end); vma = vma->vm_next; - err = walk_page_test(start, next, walk); + err = walk_page_test(start, next, walk, mmrange); if (err > 0) { /* * positive return values are purely for @@ -329,14 +330,15 @@ int walk_page_range(unsigned long start, unsigned long end, break; } if (walk->vma || walk->pte_hole) - err = __walk_page_range(start, next, walk); + err = __walk_page_range(start, next, walk, mmrange); if (err) break; } while (start = next, start < end); return err; } -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk, + struct range_lock *mmrange) { int err; @@ -346,10 +348,10 @@ int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); VM_BUG_ON(!vma); walk->vma = vma; - err = walk_page_test(vma->vm_start, vma->vm_end, walk); + err = walk_page_test(vma->vm_start, vma->vm_end, walk, mmrange); if (err > 0) return 0; if (err < 0) return err; - return __walk_page_range(vma->vm_start, vma->vm_end, walk); + return __walk_page_range(vma->vm_start, vma->vm_end, walk, mmrange); } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index a447092d4635..ff6772b86195 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -90,6 +90,7 @@ static int process_vm_rw_single_vec(unsigned long addr, unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); unsigned int flags = 0; + DEFINE_RANGE_LOCK_FULL(mmrange); /* Work out address and page range required */ if (len == 0) @@ -111,7 +112,8 @@ static int process_vm_rw_single_vec(unsigned long addr, */ down_read(&mm->mmap_sem); pages = get_user_pages_remote(task, mm, pa, pages, flags, - process_pages, NULL, &locked); + process_pages, NULL, &locked, + &mmrange); if (locked) up_read(&mm->mmap_sem); if (pages <= 0) diff --git a/mm/util.c b/mm/util.c index c1250501364f..b0ec1d88bb71 100644 --- a/mm/util.c +++ b/mm/util.c @@ -347,13 +347,14 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); + DEFINE_RANGE_LOCK_FULL(mmrange); ret = security_mmap_file(file, prot, flag); if (!ret) { if (down_write_killable(&mm->mmap_sem)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate, &uf); + &populate, &uf, &mmrange); up_write(&mm->mmap_sem); userfaultfd_unmap_complete(mm, &uf); if (populate) diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index f6758dad981f..c1e36ea2c6fc 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -868,6 +868,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump) { struct page *page; + DEFINE_RANGE_LOCK_FULL(mmrange); /* see get_page_arg() in fs/exec.c */ /* dump->data is released by tomoyo_find_next_domain(). */ if (!dump->data) { @@ -884,7 +885,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). 
*/ if (get_user_pages_remote(current, bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL, NULL) <= 0) + FOLL_FORCE, &page, NULL, NULL, &mmrange) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 57bcb27dcf30..4cd2b93bb20c 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -78,6 +78,7 @@ static void async_pf_execute(struct work_struct *work) unsigned long addr = apf->addr; gva_t gva = apf->gva; int locked = 1; + DEFINE_RANGE_LOCK_FULL(mmrange); might_sleep(); @@ -88,7 +89,7 @@ static void async_pf_execute(struct work_struct *work) */ down_read(&mm->mmap_sem); get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, - &locked); + &locked, &mmrange); if (locked) up_read(&mm->mmap_sem); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4501e658e8d6..86ec078f4c3b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1317,11 +1317,12 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static inline int check_user_page_hwpoison(unsigned long addr) +static inline int check_user_page_hwpoison(unsigned long addr, + struct range_lock *mmrange) { int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - rc = get_user_pages(addr, 1, flags, NULL, NULL); + rc = get_user_pages(addr, 1, flags, NULL, NULL, mmrange); return rc == -EHWPOISON; } @@ -1411,7 +1412,8 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool *async, bool write_fault, bool *writable, - kvm_pfn_t *p_pfn) + kvm_pfn_t *p_pfn, + struct range_lock *mmrange) { unsigned long pfn; int r; @@ -1425,7 +1427,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, bool unlocked = false; r = fixup_user_fault(current, current->mm, addr, (write_fault ? FAULT_FLAG_WRITE : 0), - &unlocked); + &unlocked, mmrange); if (unlocked) return -EAGAIN; if (r) @@ -1477,6 +1479,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, struct vm_area_struct *vma; kvm_pfn_t pfn = 0; int npages, r; + DEFINE_RANGE_LOCK_FULL(mmrange); /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); @@ -1493,7 +1496,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, down_read(&current->mm->mmap_sem); if (npages == -EHWPOISON || - (!async && check_user_page_hwpoison(addr))) { + (!async && check_user_page_hwpoison(addr, &mmrange))) { pfn = KVM_PFN_ERR_HWPOISON; goto exit; } @@ -1504,7 +1507,8 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, + &pfn, &mmrange); if (r == -EAGAIN) goto retry; if (r < 0) -- 2.13.6
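For reference, this is roughly what a converted gup caller looks like once the prototypes above are in place. The helper below is purely illustrative and not part of this patch (pin_one_page() does not exist in the tree); only the DEFINE_RANGE_LOCK_FULL()/&mmrange plumbing mirrors the real conversions in the hunks above, e.g. async_pf_execute() and process_vm_rw_single_vec():

/*
 * Illustrative only: pin a single page with the post-conversion
 * get_user_pages_remote() signature. The mmrange is a stack-local,
 * full-width range declared where mmap_sem is taken and handed down
 * to the fault paths.
 */
static int pin_one_page(struct task_struct *tsk, struct mm_struct *mm,
			unsigned long addr, struct page **page)
{
	int locked = 1;
	long ret;
	DEFINE_RANGE_LOCK_FULL(mmrange);

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
				    page, NULL, &locked, &mmrange);
	if (locked)
		up_read(&mm->mmap_sem);	/* gup may have dropped the sem */

	return ret == 1 ? 0 : (ret < 0 ? ret : -EFAULT);
}

The if (locked) check follows the existing callers converted here: get_user_pages_remote() can drop and retake mmap_sem, clearing *locked, in which case the caller must not unlock again.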