The patch titled
     Subject: mm: change locked_vm's type from unsigned long to atomic64_t
has been added to the -mm tree.  Its filename is
     mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch

This patch should soon appear at
    http://ozlabs.org/~akpm/mmots/broken-out/mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch
and later at
    http://ozlabs.org/~akpm/mmotm/broken-out/mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx>
Subject: mm: change locked_vm's type from unsigned long to atomic64_t

Patch series "convert locked_vm from unsigned long to atomic64_t"

Taking and dropping mmap_sem to modify a single counter, locked_vm, is
overkill when the counter could be synchronized separately.

Make mmap_sem a little less coarse by changing locked_vm to an atomic,
the 64-bit variety to avoid issues with overflow on 32-bit systems.

If user-controlled values are used to increase locked_vm, multiple
threads doing it at once on a 32-bit system could theoretically cause
overflow, so in the absence of atomic overflow checking, the 64-bit
counter on 32b is defensive programming.  I wouldn't have thought to do
it, but Jason Gunthorpe raised the same issue in the pinned_vm series:

  https://lore.kernel.org/linux-mm/20190115205311.GD22031@xxxxxxxxxxxx/

This is a more conservative alternative to [1] with no user-visible
effects.

Thanks to Alexey Kardashevskiy for pointing out the racy atomics and to
Alex Williamson, Christoph Lameter, Ira Weiny, and Jason Gunthorpe for
their comments on [1].

Davidlohr Bueso recently did a similar conversion for pinned_vm [2].

Testing:
 1. passes LTP mlock[all], munlock[all], fork, mmap, and mremap tests
    in an x86 kvm guest
 2. a VFIO-enabled x86 kvm guest shows the same VmLck in
    /proc/pid/status before and after this change
 3. cross-compiles on powerpc

[1] https://lore.kernel.org/linux-mm/20190211224437.25267-1-daniel.m.jordan@xxxxxxxxxx/
[2] https://lore.kernel.org/linux-mm/20190206175920.31082-1-dave@xxxxxxxxxxxx/

This patch (of 6):

Taking and dropping mmap_sem to modify a single counter, locked_vm, is
overkill when the counter could be synchronized separately.

Make mmap_sem a little less coarse by changing locked_vm to an atomic,
the 64-bit variety to avoid issues with overflow on 32-bit systems.
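For readers unfamiliar with the accounting pattern being converted, the
sketch below is a plain userspace analogue (C11 atomics; the names
account_locked_vm and lock_limit and the fixed limit are made up for the
example), not the kernel code itself.  It shows the read/check/update
sequence every locked_vm caller follows; in this patch that sequence
still runs under mmap_sem, and the follow-up patches in the series drop
the lock where the atomic alone is enough.

/*
 * Userspace sketch only -- not kernel code.  locked_vm stands in for
 * mm->locked_vm; lock_limit stands in for RLIMIT_MEMLOCK >> PAGE_SHIFT.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int64_t locked_vm;		/* pages accounted as locked */
static const int64_t lock_limit = 1 << 16;	/* made-up limit for the demo */

static int account_locked_vm(int64_t npages, int incr)
{
	if (incr) {
		/* read the counter, check the would-be total, then add */
		int64_t locked = atomic_load(&locked_vm) + npages;

		if (locked > lock_limit)
			return -1;	/* the kernel returns -ENOMEM here */
		atomic_fetch_add(&locked_vm, npages);
	} else {
		atomic_fetch_sub(&locked_vm, npages);
	}
	return 0;
}

int main(void)
{
	account_locked_vm(32, 1);
	printf("locked_vm = %lld pages\n", (long long)atomic_load(&locked_vm));
	account_locked_vm(32, 0);
	return 0;
}

Whether the check-then-add sequence stays exact once the lock is gone is
a separate question that the later patches in the series deal with.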
Link: http://lkml.kernel.org/r/20190402204158.27582-2-daniel.m.jordan@xxxxxxxxxx
Signed-off-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx>
Cc: Alan Tull <atull@xxxxxxxxxx>
Cc: Alexey Kardashevskiy <aik@xxxxxxxxx>
Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Davidlohr Bueso <dave@xxxxxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Moritz Fischer <mdf@xxxxxxxxxx>
Cc: Paul Mackerras <paulus@xxxxxxxxxx>
Cc: Wu Hao <hao.wu@xxxxxxxxx>
Cc: Ira Weiny <ira.weiny@xxxxxxxxx>
Cc: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
Cc: Paul Mackerras <paulus@xxxxxxxxx>
Cc: Mark Rutland <mark.rutland@xxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 arch/powerpc/kvm/book3s_64_vio.c    |   14 ++++++++------
 arch/powerpc/mm/mmu_context_iommu.c |   15 ++++++++-------
 drivers/fpga/dfl-afu-dma-region.c   |   18 ++++++++++--------
 drivers/vfio/vfio_iommu_spapr_tce.c |   17 +++++++++--------
 drivers/vfio/vfio_iommu_type1.c     |   10 ++++++----
 fs/proc/task_mmu.c                  |    2 +-
 include/linux/mm_types.h            |    2 +-
 kernel/fork.c                       |    2 +-
 mm/debug.c                          |    5 +++--
 mm/mlock.c                          |    4 ++--
 mm/mmap.c                           |   18 +++++++++---------
 mm/mremap.c                         |    6 +++---
 12 files changed, 61 insertions(+), 52 deletions(-)

--- a/arch/powerpc/kvm/book3s_64_vio.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/arch/powerpc/kvm/book3s_64_vio.c
@@ -59,32 +59,34 @@ static unsigned long kvmppc_stt_pages(un
 static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
 {
 	long ret = 0;
+	s64 locked_vm;
 
 	if (!current || !current->mm)
 		return ret; /* process exited */
 
 	down_write(&current->mm->mmap_sem);
 
+	locked_vm = atomic64_read(&current->mm->locked_vm);
 	if (inc) {
 		unsigned long locked, lock_limit;
 
-		locked = current->mm->locked_vm + stt_pages;
+		locked = locked_vm + stt_pages;
 		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			ret = -ENOMEM;
 		else
-			current->mm->locked_vm += stt_pages;
+			atomic64_add(stt_pages, &current->mm->locked_vm);
 	} else {
-		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-			stt_pages = current->mm->locked_vm;
+		if (WARN_ON_ONCE(stt_pages > locked_vm))
+			stt_pages = locked_vm;
 
-		current->mm->locked_vm -= stt_pages;
+		atomic64_sub(stt_pages, &current->mm->locked_vm);
 	}
 
 	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
 			inc ? '+' : '-',
 			stt_pages << PAGE_SHIFT,
-			current->mm->locked_vm << PAGE_SHIFT,
+			atomic64_read(&current->mm->locked_vm) << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK),
 			ret ? " - exceeded" : "");

--- a/arch/powerpc/mm/mmu_context_iommu.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/arch/powerpc/mm/mmu_context_iommu.c
@@ -55,30 +55,31 @@ static long mm_iommu_adjust_locked_vm(st
 		unsigned long npages, bool incr)
 {
 	long ret = 0, locked, lock_limit;
+	s64 locked_vm;
 
 	if (!npages)
 		return 0;
 
 	down_write(&mm->mmap_sem);
-
+	locked_vm = atomic64_read(&mm->locked_vm);
 	if (incr) {
-		locked = mm->locked_vm + npages;
+		locked = locked_vm + npages;
 		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			ret = -ENOMEM;
 		else
-			mm->locked_vm += npages;
+			atomic64_add(npages, &mm->locked_vm);
 	} else {
-		if (WARN_ON_ONCE(npages > mm->locked_vm))
-			npages = mm->locked_vm;
-		mm->locked_vm -= npages;
+		if (WARN_ON_ONCE(npages > locked_vm))
+			npages = locked_vm;
+		atomic64_sub(npages, &mm->locked_vm);
 	}
 
 	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
 			current ? current->pid : 0,
 			incr ? '+' : '-',
 			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
+			atomic64_read(&mm->locked_vm) << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK));
 
 	up_write(&mm->mmap_sem);

--- a/drivers/fpga/dfl-afu-dma-region.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/drivers/fpga/dfl-afu-dma-region.c
@@ -45,6 +45,7 @@ void afu_dma_region_init(struct dfl_feat
 static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
 {
 	unsigned long locked, lock_limit;
+	s64 locked_vm;
 	int ret = 0;
 
 	/* the task is exiting. */
@@ -53,24 +54,25 @@ static int afu_dma_adjust_locked_vm(stru
 
 	down_write(&current->mm->mmap_sem);
 
+	locked_vm = atomic64_read(&current->mm->locked_vm);
 	if (incr) {
-		locked = current->mm->locked_vm + npages;
+		locked = locked_vm + npages;
 		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			ret = -ENOMEM;
 		else
-			current->mm->locked_vm += npages;
+			atomic64_add(npages, &current->mm->locked_vm);
 	} else {
-		if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-			npages = current->mm->locked_vm;
-		current->mm->locked_vm -= npages;
+		if (WARN_ON_ONCE(npages > locked_vm))
+			npages = locked_vm;
+		atomic64_sub(npages, &current->mm->locked_vm);
 	}
 
-	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
+	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %lld/%lu%s\n", current->pid,
 		incr ? '+' : '-', npages << PAGE_SHIFT,
-		current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
-		ret ? "- exceeded" : "");
+		(s64)atomic64_read(&current->mm->locked_vm) << PAGE_SHIFT,
+		rlimit(RLIMIT_MEMLOCK), ret ? "- exceeded" : "");
 
 	up_write(&current->mm->mmap_sem);

--- a/drivers/vfio/vfio_iommu_spapr_tce.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -36,7 +36,8 @@ static void tce_iommu_detach_group(void
 
 static long try_increment_locked_vm(struct mm_struct *mm, long npages)
 {
-	long ret = 0, locked, lock_limit;
+	long ret = 0, lock_limit;
+	s64 locked;
 
 	if (WARN_ON_ONCE(!mm))
 		return -EPERM;
@@ -45,16 +46,16 @@ static long try_increment_locked_vm(stru
 		return 0;
 
 	down_write(&mm->mmap_sem);
-	locked = mm->locked_vm + npages;
+	locked = atomic64_read(&mm->locked_vm) + npages;
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 		ret = -ENOMEM;
 	else
-		mm->locked_vm += npages;
+		atomic64_add(npages, &mm->locked_vm);
 
 	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
 			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
+			atomic64_read(&mm->locked_vm) << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK),
 			ret ? " - exceeded" : "");
@@ -69,12 +70,12 @@ static void decrement_locked_vm(struct m
 		return;
 
 	down_write(&mm->mmap_sem);
-	if (WARN_ON_ONCE(npages > mm->locked_vm))
-		npages = mm->locked_vm;
-	mm->locked_vm -= npages;
+	if (WARN_ON_ONCE(npages > atomic64_read(&mm->locked_vm)))
+		npages = atomic64_read(&mm->locked_vm);
+	atomic64_sub(npages, &mm->locked_vm);
 	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
 			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
+			atomic64_read(&mm->locked_vm) << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK));
 	up_write(&mm->mmap_sem);
 }

--- a/drivers/vfio/vfio_iommu_type1.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/drivers/vfio/vfio_iommu_type1.c
@@ -276,18 +276,19 @@ static int vfio_lock_acct(struct vfio_dm
 	if (!ret) {
 		if (npage > 0) {
 			if (!dma->lock_cap) {
+				s64 locked_vm = atomic64_read(&mm->locked_vm);
 				unsigned long limit;
 
 				limit = task_rlimit(dma->task,
 						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-				if (mm->locked_vm + npage > limit)
+				if (locked_vm + npage > limit)
 					ret = -ENOMEM;
 			}
 		}
 
 		if (!ret)
-			mm->locked_vm += npage;
+			atomic64_add(npage, &mm->locked_vm);
 
 		up_write(&mm->mmap_sem);
 	}
@@ -408,6 +409,7 @@ static long vfio_pin_pages_remote(struct
 	long ret, pinned = 0, lock_acct = 0;
 	bool rsvd;
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
+	atomic64_t *locked_vm = &current->mm->locked_vm;
 
 	/* This code path is only user initiated */
 	if (!current->mm)
@@ -425,7 +427,7 @@ static long vfio_pin_pages_remote(struct
 	 * pages are already counted against the user.
 	 */
 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
+		if (!dma->lock_cap && atomic64_read(locked_vm) + 1 > limit) {
			put_pfn(*pfn_base, dma->prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
					limit << PAGE_SHIFT);
@@ -452,7 +454,7 @@ static long vfio_pin_pages_remote(struct
 
 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 			if (!dma->lock_cap &&
-			    current->mm->locked_vm + lock_acct + 1 > limit) {
+			    atomic64_read(locked_vm) + lock_acct + 1 > limit) {
				put_pfn(pfn, dma->prot);
				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
					__func__, limit << PAGE_SHIFT);

--- a/fs/proc/task_mmu.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/fs/proc/task_mmu.c
@@ -58,7 +58,7 @@ void task_mem(struct seq_file *m, struct
 	swap = get_mm_counter(mm, MM_SWAPENTS);
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
-	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
+	SEQ_PUT_DEC(" kB\nVmLck:\t", atomic64_read(&mm->locked_vm));
 	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
 	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
 	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);

--- a/include/linux/mm_types.h~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/include/linux/mm_types.h
@@ -410,7 +410,7 @@ struct mm_struct {
 
 		unsigned long hiwater_vm;  /* High-water virtual memory usage */
 		unsigned long total_vm;	   /* Total pages mapped */
-		unsigned long locked_vm;   /* Pages that have PG_mlocked set */
+		atomic64_t    locked_vm;   /* Pages that have PG_mlocked set */
 		atomic64_t    pinned_vm;   /* Refcount permanently increased */
 		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */

--- a/kernel/fork.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/kernel/fork.c
@@ -988,7 +988,7 @@ static struct mm_struct *mm_init(struct
 	mm->core_state = NULL;
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
-	mm->locked_vm = 0;
+	atomic64_set(&mm->locked_vm, 0);
 	atomic64_set(&mm->pinned_vm, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);

--- a/mm/debug.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/mm/debug.c
@@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm)
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
 		"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
-		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
+		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %llx\n"
 		"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 		"start_brk %lx brk %lx start_stack %lx\n"
@@ -167,7 +167,8 @@ void dump_mm(const struct mm_struct *mm)
 		atomic_read(&mm->mm_count),
 		mm_pgtables_bytes(mm),
 		mm->map_count,
-		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
+		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm,
+		(u64)atomic64_read(&mm->locked_vm),
 		(u64)atomic64_read(&mm->pinned_vm),
 		mm->data_vm, mm->exec_vm, mm->stack_vm,
 		mm->start_code, mm->end_code, mm->start_data, mm->end_data,

--- a/mm/mlock.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/mm/mlock.c
@@ -562,7 +562,7 @@ success:
 		nr_pages = -nr_pages;
 	else if (old_flags & VM_LOCKED)
 		nr_pages = 0;
-	mm->locked_vm += nr_pages;
+	atomic64_add(nr_pages, &mm->locked_vm);
 
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
@@ -687,7 +687,7 @@ static __must_check int do_mlock(unsigne
 	if (down_write_killable(&current->mm->mmap_sem))
 		return -EINTR;
 
-	locked += current->mm->locked_vm;
+	locked += atomic64_read(&current->mm->locked_vm);
 	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
 		/*
 		 * It is possible that the regions requested intersect with

--- a/mm/mmap.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/mm/mmap.c
@@ -1340,7 +1340,7 @@ static inline int mlock_future_check(str
 	/*  mlock MCL_FUTURE? */
 	if (flags & VM_LOCKED) {
 		locked = len >> PAGE_SHIFT;
-		locked += mm->locked_vm;
+		locked += atomic64_read(&mm->locked_vm);
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
@@ -1826,7 +1826,7 @@ out:
 					vma == get_gate_vma(current->mm))
 			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 		else
-			mm->locked_vm += (len >> PAGE_SHIFT);
+			atomic64_add(len >> PAGE_SHIFT, &mm->locked_vm);
 	}
 
 	if (file)
@@ -2302,7 +2302,7 @@ static int acct_stack_growth(struct vm_a
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked;
 		unsigned long limit;
-		locked = mm->locked_vm + grow;
+		locked = atomic64_read(&mm->locked_vm) + grow;
 		limit = rlimit(RLIMIT_MEMLOCK);
 		limit >>= PAGE_SHIFT;
 		if (locked > limit && !capable(CAP_IPC_LOCK))
@@ -2396,7 +2396,7 @@ int expand_upwards(struct vm_area_struct
 		 */
 		spin_lock(&mm->page_table_lock);
 		if (vma->vm_flags & VM_LOCKED)
-			mm->locked_vm += grow;
+			atomic64_add(grow, &mm->locked_vm);
 		vm_stat_account(mm, vma->vm_flags, grow);
 		anon_vma_interval_tree_pre_update_vma(vma);
 		vma->vm_end = address;
@@ -2476,7 +2476,7 @@ int expand_downwards(struct vm_area_stru
 		 */
 		spin_lock(&mm->page_table_lock);
 		if (vma->vm_flags & VM_LOCKED)
-			mm->locked_vm += grow;
+			atomic64_add(grow, &mm->locked_vm);
 		vm_stat_account(mm, vma->vm_flags, grow);
 		anon_vma_interval_tree_pre_update_vma(vma);
 		vma->vm_start = address;
@@ -2801,11 +2801,11 @@ int __do_munmap(struct mm_struct *mm, un
 	/*
 	 * unlock any mlock()ed ranges before detaching vmas
 	 */
-	if (mm->locked_vm) {
+	if (atomic64_read(&mm->locked_vm)) {
 		struct vm_area_struct *tmp = vma;
 		while (tmp && tmp->vm_start < end) {
 			if (tmp->vm_flags & VM_LOCKED) {
-				mm->locked_vm -= vma_pages(tmp);
+				atomic64_sub(vma_pages(tmp), &mm->locked_vm);
 				munlock_vma_pages_all(tmp);
 			}
 
@@ -3048,7 +3048,7 @@ out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	mm->data_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
-		mm->locked_vm += (len >> PAGE_SHIFT);
+		atomic64_add(len >> PAGE_SHIFT, &mm->locked_vm);
 	vma->vm_flags |= VM_SOFTDIRTY;
 	return 0;
 }
@@ -3120,7 +3120,7 @@ void exit_mmap(struct mm_struct *mm)
 		up_write(&mm->mmap_sem);
 	}
 
-	if (mm->locked_vm) {
+	if (atomic64_read(&mm->locked_vm)) {
 		vma = mm->mmap;
 		while (vma) {
 			if (vma->vm_flags & VM_LOCKED)

--- a/mm/mremap.c~mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t
+++ a/mm/mremap.c
@@ -422,7 +422,7 @@ static unsigned long move_vma(struct vm_
 	}
 
 	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
+		atomic64_add(new_len >> PAGE_SHIFT, &mm->locked_vm);
 		*locked = true;
 	}
 
@@ -473,7 +473,7 @@ static struct vm_area_struct *vma_to_res
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked, lock_limit;
-		locked = mm->locked_vm << PAGE_SHIFT;
+		locked = atomic64_read(&mm->locked_vm) << PAGE_SHIFT;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		locked += new_len - old_len;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
@@ -679,7 +679,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, a
 		vm_stat_account(mm, vma->vm_flags, pages);
 		if (vma->vm_flags & VM_LOCKED) {
-			mm->locked_vm += pages;
+			atomic64_add(pages, &mm->locked_vm);
 			locked = true;
 			new_addr = addr;
 		}
_

Patches currently in -mm which might be from daniel.m.jordan@xxxxxxxxxx are

mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch
vfio-type1-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
vfio-spapr_tce-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
fpga-dlf-afu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
powerpc-mmu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
kvm-book3s-drop-mmap_sem-now-that-locked_vm-is-atomic.patch