This introduces the MREMAP_RETAIN flag for preserving the source mapping when MREMAP_MAYMOVE moves the pages to a new destination. Accesses to the source mapping will fault and map in fresh zeroed pages. It is currently limited to writable MAP_PRIVATE|MAP_ANONYMOUS mappings and will return EFAULT when used on anything else. This covers the intended use case in general purpose allocators. For consistency, the old_len >= new_len case could decommit the pages instead of unmapping. However, userspace can accomplish the same thing via madvise and the flag is coherent without the additional complexity. Motivation: TCMalloc and jemalloc avoid releasing virtual memory in order to reduce virtual memory fragmentation. A call to munmap or mremap would leave a hole in the address space. Instead, unused pages are lazily returned to the operating system via MADV_DONTNEED. Since mremap cannot be used to elide copies, TCMalloc and jemalloc end up being significantly slower for patterns like repeated vector / hash table reallocations. Consider the typical vector building pattern: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } Transparent huge pages disabled: glibc (baseline, uses mremap already): 15.051s jemalloc without MREMAP_RETAIN: 38.540s jemalloc with MREMAP_RETAIN: 15.086s Transparent huge pages enabled: glibc (baseline, uses mremap already): 8.464s jemalloc without MREMAP_RETAIN: 18.230s jemalloc with MREMAP_RETAIN: 6.696s In practice, in-place growth never occurs for huge allocations because the heap grows in the downwards direction for all 3 allocators. TCMalloc and jemalloc pay for enormous copies while glibc is only spending time writing new elements to the vector. Even if it was grown in the other direction, real-world applications would end up blocking in-place growth with new allocations. The allocators could attempt to map the source location again after an mremap call, but there is no guarantee of success in a multi-threaded program and fragmentating memory over time is considered unacceptable. Signed-off-by: Daniel Micay <danielmicay@xxxxxxxxx> --- include/linux/huge_mm.h | 2 +- include/linux/mm.h | 6 ++++++ include/uapi/linux/mman.h | 1 + mm/huge_memory.c | 4 ++-- mm/memory.c | 2 +- mm/mmap.c | 12 +++++++++++ mm/mremap.c | 52 +++++++++++++++++++++++++++++++---------------- 7 files changed, 57 insertions(+), 22 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 63579cb..3c13b20 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -143,7 +143,7 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long end, long adjust_next) { - if (!vma->anon_vma || vma->vm_ops) + if (!vma->anon_vma || (vma->vm_ops && !vma->vm_ops->allow_huge_pages)) return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 8981cc8..1e61036 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -273,6 +273,12 @@ struct vm_operations_struct { /* called by sys_remap_file_pages() to populate non-linear mapping */ int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff); + + /* Check if the mapping may be duplicated by MREMAP_RETAIN */ + bool (*may_duplicate)(struct vm_area_struct *vma); + + /* if there is no vm_ops table, this is considered true */ + bool allow_huge_pages; }; struct mmu_gather; diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index ade4acd..4e9a546 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -5,6 +5,7 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 +#define MREMAP_RETAIN 4 #define OVERCOMMIT_GUESS 0 #define OVERCOMMIT_ALWAYS 1 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d9a21d06..350b478 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2077,7 +2077,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) * page fault if needed. */ return 0; - if (vma->vm_ops) + if ((vma->vm_ops && !vma->vm_ops->allow_huge_pages)) /* khugepaged not yet working on file or special mappings */ return 0; VM_BUG_ON(vma->vm_flags & VM_NO_THP); @@ -2405,7 +2405,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) (vma->vm_flags & VM_NOHUGEPAGE)) return false; - if (!vma->anon_vma || vma->vm_ops) + if (!vma->anon_vma || (vma->vm_ops && !vma->vm_ops->allow_huge_pages)) return false; if (is_vma_temporary_stack(vma)) return false; diff --git a/mm/memory.c b/mm/memory.c index e229970..c181401 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3275,7 +3275,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { int ret = VM_FAULT_FALLBACK; - if (!vma->vm_ops) + if (!vma->vm_ops || vma->vm_ops->allow_huge_pages) ret = do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) diff --git a/mm/mmap.c b/mm/mmap.c index c0a3637..6b644fe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1500,6 +1500,16 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } +static bool anon_may_duplicate(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_WRITE && !(vma->vm_flags & VM_SHARED); +} + +static const struct vm_operations_struct anon_vmops = { + .may_duplicate = anon_may_duplicate, + .allow_huge_pages = true +}; + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) { @@ -1569,6 +1579,8 @@ munmap_back: vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; + if (!file) + vma->vm_ops = &anon_vmops; INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { diff --git a/mm/mremap.c b/mm/mremap.c index 05f1180..ca7a662 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -235,7 +235,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, bool *locked) + unsigned long new_len, unsigned long new_addr, bool retain, + bool *locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -287,15 +288,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } - - /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT) { - vma->vm_flags &= ~VM_ACCOUNT; - excess = vma->vm_end - vma->vm_start - old_len; - if (old_addr > vma->vm_start && - old_addr + old_len < vma->vm_end) - split = 1; + retain = false; } /* @@ -310,6 +303,19 @@ static unsigned long move_vma(struct vm_area_struct *vma, hiwater_vm = mm->hiwater_vm; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + /* Leave the old mapping in place for MREMAP_RETAIN */ + if (retain) + goto out; + + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vm_flags & VM_ACCOUNT) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (old_addr > vma->vm_start && + old_addr + old_len < vma->vm_end) + split = 1; + } + if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); @@ -324,6 +330,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma->vm_next->vm_flags |= VM_ACCOUNT; } +out: if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; *locked = true; @@ -333,7 +340,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, } static struct vm_area_struct *vma_to_resize(unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long *p) + unsigned long old_len, unsigned long new_len, bool retain, + unsigned long *p) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); @@ -348,6 +356,11 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) goto Efault; + /* Forbid MREMAP_RETAIN if not explicitly permitted by the mapping */ + if (retain && !(vma->vm_ops && vma->vm_ops->may_duplicate && + vma->vm_ops->may_duplicate(vma))) + goto Efault; + /* Need to be careful about a growing mapping */ if (new_len > old_len) { unsigned long pgoff; @@ -392,7 +405,8 @@ Eagain: } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked) + unsigned long new_addr, unsigned long new_len, bool retain, + bool *locked) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -426,7 +440,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, &charged); + vma = vma_to_resize(addr, old_len, new_len, retain, &charged); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -442,7 +456,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (ret & ~PAGE_MASK) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, retain, locked); if (!(ret & ~PAGE_MASK)) goto out; out1: @@ -482,7 +496,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_RETAIN)) return ret; if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) @@ -506,7 +520,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + flags & MREMAP_RETAIN, &locked); goto out; } @@ -526,7 +540,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags & MREMAP_RETAIN, + &charged); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -575,7 +590,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, + flags & MREMAP_RETAIN, &locked); } out: if (ret & ~PAGE_MASK) -- 2.1.2 -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html