On Thu, Jun 27, 2024 at 01:56:12PM -0400, Liam R. Howlett wrote: > * Lorenzo Stoakes <lstoakes@xxxxxxxxx> [240627 06:39]: > > This patch introduces vma.c and moves internal core VMA manipulation > > functions to this file from mmap.c. > > > > This allows us to isolate VMA functionality in a single place such that we > > can create userspace testing code that invokes this functionality in an > > environment where we can implement simple unit tests of core functionality. > > > > This patch ensures that core VMA functionality is explicitly marked as such > > by its presence in mm/vma.h. > > > > It also places the header includes required by vma.c in vma_internal.h, > > which is simply imported by vma.c. This makes the VMA functionality > > testable, as userland testing code can simply stub out functionality > > as required. > > My initial thought on vma_internal.h would be to contain the number of > 'helper' functions and internal structures while mm/vma.h would have the > interface. This is what I've done though? > > In this way, we could include mm/vma.h into mm/internal.h (which most > files you've edited already has included), and any special cases > (mmu_notifier.c, etc) would need the addition. vma_internal.h would > have only things needed in the vma.c file. Happy to change it so mm/internal.h imports mm/vma_internal.h; it does make sense for one to include the other. > > On testing, we could use the header guards to exclude what we wanted by > either just #defining the right guard, or by making an entirely new > header with a duplicate guard with the necessary stubs/functions. Yeah, this is a nice idea; as mentioned on the other patch, I will be doing this. > > > > > Signed-off-by: Lorenzo Stoakes <lstoakes@xxxxxxxxx> > > --- > > include/linux/mm.h | 35 - > > mm/Makefile | 2 +- > > mm/gup.c | 1 + > > mm/huge_memory.c | 1 + > > mm/internal.h | 227 +---- > > mm/madvise.c | 1 + > > mm/memory.c | 1 + > > mm/mempolicy.c | 1 + > > mm/mlock.c | 1 + > > mm/mmap.c | 1983 +++----------------------------------------- > > mm/mmu_notifier.c | 2 + > > mm/mprotect.c | 1 + > > mm/mremap.c | 1 + > > mm/mseal.c | 2 + > > mm/rmap.c | 1 + > > mm/userfaultfd.c | 2 + > > mm/vma.c | 1766 +++++++++++++++++++++++++++++++++++++++ > > mm/vma.h | 356 ++++++++ > > mm/vma_internal.h | 143 ++++ > > 19 files changed, 2389 insertions(+), 2138 deletions(-) > > create mode 100644 mm/vma.c > > create mode 100644 mm/vma.h > > create mode 100644 mm/vma_internal.h > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index e3220439cf75..31f85db029b8 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -1004,21 +1004,6 @@ struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) > > return mas_prev_range(&vmi->mas, 0); > > } > > > > -static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) > > -{ > > - return vmi->mas.index; > > -} > > - > > -static inline unsigned long vma_iter_end(struct vma_iterator *vmi) > > -{ > > - return vmi->mas.last + 1; > > -} > > -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, > > - unsigned long count) > > -{ > > - return mas_expected_entries(&vmi->mas, count); > > -} > > - > > static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, > > unsigned long start, unsigned long end, gfp_t gfp) > > { > > @@ -2548,21 +2533,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, > > #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ > > MM_CP_UFFD_WP_RESOLVE) > > > > -bool vma_needs_dirty_tracking(struct vm_area_struct *vma); >
> -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); > > -static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) > > -{ > > - /* > > - * We want to check manually if we can change individual PTEs writable > > - * if we can't do that automatically for all PTEs in a mapping. For > > - * private mappings, that's always the case when we have write > > - * permissions as we properly have to handle COW. > > - */ > > - if (vma->vm_flags & VM_SHARED) > > - return vma_wants_writenotify(vma, vma->vm_page_prot); > > - return !!(vma->vm_flags & VM_WRITE); > > - > > -} > > bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, > > pte_t pte); > > extern long change_protection(struct mmu_gather *tlb, > > @@ -3277,12 +3247,7 @@ extern int vma_expand_bottom(struct vma_iterator *vmi, struct vm_area_struct *vm > > unsigned long shift, struct vm_area_struct **next); > > extern int vma_shrink_top(struct vma_iterator *vmi, struct vm_area_struct *vma, > > unsigned long shift); > > -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); > > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); > > -extern void unlink_file_vma(struct vm_area_struct *); > > -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, > > - unsigned long addr, unsigned long len, pgoff_t pgoff, > > - bool *need_rmap_locks); > > extern void exit_mmap(struct mm_struct *); > > > > static inline int check_data_rlimit(unsigned long rlim, > > diff --git a/mm/Makefile b/mm/Makefile > > index d2915f8c9dc0..140a22654dde 100644 > > --- a/mm/Makefile > > +++ b/mm/Makefile > > @@ -37,7 +37,7 @@ mmu-y := nommu.o > > mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ > > mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ > > msync.o page_vma_mapped.o pagewalk.o \ > > - pgtable-generic.o rmap.o vmalloc.o > > + pgtable-generic.o rmap.o vmalloc.o vma.o > > > > > > ifdef CONFIG_CROSS_MEMORY_ATTACH > > diff --git a/mm/gup.c b/mm/gup.c > > index 8bea9ad80984..34b846352679 100644 > > --- a/mm/gup.c > > +++ b/mm/gup.c > > @@ -26,6 +26,7 @@ > > #include <asm/tlbflush.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > struct follow_page_context { > > struct dev_pagemap *pgmap; > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > index c7ce28f6b7f3..de6f150ed97b 100644 > > --- a/mm/huge_memory.c > > +++ b/mm/huge_memory.c > > @@ -44,6 +44,7 @@ > > #include <asm/tlb.h> > > #include <asm/pgalloc.h> > > #include "internal.h" > > +#include "vma.h" > > #include "swap.h" > > > > #define CREATE_TRACE_POINTS > > diff --git a/mm/internal.h b/mm/internal.h > > index f7779727bb78..76b4821cd751 100644 > > --- a/mm/internal.h > > +++ b/mm/internal.h > > @@ -8,7 +8,9 @@ > > #define __MM_INTERNAL_H > > > > #include <linux/fs.h> > > +#include <linux/khugepaged.h> > > #include <linux/mm.h> > > +#include <linux/mm_inline.h> > > #include <linux/pagemap.h> > > #include <linux/rmap.h> > > #include <linux/swap.h> > > @@ -778,37 +780,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) > > return list_empty(&area->free_list[migratetype]); > > } > > > > -/* > > - * These three helpers classifies VMAs for virtual memory accounting. 
> > - */ > > - > > -/* > > - * Executable code area - executable, not writable, not stack > > - */ > > -static inline bool is_exec_mapping(vm_flags_t flags) > > -{ > > - return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; > > -} > > - > > -/* > > - * Stack area (including shadow stacks) > > - * > > - * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: > > - * do_mmap() forbids all other combinations. > > - */ > > -static inline bool is_stack_mapping(vm_flags_t flags) > > -{ > > - return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); > > -} > > - > > -/* > > - * Data area - private, writable, not stack > > - */ > > -static inline bool is_data_mapping(vm_flags_t flags) > > -{ > > - return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; > > -} > > - > > /* mm/util.c */ > > struct anon_vma *folio_anon_vma(struct folio *folio); > > > > @@ -1237,80 +1208,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, > > void touch_pmd(struct vm_area_struct *vma, unsigned long addr, > > pmd_t *pmd, bool write); > > > > -/* > > - * mm/mmap.c > > - */ > > -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > - struct vm_area_struct *vma, > > - unsigned long delta); > > - > > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, > > - struct vm_area_struct *prev, > > - struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, > > - unsigned long vm_flags, > > - struct mempolicy *policy, > > - struct vm_userfaultfd_ctx uffd_ctx, > > - struct anon_vma_name *anon_name); > > - > > -/* We are about to modify the VMA's flags. */ > > -static inline struct vm_area_struct > > -*vma_modify_flags(struct vma_iterator *vmi, > > - struct vm_area_struct *prev, > > - struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, > > - unsigned long new_flags) > > -{ > > - return vma_modify(vmi, prev, vma, start, end, new_flags, > > - vma_policy(vma), vma->vm_userfaultfd_ctx, > > - anon_vma_name(vma)); > > -} > > - > > -/* We are about to modify the VMA's flags and/or anon_name. */ > > -static inline struct vm_area_struct > > -*vma_modify_flags_name(struct vma_iterator *vmi, > > - struct vm_area_struct *prev, > > - struct vm_area_struct *vma, > > - unsigned long start, > > - unsigned long end, > > - unsigned long new_flags, > > - struct anon_vma_name *new_name) > > -{ > > - return vma_modify(vmi, prev, vma, start, end, new_flags, > > - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); > > -} > > - > > -/* We are about to modify the VMA's memory policy. */ > > -static inline struct vm_area_struct > > -*vma_modify_policy(struct vma_iterator *vmi, > > - struct vm_area_struct *prev, > > - struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, > > - struct mempolicy *new_pol) > > -{ > > - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, > > - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); > > -} > > - > > -/* We are about to modify the VMA's flags and/or uffd context. 
*/ > > -static inline struct vm_area_struct > > -*vma_modify_flags_uffd(struct vma_iterator *vmi, > > - struct vm_area_struct *prev, > > - struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, > > - unsigned long new_flags, > > - struct vm_userfaultfd_ctx new_ctx) > > -{ > > - return vma_modify(vmi, prev, vma, start, end, new_flags, > > - vma_policy(vma), new_ctx, anon_vma_name(vma)); > > -} > > - > > -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, pgoff_t pgoff, > > - struct vm_area_struct *next); > > -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, pgoff_t pgoff); > > - > > enum { > > /* mark page accessed */ > > FOLL_TOUCH = 1 << 16, > > @@ -1437,117 +1334,6 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte > > return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); > > } > > > > -static inline void vma_iter_config(struct vma_iterator *vmi, > > - unsigned long index, unsigned long last) > > -{ > > - __mas_set_range(&vmi->mas, index, last - 1); > > -} > > - > > -static inline void vma_iter_reset(struct vma_iterator *vmi) > > -{ > > - mas_reset(&vmi->mas); > > -} > > - > > -static inline > > -struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) > > -{ > > - return mas_prev_range(&vmi->mas, min); > > -} > > - > > -static inline > > -struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) > > -{ > > - return mas_next_range(&vmi->mas, max); > > -} > > - > > -static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, > > - unsigned long max, unsigned long size) > > -{ > > - return mas_empty_area(&vmi->mas, min, max - 1, size); > > -} > > - > > -static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, > > - unsigned long max, unsigned long size) > > -{ > > - return mas_empty_area_rev(&vmi->mas, min, max - 1, size); > > -} > > - > > -/* > > - * VMA Iterator functions shared between nommu and mmap > > - */ > > -static inline int vma_iter_prealloc(struct vma_iterator *vmi, > > - struct vm_area_struct *vma) > > -{ > > - return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); > > -} > > - > > -static inline void vma_iter_clear(struct vma_iterator *vmi) > > -{ > > - mas_store_prealloc(&vmi->mas, NULL); > > -} > > - > > -static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) > > -{ > > - return mas_walk(&vmi->mas); > > -} > > - > > -/* Store a VMA with preallocated memory */ > > -static inline void vma_iter_store(struct vma_iterator *vmi, > > - struct vm_area_struct *vma) > > -{ > > - > > -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) > > - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && > > - vmi->mas.index > vma->vm_start)) { > > - pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", > > - vmi->mas.index, vma->vm_start, vma->vm_start, > > - vma->vm_end, vmi->mas.index, vmi->mas.last); > > - } > > - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && > > - vmi->mas.last < vma->vm_start)) { > > - pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", > > - vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, > > - vmi->mas.index, vmi->mas.last); > > - } > > -#endif > > - > > - if (vmi->mas.status != ma_start && > > - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) > > - vma_iter_invalidate(vmi); > > - > > - 
__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); > > - mas_store_prealloc(&vmi->mas, vma); > > -} > > - > > -static inline int vma_iter_store_gfp(struct vma_iterator *vmi, > > - struct vm_area_struct *vma, gfp_t gfp) > > -{ > > - if (vmi->mas.status != ma_start && > > - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) > > - vma_iter_invalidate(vmi); > > - > > - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); > > - mas_store_gfp(&vmi->mas, vma, gfp); > > - if (unlikely(mas_is_err(&vmi->mas))) > > - return -ENOMEM; > > - > > - return 0; > > -} > > - > > -/* > > - * VMA lock generalization > > - */ > > -struct vma_prepare { > > - struct vm_area_struct *vma; > > - struct vm_area_struct *adj_next; > > - struct file *file; > > - struct address_space *mapping; > > - struct anon_vma *anon_vma; > > - struct vm_area_struct *insert; > > - struct vm_area_struct *remove; > > - struct vm_area_struct *remove2; > > -}; > > - > > void __meminit __init_single_page(struct page *page, unsigned long pfn, > > unsigned long zone, int nid); > > > > @@ -1636,13 +1422,4 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, > > void workingset_update_node(struct xa_node *node); > > extern struct list_lru shadow_nodes; > > > > -struct unlink_vma_file_batch { > > - int count; > > - struct vm_area_struct *vmas[8]; > > -}; > > - > > -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *); > > -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *); > > -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *); > > - > > #endif /* __MM_INTERNAL_H */ > > diff --git a/mm/madvise.c b/mm/madvise.c > > index 96c026fe0c99..42f62a8efd71 100644 > > --- a/mm/madvise.c > > +++ b/mm/madvise.c > > @@ -35,6 +35,7 @@ > > #include <asm/tlb.h> > > > > #include "internal.h" > > +#include "vma.h" > > #include "swap.h" > > > > struct madvise_walk_private { > > diff --git a/mm/memory.c b/mm/memory.c > > index 0a769f34bbb2..a2ca7df9d2cf 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -90,6 +90,7 @@ > > > > #include "pgalloc-track.h" > > #include "internal.h" > > +#include "vma.h" > > #include "swap.h" > > > > #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > > index f73acb01ad45..3dad2b52f319 100644 > > --- a/mm/mempolicy.c > > +++ b/mm/mempolicy.c > > @@ -115,6 +115,7 @@ > > #include <linux/uaccess.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > /* Internal flags */ > > #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ > > diff --git a/mm/mlock.c b/mm/mlock.c > > index 52d6e401ad67..ac84378bb796 100644 > > --- a/mm/mlock.c > > +++ b/mm/mlock.c > > @@ -27,6 +27,7 @@ > > #include <linux/secretmem.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > struct mlock_fbatch { > > local_lock_t lock; > > diff --git a/mm/mmap.c b/mm/mmap.c > > index 574e69a04ebe..b4f7c1ea3f0f 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -57,6 +57,7 @@ > > #include <trace/events/mmap.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > #ifndef arch_mmap_check > > #define arch_mmap_check(addr, len, flags) (0) > > @@ -76,16 +77,6 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; > > static bool ignore_rlimit_data; > > core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); > > > > -static void unmap_region(struct mm_struct *mm, struct ma_state *mas, > 
> - struct vm_area_struct *vma, struct vm_area_struct *prev, > > - struct vm_area_struct *next, unsigned long start, > > - unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > - > > -static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) > > -{ > > - return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); > > -} > > - > > /* Update vma->vm_page_prot to reflect vma->vm_flags. */ > > void vma_set_page_prot(struct vm_area_struct *vma) > > { > > @@ -101,100 +92,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) > > WRITE_ONCE(vma->vm_page_prot, vm_page_prot); > > } > > > > -/* > > - * Requires inode->i_mapping->i_mmap_rwsem > > - */ > > -static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_unmap_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_remove(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > -/* > > - * Unlink a file-based vm structure from its interval tree, to hide > > - * vma from rmap and vmtruncate before freeing its page tables. > > - */ > > -void unlink_file_vma(struct vm_area_struct *vma) > > -{ > > - struct file *file = vma->vm_file; > > - > > - if (file) { > > - struct address_space *mapping = file->f_mapping; > > - i_mmap_lock_write(mapping); > > - __remove_shared_vm_struct(vma, mapping); > > - i_mmap_unlock_write(mapping); > > - } > > -} > > - > > -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > > -{ > > - vb->count = 0; > > -} > > - > > -static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) > > -{ > > - struct address_space *mapping; > > - int i; > > - > > - mapping = vb->vmas[0]->vm_file->f_mapping; > > - i_mmap_lock_write(mapping); > > - for (i = 0; i < vb->count; i++) { > > - VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); > > - __remove_shared_vm_struct(vb->vmas[i], mapping); > > - } > > - i_mmap_unlock_write(mapping); > > - > > - unlink_file_vma_batch_init(vb); > > -} > > - > > -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, > > - struct vm_area_struct *vma) > > -{ > > - if (vma->vm_file == NULL) > > - return; > > - > > - if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || > > - vb->count == ARRAY_SIZE(vb->vmas)) > > - unlink_file_vma_batch_process(vb); > > - > > - vb->vmas[vb->count] = vma; > > - vb->count++; > > -} > > - > > -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) > > -{ > > - if (vb->count > 0) > > - unlink_file_vma_batch_process(vb); > > -} > > - > > -/* > > - * Close a vm structure and free it. > > - */ > > -static void remove_vma(struct vm_area_struct *vma, bool unreachable) > > -{ > > - might_sleep(); > > - if (vma->vm_ops && vma->vm_ops->close) > > - vma->vm_ops->close(vma); > > - if (vma->vm_file) > > - fput(vma->vm_file); > > - mpol_put(vma_policy(vma)); > > - if (unreachable) > > - __vm_area_free(vma); > > - else > > - vm_area_free(vma); > > -} > > - > > -static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, > > - unsigned long min) > > -{ > > - return mas_prev(&vmi->mas, min); > > -} > > - > > /* > > * check_brk_limits() - Use platform specific check of range & verify mlock > > * limits. > > @@ -298,893 +195,24 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) > > brkvma = vma_prev_limit(&vmi, mm->start_brk); > > /* Ok, looks good - let it rip. 
*/ > > if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) > > - goto out; > > - > > - mm->brk = brk; > > - if (mm->def_flags & VM_LOCKED) > > - populate = true; > > - > > -success: > > - mmap_write_unlock(mm); > > -success_unlocked: > > - userfaultfd_unmap_complete(mm, &uf); > > - if (populate) > > - mm_populate(oldbrk, newbrk - oldbrk); > > - return brk; > > - > > -out: > > - mm->brk = origbrk; > > - mmap_write_unlock(mm); > > - return origbrk; > > -} > > - > > -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) > > -static void validate_mm(struct mm_struct *mm) > > -{ > > - int bug = 0; > > - int i = 0; > > - struct vm_area_struct *vma; > > - VMA_ITERATOR(vmi, mm, 0); > > - > > - mt_validate(&mm->mm_mt); > > - for_each_vma(vmi, vma) { > > -#ifdef CONFIG_DEBUG_VM_RB > > - struct anon_vma *anon_vma = vma->anon_vma; > > - struct anon_vma_chain *avc; > > -#endif > > - unsigned long vmi_start, vmi_end; > > - bool warn = 0; > > - > > - vmi_start = vma_iter_addr(&vmi); > > - vmi_end = vma_iter_end(&vmi); > > - if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) > > - warn = 1; > > - > > - if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) > > - warn = 1; > > - > > - if (warn) { > > - pr_emerg("issue in %s\n", current->comm); > > - dump_stack(); > > - dump_vma(vma); > > - pr_emerg("tree range: %px start %lx end %lx\n", vma, > > - vmi_start, vmi_end - 1); > > - vma_iter_dump_tree(&vmi); > > - } > > - > > -#ifdef CONFIG_DEBUG_VM_RB > > - if (anon_vma) { > > - anon_vma_lock_read(anon_vma); > > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > - anon_vma_interval_tree_verify(avc); > > - anon_vma_unlock_read(anon_vma); > > - } > > -#endif > > - i++; > > - } > > - if (i != mm->map_count) { > > - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); > > - bug = 1; > > - } > > - VM_BUG_ON_MM(bug, mm); > > -} > > - > > -#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ > > -#define validate_mm(mm) do { } while (0) > > -#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > - > > -/* > > - * vma has some anon_vma assigned, and is already inserted on that > > - * anon_vma's interval trees. > > - * > > - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the > > - * vma must be removed from the anon_vma's interval trees using > > - * anon_vma_interval_tree_pre_update_vma(). > > - * > > - * After the update, the vma will be reinserted using > > - * anon_vma_interval_tree_post_update_vma(). > > - * > > - * The entire update must be protected by exclusive mmap_lock and by > > - * the root anon_vma's mutex. 
> > - */ > > -static inline void > > -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) > > -{ > > - struct anon_vma_chain *avc; > > - > > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); > > -} > > - > > -static inline void > > -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > > -{ > > - struct anon_vma_chain *avc; > > - > > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > > -} > > - > > -static unsigned long count_vma_pages_range(struct mm_struct *mm, > > - unsigned long addr, unsigned long end) > > -{ > > - VMA_ITERATOR(vmi, mm, addr); > > - struct vm_area_struct *vma; > > - unsigned long nr_pages = 0; > > - > > - for_each_vma_range(vmi, vma, end) { > > - unsigned long vm_start = max(addr, vma->vm_start); > > - unsigned long vm_end = min(end, vma->vm_end); > > - > > - nr_pages += PHYS_PFN(vm_end - vm_start); > > - } > > - > > - return nr_pages; > > -} > > - > > -static void __vma_link_file(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_allow_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_insert(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > -static void vma_link_file(struct vm_area_struct *vma) > > -{ > > - struct file *file = vma->vm_file; > > - struct address_space *mapping; > > - > > - if (file) { > > - mapping = file->f_mapping; > > - i_mmap_lock_write(mapping); > > - __vma_link_file(vma, mapping); > > - i_mmap_unlock_write(mapping); > > - } > > -} > > - > > -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) > > -{ > > - VMA_ITERATOR(vmi, mm, 0); > > - > > - vma_iter_config(&vmi, vma->vm_start, vma->vm_end); > > - if (vma_iter_prealloc(&vmi, vma)) > > - return -ENOMEM; > > - > > - vma_start_write(vma); > > - vma_iter_store(&vmi, vma); > > - vma_link_file(vma); > > - mm->map_count++; > > - validate_mm(mm); > > - return 0; > > -} > > - > > -/* > > - * init_multi_vma_prep() - Initializer for struct vma_prepare > > - * @vp: The vma_prepare struct > > - * @vma: The vma that will be altered once locked > > - * @next: The next vma if it is to be adjusted > > - * @remove: The first vma to be removed > > - * @remove2: The second vma to be removed > > - */ > > -static inline void init_multi_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma, struct vm_area_struct *next, > > - struct vm_area_struct *remove, struct vm_area_struct *remove2) > > -{ > > - memset(vp, 0, sizeof(struct vma_prepare)); > > - vp->vma = vma; > > - vp->anon_vma = vma->anon_vma; > > - vp->remove = remove; > > - vp->remove2 = remove2; > > - vp->adj_next = next; > > - if (!vp->anon_vma && next) > > - vp->anon_vma = next->anon_vma; > > - > > - vp->file = vma->vm_file; > > - if (vp->file) > > - vp->mapping = vma->vm_file->f_mapping; > > - > > -} > > - > > -/* > > - * init_vma_prep() - Initializer wrapper for vma_prepare struct > > - * @vp: The vma_prepare struct > > - * @vma: The vma that will be altered once locked > > - */ > > -static inline void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma) > > -{ > > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > -} > > - > > - > > -/* > > - * vma_prepare() - Helper function for handling locking VMAs prior to altering > > - * @vp: The initialized vma_prepare struct > > - */ > > 
-static inline void vma_prepare(struct vma_prepare *vp) > > -{ > > - if (vp->file) { > > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > - > > - if (vp->adj_next) > > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > - vp->adj_next->vm_end); > > - > > - i_mmap_lock_write(vp->mapping); > > - if (vp->insert && vp->insert->vm_file) { > > - /* > > - * Put into interval tree now, so instantiated pages > > - * are visible to arm/parisc __flush_dcache_page > > - * throughout; but we cannot insert into address > > - * space until vma start or end is updated. > > - */ > > - __vma_link_file(vp->insert, > > - vp->insert->vm_file->f_mapping); > > - } > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_lock_write(vp->anon_vma); > > - anon_vma_interval_tree_pre_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > - } > > - > > - if (vp->file) { > > - flush_dcache_mmap_lock(vp->mapping); > > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > - if (vp->adj_next) > > - vma_interval_tree_remove(vp->adj_next, > > - &vp->mapping->i_mmap); > > - } > > - > > -} > > - > > -/* > > - * vma_complete- Helper function for handling the unlocking after altering VMAs, > > - * or for inserting a VMA. > > - * > > - * @vp: The vma_prepare struct > > - * @vmi: The vma iterator > > - * @mm: The mm_struct > > - */ > > -static inline void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm) > > -{ > > - if (vp->file) { > > - if (vp->adj_next) > > - vma_interval_tree_insert(vp->adj_next, > > - &vp->mapping->i_mmap); > > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > - flush_dcache_mmap_unlock(vp->mapping); > > - } > > - > > - if (vp->remove && vp->file) { > > - __remove_shared_vm_struct(vp->remove, vp->mapping); > > - if (vp->remove2) > > - __remove_shared_vm_struct(vp->remove2, vp->mapping); > > - } else if (vp->insert) { > > - /* > > - * split_vma has split insert from vma, and needs > > - * us to insert it before dropping the locks > > - * (it may either follow vma or precede it). 
> > - */ > > - vma_iter_store(vmi, vp->insert); > > - mm->map_count++; > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_interval_tree_post_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_post_update_vma(vp->adj_next); > > - anon_vma_unlock_write(vp->anon_vma); > > - } > > - > > - if (vp->file) { > > - i_mmap_unlock_write(vp->mapping); > > - uprobe_mmap(vp->vma); > > - > > - if (vp->adj_next) > > - uprobe_mmap(vp->adj_next); > > - } > > - > > - if (vp->remove) { > > -again: > > - vma_mark_detached(vp->remove, true); > > - if (vp->file) { > > - uprobe_munmap(vp->remove, vp->remove->vm_start, > > - vp->remove->vm_end); > > - fput(vp->file); > > - } > > - if (vp->remove->anon_vma) > > - anon_vma_merge(vp->vma, vp->remove); > > - mm->map_count--; > > - mpol_put(vma_policy(vp->remove)); > > - if (!vp->remove2) > > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > - vm_area_free(vp->remove); > > - > > - /* > > - * In mprotect's case 6 (see comments on vma_merge), > > - * we are removing both mid and next vmas > > - */ > > - if (vp->remove2) { > > - vp->remove = vp->remove2; > > - vp->remove2 = NULL; > > - goto again; > > - } > > - } > > - if (vp->insert && vp->file) > > - uprobe_mmap(vp->insert); > > - validate_mm(mm); > > -} > > - > > -/* > > - * dup_anon_vma() - Helper function to duplicate anon_vma > > - * @dst: The destination VMA > > - * @src: The source VMA > > - * @dup: Pointer to the destination VMA when successful. > > - * > > - * Returns: 0 on success. > > - */ > > -static inline int dup_anon_vma(struct vm_area_struct *dst, > > - struct vm_area_struct *src, struct vm_area_struct **dup) > > -{ > > - /* > > - * Easily overlooked: when mprotect shifts the boundary, make sure the > > - * expanding vma has anon_vma set if the shrinking vma had, to cover any > > - * anon pages imported. > > - */ > > - if (src->anon_vma && !dst->anon_vma) { > > - int ret; > > - > > - vma_assert_write_locked(dst); > > - dst->anon_vma = src->anon_vma; > > - ret = anon_vma_clone(dst, src); > > - if (ret) > > - return ret; > > - > > - *dup = dst; > > - } > > - > > - return 0; > > -} > > - > > -/* > > - * vma_expand - Expand an existing VMA > > - * > > - * @vmi: The vma iterator > > - * @vma: The vma to expand > > - * @start: The start of the vma > > - * @end: The exclusive end of the vma > > - * @pgoff: The page offset of vma > > - * @next: The current of next vma. > > - * > > - * Expand @vma to @start and @end. Can expand off the start and end. Will > > - * expand over @next if it's different from @vma and @end == @next->vm_end. > > - * Checking if the @vma can expand and merge with @next needs to be handled by > > - * the caller. > > - * > > - * Returns: 0 on success > > - */ > > -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, pgoff_t pgoff, > > - struct vm_area_struct *next) > > -{ > > - struct vm_area_struct *anon_dup = NULL; > > - bool remove_next = false; > > - struct vma_prepare vp; > > - > > - vma_start_write(vma); > > - if (next && (vma != next) && (end == next->vm_end)) { > > - int ret; > > - > > - remove_next = true; > > - vma_start_write(next); > > - ret = dup_anon_vma(vma, next, &anon_dup); > > - if (ret) > > - return ret; > > - } > > - > > - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); > > - /* Not merging but overwriting any part of next is not handled. 
*/ > > - VM_WARN_ON(next && !vp.remove && > > - next != vma && end > next->vm_start); > > - /* Only handles expanding */ > > - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); > > - > > - /* Note: vma iterator must be pointing to 'start' */ > > - vma_iter_config(vmi, start, end); > > - if (vma_iter_prealloc(vmi, vma)) > > - goto nomem; > > - > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, start, end, 0); > > - vma_set_range(vma, start, end, pgoff); > > - vma_iter_store(vmi, vma); > > - > > - vma_complete(&vp, vmi, vma->vm_mm); > > - return 0; > > - > > -nomem: > > - if (anon_dup) > > - unlink_anon_vmas(anon_dup); > > - return -ENOMEM; > > -} > > - > > -/* > > - * vma_shrink() - Reduce an existing VMAs memory area > > - * @vmi: The vma iterator > > - * @vma: The VMA to modify > > - * @start: The new start > > - * @end: The new end > > - * > > - * Returns: 0 on success, -ENOMEM otherwise > > - */ > > -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, pgoff_t pgoff) > > -{ > > - struct vma_prepare vp; > > - > > - WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); > > - > > - if (vma->vm_start < start) > > - vma_iter_config(vmi, vma->vm_start, start); > > - else > > - vma_iter_config(vmi, end, vma->vm_end); > > - > > - if (vma_iter_prealloc(vmi, NULL)) > > - return -ENOMEM; > > - > > - vma_start_write(vma); > > - > > - init_vma_prep(&vp, vma); > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, start, end, 0); > > - > > - vma_iter_clear(vmi); > > - vma_set_range(vma, start, end, pgoff); > > - vma_complete(&vp, vmi, vma->vm_mm); > > - return 0; > > -} > > - > > -/* > > - * If the vma has a ->close operation then the driver probably needs to release > > - * per-vma resources, so we don't attempt to merge those if the caller indicates > > - * the current vma may be removed as part of the merge. > > - */ > > -static inline bool is_mergeable_vma(struct vm_area_struct *vma, > > - struct file *file, unsigned long vm_flags, > > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > - struct anon_vma_name *anon_name, bool may_remove_vma) > > -{ > > - /* > > - * VM_SOFTDIRTY should not prevent from VMA merging, if we > > - * match the flags but dirty bit -- the caller should mark > > - * merged VMA as dirty. If dirty bit won't be excluded from > > - * comparison, we increase pressure on the memory system forcing > > - * the kernel to generate new VMAs when old one could be > > - * extended instead. > > - */ > > - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) > > - return false; > > - if (vma->vm_file != file) > > - return false; > > - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) > > - return false; > > - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) > > - return false; > > - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) > > - return false; > > - return true; > > -} > > - > > -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, > > - struct anon_vma *anon_vma2, struct vm_area_struct *vma) > > -{ > > - /* > > - * The list_is_singular() test is to avoid merging VMA cloned from > > - * parents. This can improve scalability caused by anon_vma lock. 
> > - */ > > - if ((!anon_vma1 || !anon_vma2) && (!vma || > > - list_is_singular(&vma->anon_vma_chain))) > > - return true; > > - return anon_vma1 == anon_vma2; > > -} > > - > > -/* > > - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) > > - * in front of (at a lower virtual address and file offset than) the vma. > > - * > > - * We cannot merge two vmas if they have differently assigned (non-NULL) > > - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. > > - * > > - * We don't check here for the merged mmap wrapping around the end of pagecache > > - * indices (16TB on ia32) because do_mmap() does not permit mmap's which > > - * wrap, nor mmaps which cover the final page at index -1UL. > > - * > > - * We assume the vma may be removed as part of the merge. > > - */ > > -static bool > > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, > > - struct anon_vma *anon_vma, struct file *file, > > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > - struct anon_vma_name *anon_name) > > -{ > > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && > > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { > > - if (vma->vm_pgoff == vm_pgoff) > > - return true; > > - } > > - return false; > > -} > > - > > -/* > > - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) > > - * beyond (at a higher virtual address and file offset than) the vma. > > - * > > - * We cannot merge two vmas if they have differently assigned (non-NULL) > > - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. > > - * > > - * We assume that vma is not removed as part of the merge. > > - */ > > -static bool > > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, > > - struct anon_vma *anon_vma, struct file *file, > > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > - struct anon_vma_name *anon_name) > > -{ > > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && > > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { > > - pgoff_t vm_pglen; > > - vm_pglen = vma_pages(vma); > > - if (vma->vm_pgoff + vm_pglen == vm_pgoff) > > - return true; > > - } > > - return false; > > -} > > - > > -/* > > - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), > > - * figure out whether that can be merged with its predecessor or its > > - * successor. Or both (it neatly fills a hole). > > - * > > - * In most cases - when called for mmap, brk or mremap - [addr,end) is > > - * certain not to be mapped by the time vma_merge is called; but when > > - * called for mprotect, it is certain to be already mapped (either at > > - * an offset within prev, or at the start of next), and the flags of > > - * this area are about to be changed to vm_flags - and the no-change > > - * case has already been eliminated. 
> > - * > > - * The following mprotect cases have to be considered, where **** is > > - * the area passed down from mprotect_fixup, never extending beyond one > > - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts > > - * at the same address as **** and is of the same or larger span, and > > - * NNNN the next vma after ****: > > - * > > - * **** **** **** > > - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC > > - * cannot merge might become might become > > - * PPNNNNNNNNNN PPPPPPPPPPCC > > - * mmap, brk or case 4 below case 5 below > > - * mremap move: > > - * **** **** > > - * PPPP NNNN PPPPCCCCNNNN > > - * might become might become > > - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or > > - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or > > - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 > > - * > > - * It is important for case 8 that the vma CCCC overlapping the > > - * region **** is never going to extended over NNNN. Instead NNNN must > > - * be extended in region **** and CCCC must be removed. This way in > > - * all cases where vma_merge succeeds, the moment vma_merge drops the > > - * rmap_locks, the properties of the merged vma will be already > > - * correct for the whole merged range. Some of those properties like > > - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must > > - * be correct for the whole merged range immediately after the > > - * rmap_locks are released. Otherwise if NNNN would be removed and > > - * CCCC would be extended over the NNNN range, remove_migration_ptes > > - * or other rmap walkers (if working on addresses beyond the "end" > > - * parameter) may establish ptes with the wrong permissions of CCCC > > - * instead of the right permissions of NNNN. > > - * > > - * In the code below: > > - * PPPP is represented by *prev > > - * CCCC is represented by *curr or not represented at all (NULL) > > - * NNNN is represented by *next or not represented at all (NULL) > > - * **** is not represented - it will be merged and the vma containing the > > - * area is returned, or the function will return NULL > > - */ > > -static struct vm_area_struct > > -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, > > - struct vm_area_struct *src, unsigned long addr, unsigned long end, > > - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, > > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > - struct anon_vma_name *anon_name) > > -{ > > - struct mm_struct *mm = src->vm_mm; > > - struct anon_vma *anon_vma = src->anon_vma; > > - struct file *file = src->vm_file; > > - struct vm_area_struct *curr, *next, *res; > > - struct vm_area_struct *vma, *adjust, *remove, *remove2; > > - struct vm_area_struct *anon_dup = NULL; > > - struct vma_prepare vp; > > - pgoff_t vma_pgoff; > > - int err = 0; > > - bool merge_prev = false; > > - bool merge_next = false; > > - bool vma_expanded = false; > > - unsigned long vma_start = addr; > > - unsigned long vma_end = end; > > - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; > > - long adj_start = 0; > > - > > - /* > > - * We later require that vma->vm_flags == vm_flags, > > - * so this tests vma->vm_flags & VM_SPECIAL, too. > > - */ > > - if (vm_flags & VM_SPECIAL) > > - return NULL; > > - > > - /* Does the input range span an existing VMA? (cases 5 - 8) */ > > - curr = find_vma_intersection(mm, prev ? 
prev->vm_end : 0, end); > > - > > - if (!curr || /* cases 1 - 4 */ > > - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ > > - next = vma_lookup(mm, end); > > - else > > - next = NULL; /* case 5 */ > > - > > - if (prev) { > > - vma_start = prev->vm_start; > > - vma_pgoff = prev->vm_pgoff; > > - > > - /* Can we merge the predecessor? */ > > - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) > > - && can_vma_merge_after(prev, vm_flags, anon_vma, file, > > - pgoff, vm_userfaultfd_ctx, anon_name)) { > > - merge_prev = true; > > - vma_prev(vmi); > > - } > > - } > > - > > - /* Can we merge the successor? */ > > - if (next && mpol_equal(policy, vma_policy(next)) && > > - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, > > - vm_userfaultfd_ctx, anon_name)) { > > - merge_next = true; > > - } > > - > > - /* Verify some invariant that must be enforced by the caller. */ > > - VM_WARN_ON(prev && addr <= prev->vm_start); > > - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); > > - VM_WARN_ON(addr >= end); > > - > > - if (!merge_prev && !merge_next) > > - return NULL; /* Not mergeable. */ > > - > > - if (merge_prev) > > - vma_start_write(prev); > > - > > - res = vma = prev; > > - remove = remove2 = adjust = NULL; > > - > > - /* Can we merge both the predecessor and the successor? */ > > - if (merge_prev && merge_next && > > - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { > > - vma_start_write(next); > > - remove = next; /* case 1 */ > > - vma_end = next->vm_end; > > - err = dup_anon_vma(prev, next, &anon_dup); > > - if (curr) { /* case 6 */ > > - vma_start_write(curr); > > - remove = curr; > > - remove2 = next; > > - /* > > - * Note that the dup_anon_vma below cannot overwrite err > > - * since the first caller would do nothing unless next > > - * has an anon_vma. > > - */ > > - if (!next->anon_vma) > > - err = dup_anon_vma(prev, curr, &anon_dup); > > - } > > - } else if (merge_prev) { /* case 2 */ > > - if (curr) { > > - vma_start_write(curr); > > - if (end == curr->vm_end) { /* case 7 */ > > - /* > > - * can_vma_merge_after() assumed we would not be > > - * removing prev vma, so it skipped the check > > - * for vm_ops->close, but we are removing curr > > - */ > > - if (curr->vm_ops && curr->vm_ops->close) > > - err = -EINVAL; > > - remove = curr; > > - } else { /* case 5 */ > > - adjust = curr; > > - adj_start = (end - curr->vm_start); > > - } > > - if (!err) > > - err = dup_anon_vma(prev, curr, &anon_dup); > > - } > > - } else { /* merge_next */ > > - vma_start_write(next); > > - res = next; > > - if (prev && addr < prev->vm_end) { /* case 4 */ > > - vma_start_write(prev); > > - vma_end = addr; > > - adjust = next; > > - adj_start = -(prev->vm_end - addr); > > - err = dup_anon_vma(next, prev, &anon_dup); > > - } else { > > - /* > > - * Note that cases 3 and 8 are the ONLY ones where prev > > - * is permitted to be (but is not necessarily) NULL. > > - */ > > - vma = next; /* case 3 */ > > - vma_start = addr; > > - vma_end = next->vm_end; > > - vma_pgoff = next->vm_pgoff - pglen; > > - if (curr) { /* case 8 */ > > - vma_pgoff = curr->vm_pgoff; > > - vma_start_write(curr); > > - remove = curr; > > - err = dup_anon_vma(next, curr, &anon_dup); > > - } > > - } > > - } > > - > > - /* Error in anon_vma clone. 
*/ > > - if (err) > > - goto anon_vma_fail; > > - > > - if (vma_start < vma->vm_start || vma_end > vma->vm_end) > > - vma_expanded = true; > > - > > - if (vma_expanded) { > > - vma_iter_config(vmi, vma_start, vma_end); > > - } else { > > - vma_iter_config(vmi, adjust->vm_start + adj_start, > > - adjust->vm_end); > > - } > > - > > - if (vma_iter_prealloc(vmi, vma)) > > - goto prealloc_fail; > > - > > - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); > > - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && > > - vp.anon_vma != adjust->anon_vma); > > - > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); > > - vma_set_range(vma, vma_start, vma_end, vma_pgoff); > > - > > - if (vma_expanded) > > - vma_iter_store(vmi, vma); > > - > > - if (adj_start) { > > - adjust->vm_start += adj_start; > > - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; > > - if (adj_start < 0) { > > - WARN_ON(vma_expanded); > > - vma_iter_store(vmi, next); > > - } > > - } > > - > > - vma_complete(&vp, vmi, mm); > > - khugepaged_enter_vma(res, vm_flags); > > - return res; > > - > > -prealloc_fail: > > - if (anon_dup) > > - unlink_anon_vmas(anon_dup); > > - > > -anon_vma_fail: > > - vma_iter_set(vmi, addr); > > - vma_iter_load(vmi); > > - return NULL; > > -} > > - > > -/* > > - * Rough compatibility check to quickly see if it's even worth looking > > - * at sharing an anon_vma. > > - * > > - * They need to have the same vm_file, and the flags can only differ > > - * in things that mprotect may change. > > - * > > - * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that > > - * we can merge the two vma's. For example, we refuse to merge a vma if > > - * there is a vm_ops->close() function, because that indicates that the > > - * driver is doing some kind of reference counting. But that doesn't > > - * really matter for the anon_vma sharing case. > > - */ > > -static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) > > -{ > > - return a->vm_end == b->vm_start && > > - mpol_equal(vma_policy(a), vma_policy(b)) && > > - a->vm_file == b->vm_file && > > - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && > > - b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); > > -} > > - > > -/* > > - * Do some basic sanity checking to see if we can re-use the anon_vma > > - * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be > > - * the same as 'old', the other will be the new one that is trying > > - * to share the anon_vma. > > - * > > - * NOTE! This runs with mmap_lock held for reading, so it is possible that > > - * the anon_vma of 'old' is concurrently in the process of being set up > > - * by another page fault trying to merge _that_. But that's ok: if it > > - * is being set up, that automatically means that it will be a singleton > > - * acceptable for merging, so we can do all of this optimistically. But > > - * we do that READ_ONCE() to make sure that we never re-load the pointer. > > - * > > - * IOW: that the "list_is_singular()" test on the anon_vma_chain only > > - * matters for the 'stable anon_vma' case (ie the thing we want to avoid > > - * is to return an anon_vma that is "complex" due to having gone through > > - * a fork). > > - * > > - * We also make sure that the two vma's are compatible (adjacent, > > - * and with the same memory policies). That's all stable, even with just > > - * a read lock on the mmap_lock. 
> > - */ > > -static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) > > -{ > > - if (anon_vma_compatible(a, b)) { > > - struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); > > - > > - if (anon_vma && list_is_singular(&old->anon_vma_chain)) > > - return anon_vma; > > - } > > - return NULL; > > -} > > + goto out; > > > > -/* > > - * find_mergeable_anon_vma is used by anon_vma_prepare, to check > > - * neighbouring vmas for a suitable anon_vma, before it goes off > > - * to allocate a new anon_vma. It checks because a repetitive > > - * sequence of mprotects and faults may otherwise lead to distinct > > - * anon_vmas being allocated, preventing vma merge in subsequent > > - * mprotect. > > - */ > > -struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) > > -{ > > - struct anon_vma *anon_vma = NULL; > > - struct vm_area_struct *prev, *next; > > - VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); > > - > > - /* Try next first. */ > > - next = vma_iter_load(&vmi); > > - if (next) { > > - anon_vma = reusable_anon_vma(next, vma, next); > > - if (anon_vma) > > - return anon_vma; > > - } > > + mm->brk = brk; > > + if (mm->def_flags & VM_LOCKED) > > + populate = true; > > > > - prev = vma_prev(&vmi); > > - VM_BUG_ON_VMA(prev != vma, vma); > > - prev = vma_prev(&vmi); > > - /* Try prev next. */ > > - if (prev) > > - anon_vma = reusable_anon_vma(prev, prev, vma); > > +success: > > + mmap_write_unlock(mm); > > +success_unlocked: > > + userfaultfd_unmap_complete(mm, &uf); > > + if (populate) > > + mm_populate(oldbrk, newbrk - oldbrk); > > + return brk; > > > > - /* > > - * We might reach here with anon_vma == NULL if we can't find > > - * any reusable anon_vma. > > - * There's no absolute need to look only at touching neighbours: > > - * we could search further afield for "compatible" anon_vmas. > > - * But it would probably just be a waste of time searching, > > - * or lead to too many vmas hanging off the same anon_vma. > > - * We're trying to allow mprotect remerging later on, > > - * not trying to minimize memory used for anon_vmas. > > - */ > > - return anon_vma; > > +out: > > + mm->brk = origbrk; > > + mmap_write_unlock(mm); > > + return origbrk; > > } > > > > /* > > @@ -1519,85 +547,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) > > } > > #endif /* __ARCH_WANT_SYS_OLD_MMAP */ > > > > -static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) > > -{ > > - return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); > > -} > > - > > -static bool vma_is_shared_writable(struct vm_area_struct *vma) > > -{ > > - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == > > - (VM_WRITE | VM_SHARED); > > -} > > - > > -static bool vma_fs_can_writeback(struct vm_area_struct *vma) > > -{ > > - /* No managed pages to writeback. */ > > - if (vma->vm_flags & VM_PFNMAP) > > - return false; > > - > > - return vma->vm_file && vma->vm_file->f_mapping && > > - mapping_can_writeback(vma->vm_file->f_mapping); > > -} > > - > > -/* > > - * Does this VMA require the underlying folios to have their dirty state > > - * tracked? > > - */ > > -bool vma_needs_dirty_tracking(struct vm_area_struct *vma) > > -{ > > - /* Only shared, writable VMAs require dirty tracking. */ > > - if (!vma_is_shared_writable(vma)) > > - return false; > > - > > - /* Does the filesystem need to be notified? 
*/ > > - if (vm_ops_needs_writenotify(vma->vm_ops)) > > - return true; > > - > > - /* > > - * Even if the filesystem doesn't indicate a need for writenotify, if it > > - * can writeback, dirty tracking is still required. > > - */ > > - return vma_fs_can_writeback(vma); > > -} > > - > > -/* > > - * Some shared mappings will want the pages marked read-only > > - * to track write events. If so, we'll downgrade vm_page_prot > > - * to the private version (using protection_map[] without the > > - * VM_SHARED bit). > > - */ > > -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) > > -{ > > - /* If it was private or non-writable, the write bit is already clear */ > > - if (!vma_is_shared_writable(vma)) > > - return false; > > - > > - /* The backer wishes to know when pages are first written to? */ > > - if (vm_ops_needs_writenotify(vma->vm_ops)) > > - return true; > > - > > - /* The open routine did something to the protections that pgprot_modify > > - * won't preserve? */ > > - if (pgprot_val(vm_page_prot) != > > - pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) > > - return false; > > - > > - /* > > - * Do we need to track softdirty? hugetlb does not support softdirty > > - * tracking yet. > > - */ > > - if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) > > - return true; > > - > > - /* Do we need write faults for uffd-wp tracking? */ > > - if (userfaultfd_wp(vma)) > > - return true; > > - > > - /* Can the mapping track the dirty pages? */ > > - return vma_fs_can_writeback(vma); > > -} > > - > > /* > > * We account for memory if it's a private writeable mapping, > > * not hugepages and VM_NORESERVE wasn't set. > > @@ -2238,566 +1187,129 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) > > anon_vma_interval_tree_post_update_vma(vma); > > spin_unlock(&mm->page_table_lock); > > > > - perf_event_mmap(vma); > > - } > > - } > > - } > > - anon_vma_unlock_write(vma->anon_vma); > > - vma_iter_free(&vmi); > > - validate_mm(mm); > > - return error; > > -} > > - > > -/* enforced gap between the expanding stack and other mappings. 
*/ > > -unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; > > - > > -static int __init cmdline_parse_stack_guard_gap(char *p) > > -{ > > - unsigned long val; > > - char *endptr; > > - > > - val = simple_strtoul(p, &endptr, 10); > > - if (!*endptr) > > - stack_guard_gap = val << PAGE_SHIFT; > > - > > - return 1; > > -} > > -__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); > > - > > -#ifdef CONFIG_STACK_GROWSUP > > -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) > > -{ > > - return expand_upwards(vma, address); > > -} > > - > > -struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) > > -{ > > - struct vm_area_struct *vma, *prev; > > - > > - addr &= PAGE_MASK; > > - vma = find_vma_prev(mm, addr, &prev); > > - if (vma && (vma->vm_start <= addr)) > > - return vma; > > - if (!prev) > > - return NULL; > > - if (expand_stack_locked(prev, addr)) > > - return NULL; > > - if (prev->vm_flags & VM_LOCKED) > > - populate_vma_page_range(prev, addr, prev->vm_end, NULL); > > - return prev; > > -} > > -#else > > -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) > > -{ > > - return expand_downwards(vma, address); > > -} > > - > > -struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) > > -{ > > - struct vm_area_struct *vma; > > - unsigned long start; > > - > > - addr &= PAGE_MASK; > > - vma = find_vma(mm, addr); > > - if (!vma) > > - return NULL; > > - if (vma->vm_start <= addr) > > - return vma; > > - start = vma->vm_start; > > - if (expand_stack_locked(vma, addr)) > > - return NULL; > > - if (vma->vm_flags & VM_LOCKED) > > - populate_vma_page_range(vma, addr, start, NULL); > > - return vma; > > -} > > -#endif > > - > > -#if defined(CONFIG_STACK_GROWSUP) > > - > > -#define vma_expand_up(vma,addr) expand_upwards(vma, addr) > > -#define vma_expand_down(vma, addr) (-EFAULT) > > - > > -#else > > - > > -#define vma_expand_up(vma,addr) (-EFAULT) > > -#define vma_expand_down(vma, addr) expand_downwards(vma, addr) > > - > > -#endif > > - > > -/* > > - * expand_stack(): legacy interface for page faulting. Don't use unless > > - * you have to. > > - * > > - * This is called with the mm locked for reading, drops the lock, takes > > - * the lock for writing, tries to look up a vma again, expands it if > > - * necessary, and downgrades the lock to reading again. > > - * > > - * If no vma is found or it can't be expanded, it returns NULL and has > > - * dropped the lock. > > - */ > > -struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) > > -{ > > - struct vm_area_struct *vma, *prev; > > - > > - mmap_read_unlock(mm); > > - if (mmap_write_lock_killable(mm)) > > - return NULL; > > - > > - vma = find_vma_prev(mm, addr, &prev); > > - if (vma && vma->vm_start <= addr) > > - goto success; > > - > > - if (prev && !vma_expand_up(prev, addr)) { > > - vma = prev; > > - goto success; > > - } > > - > > - if (vma && !vma_expand_down(vma, addr)) > > - goto success; > > - > > - mmap_write_unlock(mm); > > - return NULL; > > - > > -success: > > - mmap_write_downgrade(mm); > > - return vma; > > -} > > - > > -/* > > - * Ok - we have the memory areas we should free on a maple tree so release them, > > - * and do the vma updates. > > - * > > - * Called with the mm semaphore held. 
> > - */ > > -static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > > -{ > > - unsigned long nr_accounted = 0; > > - struct vm_area_struct *vma; > > - > > - /* Update high watermark before we lower total_vm */ > > - update_hiwater_vm(mm); > > - mas_for_each(mas, vma, ULONG_MAX) { > > - long nrpages = vma_pages(vma); > > - > > - if (vma->vm_flags & VM_ACCOUNT) > > - nr_accounted += nrpages; > > - vm_stat_account(mm, vma->vm_flags, -nrpages); > > - remove_vma(vma, false); > > - } > > - vm_unacct_memory(nr_accounted); > > -} > > - > > -/* > > - * Get rid of page table information in the indicated region. > > - * > > - * Called with the mm semaphore held. > > - */ > > -static void unmap_region(struct mm_struct *mm, struct ma_state *mas, > > - struct vm_area_struct *vma, struct vm_area_struct *prev, > > - struct vm_area_struct *next, unsigned long start, > > - unsigned long end, unsigned long tree_end, bool mm_wr_locked) > > -{ > > - struct mmu_gather tlb; > > - unsigned long mt_start = mas->index; > > - > > - lru_add_drain(); > > - tlb_gather_mmu(&tlb, mm); > > - update_hiwater_rss(mm); > > - unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); > > - mas_set(mas, mt_start); > > - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, > > - next ? next->vm_start : USER_PGTABLES_CEILING, > > - mm_wr_locked); > > - tlb_finish_mmu(&tlb); > > -} > > - > > -/* > > - * __split_vma() bypasses sysctl_max_map_count checking. We use this where it > > - * has already been checked or doesn't make sense to fail. > > - * VMA Iterator will point to the end VMA. > > - */ > > -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - unsigned long addr, int new_below) > > -{ > > - struct vma_prepare vp; > > - struct vm_area_struct *new; > > - int err; > > - > > - WARN_ON(vma->vm_start >= addr); > > - WARN_ON(vma->vm_end <= addr); > > - > > - if (vma->vm_ops && vma->vm_ops->may_split) { > > - err = vma->vm_ops->may_split(vma, addr); > > - if (err) > > - return err; > > - } > > - > > - new = vm_area_dup(vma); > > - if (!new) > > - return -ENOMEM; > > - > > - if (new_below) { > > - new->vm_end = addr; > > - } else { > > - new->vm_start = addr; > > - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); > > - } > > - > > - err = -ENOMEM; > > - vma_iter_config(vmi, new->vm_start, new->vm_end); > > - if (vma_iter_prealloc(vmi, new)) > > - goto out_free_vma; > > - > > - err = vma_dup_policy(vma, new); > > - if (err) > > - goto out_free_vmi; > > - > > - err = anon_vma_clone(new, vma); > > - if (err) > > - goto out_free_mpol; > > - > > - if (new->vm_file) > > - get_file(new->vm_file); > > - > > - if (new->vm_ops && new->vm_ops->open) > > - new->vm_ops->open(new); > > - > > - vma_start_write(vma); > > - vma_start_write(new); > > - > > - init_vma_prep(&vp, vma); > > - vp.insert = new; > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); > > - > > - if (new_below) { > > - vma->vm_start = addr; > > - vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; > > - } else { > > - vma->vm_end = addr; > > - } > > - > > - /* vma_complete stores the new vma */ > > - vma_complete(&vp, vmi, vma->vm_mm); > > - > > - /* Success. 
*/ > > - if (new_below) > > - vma_next(vmi); > > - return 0; > > - > > -out_free_mpol: > > - mpol_put(vma_policy(new)); > > -out_free_vmi: > > - vma_iter_free(vmi); > > -out_free_vma: > > - vm_area_free(new); > > - return err; > > -} > > - > > -/* > > - * Split a vma into two pieces at address 'addr', a new vma is allocated > > - * either for the first part or the tail. > > - */ > > -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - unsigned long addr, int new_below) > > -{ > > - if (vma->vm_mm->map_count >= sysctl_max_map_count) > > - return -ENOMEM; > > - > > - return __split_vma(vmi, vma, addr, new_below); > > -} > > - > > -/* > > - * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd > > - * context and anonymous VMA name within the range [start, end). > > - * > > - * As a result, we might be able to merge the newly modified VMA range with an > > - * adjacent VMA with identical properties. > > - * > > - * If no merge is possible and the range does not span the entirety of the VMA, > > - * we then need to split the VMA to accommodate the change. > > - * > > - * The function returns either the merged VMA, the original VMA if a split was > > - * required instead, or an error if the split failed. > > - */ > > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, > > - struct vm_area_struct *prev, > > - struct vm_area_struct *vma, > > - unsigned long start, unsigned long end, > > - unsigned long vm_flags, > > - struct mempolicy *policy, > > - struct vm_userfaultfd_ctx uffd_ctx, > > - struct anon_vma_name *anon_name) > > -{ > > - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); > > - struct vm_area_struct *merged; > > - > > - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, > > - pgoff, policy, uffd_ctx, anon_name); > > - if (merged) > > - return merged; > > - > > - if (vma->vm_start < start) { > > - int err = split_vma(vmi, vma, start, 1); > > - > > - if (err) > > - return ERR_PTR(err); > > + perf_event_mmap(vma); > > + } > > + } > > } > > + anon_vma_unlock_write(vma->anon_vma); > > + vma_iter_free(&vmi); > > + validate_mm(mm); > > + return error; > > +} > > > > - if (vma->vm_end > end) { > > - int err = split_vma(vmi, vma, end, 0); > > +/* enforced gap between the expanding stack and other mappings. */ > > +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; > > > > - if (err) > > - return ERR_PTR(err); > > - } > > +static int __init cmdline_parse_stack_guard_gap(char *p) > > +{ > > + unsigned long val; > > + char *endptr; > > > > - return vma; > > + val = simple_strtoul(p, &endptr, 10); > > + if (!*endptr) > > + stack_guard_gap = val << PAGE_SHIFT; > > + > > + return 1; > > } > > +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); > > > > -/* > > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > > - * must ensure that [start, end) does not overlap any existing VMA. 
> > - */ > > -static struct vm_area_struct > > -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, > > - struct vm_area_struct *vma, unsigned long start, > > - unsigned long end, pgoff_t pgoff) > > +#ifdef CONFIG_STACK_GROWSUP > > +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) > > { > > - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, > > - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); > > + return expand_upwards(vma, address); > > } > > > > -/* > > - * Expand vma by delta bytes, potentially merging with an immediately adjacent > > - * VMA with identical properties. > > - */ > > -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > - struct vm_area_struct *vma, > > - unsigned long delta) > > +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) > > { > > - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); > > + struct vm_area_struct *vma, *prev; > > > > - /* vma is specified as prev, so case 1 or 2 will apply. */ > > - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, > > - vma->vm_flags, pgoff, vma_policy(vma), > > - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); > > + addr &= PAGE_MASK; > > + vma = find_vma_prev(mm, addr, &prev); > > + if (vma && (vma->vm_start <= addr)) > > + return vma; > > + if (!prev) > > + return NULL; > > + if (expand_stack_locked(prev, addr)) > > + return NULL; > > + if (prev->vm_flags & VM_LOCKED) > > + populate_vma_page_range(prev, addr, prev->vm_end, NULL); > > + return prev; > > } > > - > > -/* > > - * do_vmi_align_munmap() - munmap the aligned region from @start to @end. > > - * @vmi: The vma iterator > > - * @vma: The starting vm_area_struct > > - * @mm: The mm_struct > > - * @start: The aligned start address to munmap. > > - * @end: The aligned end address to munmap. > > - * @uf: The userfaultfd list_head > > - * @unlock: Set to true to drop the mmap_lock. unlocking only happens on > > - * success. > > - * > > - * Return: 0 on success and drops the lock if so directed, error and leaves the > > - * lock held otherwise. > > - */ > > -static int > > -do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, > > - struct mm_struct *mm, unsigned long start, > > - unsigned long end, struct list_head *uf, bool unlock) > > +#else > > +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) > > { > > - struct vm_area_struct *prev, *next = NULL; > > - struct maple_tree mt_detach; > > - int count = 0; > > - int error = -ENOMEM; > > - unsigned long locked_vm = 0; > > - MA_STATE(mas_detach, &mt_detach, 0, 0); > > - mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); > > - mt_on_stack(mt_detach); > > - > > - /* > > - * If we need to split any vma, do it now to save pain later. > > - * > > - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially > > - * unmapped vm_area_struct will remain in use: so lower split_vma > > - * places tmp vma above, and higher split_vma places tmp vma below. > > - */ > > - > > - /* Does it split the first one? */ > > - if (start > vma->vm_start) { > > - > > - /* > > - * Make sure that map_count on return from munmap() will > > - * not exceed its limit; but let map_count go just above > > - * its limit temporarily, to help free resources as expected. 
> > - */ > > - if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) > > - goto map_count_exceeded; > > - > > - error = __split_vma(vmi, vma, start, 1); > > - if (error) > > - goto start_split_failed; > > - } > > - > > - /* > > - * Detach a range of VMAs from the mm. Using next as a temp variable as > > - * it is always overwritten. > > - */ > > - next = vma; > > - do { > > - /* Does it split the end? */ > > - if (next->vm_end > end) { > > - error = __split_vma(vmi, next, end, 0); > > - if (error) > > - goto end_split_failed; > > - } > > - vma_start_write(next); > > - mas_set(&mas_detach, count); > > - error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); > > - if (error) > > - goto munmap_gather_failed; > > - vma_mark_detached(next, true); > > - if (next->vm_flags & VM_LOCKED) > > - locked_vm += vma_pages(next); > > + return expand_downwards(vma, address); > > +} > > > > - count++; > > - if (unlikely(uf)) { > > - /* > > - * If userfaultfd_unmap_prep returns an error the vmas > > - * will remain split, but userland will get a > > - * highly unexpected error anyway. This is no > > - * different than the case where the first of the two > > - * __split_vma fails, but we don't undo the first > > - * split, despite we could. This is unlikely enough > > - * failure that it's not worth optimizing it for. > > - */ > > - error = userfaultfd_unmap_prep(next, start, end, uf); > > +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) > > +{ > > + struct vm_area_struct *vma; > > + unsigned long start; > > > > - if (error) > > - goto userfaultfd_error; > > - } > > -#ifdef CONFIG_DEBUG_VM_MAPLE_TREE > > - BUG_ON(next->vm_start < start); > > - BUG_ON(next->vm_start > end); > > -#endif > > - } for_each_vma_range(*vmi, next, end); > > - > > -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) > > - /* Make sure no VMAs are about to be lost. */ > > - { > > - MA_STATE(test, &mt_detach, 0, 0); > > - struct vm_area_struct *vma_mas, *vma_test; > > - int test_count = 0; > > - > > - vma_iter_set(vmi, start); > > - rcu_read_lock(); > > - vma_test = mas_find(&test, count - 1); > > - for_each_vma_range(*vmi, vma_mas, end) { > > - BUG_ON(vma_mas != vma_test); > > - test_count++; > > - vma_test = mas_next(&test, count - 1); > > - } > > - rcu_read_unlock(); > > - BUG_ON(count != test_count); > > - } > > + addr &= PAGE_MASK; > > + vma = find_vma(mm, addr); > > + if (!vma) > > + return NULL; > > + if (vma->vm_start <= addr) > > + return vma; > > + start = vma->vm_start; > > + if (expand_stack_locked(vma, addr)) > > + return NULL; > > + if (vma->vm_flags & VM_LOCKED) > > + populate_vma_page_range(vma, addr, start, NULL); > > + return vma; > > +} > > #endif > > > > - while (vma_iter_addr(vmi) > start) > > - vma_iter_prev_range(vmi); > > - > > - error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); > > - if (error) > > - goto clear_tree_failed; > > - > > - /* Point of no return */ > > - mm->locked_vm -= locked_vm; > > - mm->map_count -= count; > > - if (unlock) > > - mmap_write_downgrade(mm); > > +#if defined(CONFIG_STACK_GROWSUP) > > > > - prev = vma_iter_prev_range(vmi); > > - next = vma_next(vmi); > > - if (next) > > - vma_iter_prev_range(vmi); > > +#define vma_expand_up(vma,addr) expand_upwards(vma, addr) > > +#define vma_expand_down(vma, addr) (-EFAULT) > > > > - /* > > - * We can free page tables without write-locking mmap_lock because VMAs > > - * were isolated before we downgraded mmap_lock. 
> > - */ > > - mas_set(&mas_detach, 1); > > - unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, > > - !unlock); > > - /* Statistics and freeing VMAs */ > > - mas_set(&mas_detach, 0); > > - remove_mt(mm, &mas_detach); > > - validate_mm(mm); > > - if (unlock) > > - mmap_read_unlock(mm); > > +#else > > > > - __mt_destroy(&mt_detach); > > - return 0; > > +#define vma_expand_up(vma,addr) (-EFAULT) > > +#define vma_expand_down(vma, addr) expand_downwards(vma, addr) > > > > -clear_tree_failed: > > -userfaultfd_error: > > -munmap_gather_failed: > > -end_split_failed: > > - mas_set(&mas_detach, 0); > > - mas_for_each(&mas_detach, next, end) > > - vma_mark_detached(next, false); > > - > > - __mt_destroy(&mt_detach); > > -start_split_failed: > > -map_count_exceeded: > > - validate_mm(mm); > > - return error; > > -} > > +#endif > > > > /* > > - * do_vmi_munmap() - munmap a given range. > > - * @vmi: The vma iterator > > - * @mm: The mm_struct > > - * @start: The start address to munmap > > - * @len: The length of the range to munmap > > - * @uf: The userfaultfd list_head > > - * @unlock: set to true if the user wants to drop the mmap_lock on success > > + * expand_stack(): legacy interface for page faulting. Don't use unless > > + * you have to. > > * > > - * This function takes a @mas that is either pointing to the previous VMA or set > > - * to MA_START and sets it up to remove the mapping(s). The @len will be > > - * aligned and any arch_unmap work will be preformed. > > + * This is called with the mm locked for reading, drops the lock, takes > > + * the lock for writing, tries to look up a vma again, expands it if > > + * necessary, and downgrades the lock to reading again. > > * > > - * Return: 0 on success and drops the lock if so directed, error and leaves the > > - * lock held otherwise. > > + * If no vma is found or it can't be expanded, it returns NULL and has > > + * dropped the lock. > > */ > > -int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, > > - unsigned long start, size_t len, struct list_head *uf, > > - bool unlock) > > +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) > > { > > - unsigned long end; > > - struct vm_area_struct *vma; > > + struct vm_area_struct *vma, *prev; > > > > - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) > > - return -EINVAL; > > + mmap_read_unlock(mm); > > + if (mmap_write_lock_killable(mm)) > > + return NULL; > > > > - end = start + PAGE_ALIGN(len); > > - if (end == start) > > - return -EINVAL; > > + vma = find_vma_prev(mm, addr, &prev); > > + if (vma && vma->vm_start <= addr) > > + goto success; > > > > - /* > > - * Check if memory is sealed before arch_unmap. > > - * Prevent unmapping a sealed VMA. > > - * can_modify_mm assumes we have acquired the lock on MM. > > - */ > > - if (unlikely(!can_modify_mm(mm, start, end))) > > - return -EPERM; > > + if (prev && !vma_expand_up(prev, addr)) { > > + vma = prev; > > + goto success; > > + } > > > > - /* arch_unmap() might do unmaps itself. 
*/ > > - arch_unmap(mm, start, end); > > + if (vma && !vma_expand_down(vma, addr)) > > + goto success; > > > > - /* Find the first overlapping VMA */ > > - vma = vma_find(vmi, end); > > - if (!vma) { > > - if (unlock) > > - mmap_write_unlock(mm); > > - return 0; > > - } > > + mmap_write_unlock(mm); > > + return NULL; > > > > - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); > > +success: > > + mmap_write_downgrade(mm); > > + return vma; > > } > > > > /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. > > @@ -3460,92 +1972,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) > > return 0; > > } > > > > -/* > > - * Copy the vma structure to a new location in the same mm, > > - * prior to moving page table entries, to effect an mremap move. > > - */ > > -struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > > - unsigned long addr, unsigned long len, pgoff_t pgoff, > > - bool *need_rmap_locks) > > -{ > > - struct vm_area_struct *vma = *vmap; > > - unsigned long vma_start = vma->vm_start; > > - struct mm_struct *mm = vma->vm_mm; > > - struct vm_area_struct *new_vma, *prev; > > - bool faulted_in_anon_vma = true; > > - VMA_ITERATOR(vmi, mm, addr); > > - > > - /* > > - * If anonymous vma has not yet been faulted, update new pgoff > > - * to match new location, to increase its chance of merging. > > - */ > > - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > > - pgoff = addr >> PAGE_SHIFT; > > - faulted_in_anon_vma = false; > > - } > > - > > - new_vma = find_vma_prev(mm, addr, &prev); > > - if (new_vma && new_vma->vm_start < addr + len) > > - return NULL; /* should never get here */ > > - > > - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); > > - if (new_vma) { > > - /* > > - * Source vma may have been merged into new_vma > > - */ > > - if (unlikely(vma_start >= new_vma->vm_start && > > - vma_start < new_vma->vm_end)) { > > - /* > > - * The only way we can get a vma_merge with > > - * self during an mremap is if the vma hasn't > > - * been faulted in yet and we were allowed to > > - * reset the dst vma->vm_pgoff to the > > - * destination address of the mremap to allow > > - * the merge to happen. mremap must change the > > - * vm_pgoff linearity between src and dst vmas > > - * (in turn preventing a vma_merge) to be > > - * safe. It is only safe to keep the vm_pgoff > > - * linear if there are no pages mapped yet. 
> > - */ > > - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); > > - *vmap = vma = new_vma; > > - } > > - *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); > > - } else { > > - new_vma = vm_area_dup(vma); > > - if (!new_vma) > > - goto out; > > - vma_set_range(new_vma, addr, addr + len, pgoff); > > - if (vma_dup_policy(vma, new_vma)) > > - goto out_free_vma; > > - if (anon_vma_clone(new_vma, vma)) > > - goto out_free_mempol; > > - if (new_vma->vm_file) > > - get_file(new_vma->vm_file); > > - if (new_vma->vm_ops && new_vma->vm_ops->open) > > - new_vma->vm_ops->open(new_vma); > > - if (vma_link(mm, new_vma)) > > - goto out_vma_link; > > - *need_rmap_locks = false; > > - } > > - return new_vma; > > - > > -out_vma_link: > > - if (new_vma->vm_ops && new_vma->vm_ops->close) > > - new_vma->vm_ops->close(new_vma); > > - > > - if (new_vma->vm_file) > > - fput(new_vma->vm_file); > > - > > - unlink_anon_vmas(new_vma); > > -out_free_mempol: > > - mpol_put(vma_policy(new_vma)); > > -out_free_vma: > > - vm_area_free(new_vma); > > -out: > > - return NULL; > > -} > > - > > /* > > * Return true if the calling process may expand its vm space by the passed > > * number of pages > > @@ -3743,203 +2169,6 @@ int install_special_mapping(struct mm_struct *mm, > > return PTR_ERR_OR_ZERO(vma); > > } > > > > -static DEFINE_MUTEX(mm_all_locks_mutex); > > - > > -static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) > > -{ > > - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { > > - /* > > - * The LSB of head.next can't change from under us > > - * because we hold the mm_all_locks_mutex. > > - */ > > - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); > > - /* > > - * We can safely modify head.next after taking the > > - * anon_vma->root->rwsem. If some other vma in this mm shares > > - * the same anon_vma we won't take it again. > > - * > > - * No need of atomic instructions here, head.next > > - * can't change from under us thanks to the > > - * anon_vma->root->rwsem. > > - */ > > - if (__test_and_set_bit(0, (unsigned long *) > > - &anon_vma->root->rb_root.rb_root.rb_node)) > > - BUG(); > > - } > > -} > > - > > -static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) > > -{ > > - if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { > > - /* > > - * AS_MM_ALL_LOCKS can't change from under us because > > - * we hold the mm_all_locks_mutex. > > - * > > - * Operations on ->flags have to be atomic because > > - * even if AS_MM_ALL_LOCKS is stable thanks to the > > - * mm_all_locks_mutex, there may be other cpus > > - * changing other bitflags in parallel to us. > > - */ > > - if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) > > - BUG(); > > - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); > > - } > > -} > > - > > -/* > > - * This operation locks against the VM for all pte/vma/mm related > > - * operations that could ever happen on a certain mm. This includes > > - * vmtruncate, try_to_unmap, and all page faults. > > - * > > - * The caller must take the mmap_lock in write mode before calling > > - * mm_take_all_locks(). The caller isn't allowed to release the > > - * mmap_lock until mm_drop_all_locks() returns. > > - * > > - * mmap_lock in write mode is required in order to block all operations > > - * that could modify pagetables and free pages without need of > > - * altering the vma layout. It's also needed in write mode to avoid new > > - * anon_vmas to be associated with existing vmas. 
> > - * > > - * A single task can't take more than one mm_take_all_locks() in a row > > - * or it would deadlock. > > - * > > - * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in > > - * mapping->flags avoid to take the same lock twice, if more than one > > - * vma in this mm is backed by the same anon_vma or address_space. > > - * > > - * We take locks in following order, accordingly to comment at beginning > > - * of mm/rmap.c: > > - * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for > > - * hugetlb mapping); > > - * - all vmas marked locked > > - * - all i_mmap_rwsem locks; > > - * - all anon_vma->rwseml > > - * > > - * We can take all locks within these types randomly because the VM code > > - * doesn't nest them and we protected from parallel mm_take_all_locks() by > > - * mm_all_locks_mutex. > > - * > > - * mm_take_all_locks() and mm_drop_all_locks are expensive operations > > - * that may have to take thousand of locks. > > - * > > - * mm_take_all_locks() can fail if it's interrupted by signals. > > - */ > > -int mm_take_all_locks(struct mm_struct *mm) > > -{ > > - struct vm_area_struct *vma; > > - struct anon_vma_chain *avc; > > - VMA_ITERATOR(vmi, mm, 0); > > - > > - mmap_assert_write_locked(mm); > > - > > - mutex_lock(&mm_all_locks_mutex); > > - > > - /* > > - * vma_start_write() does not have a complement in mm_drop_all_locks() > > - * because vma_start_write() is always asymmetrical; it marks a VMA as > > - * being written to until mmap_write_unlock() or mmap_write_downgrade() > > - * is reached. > > - */ > > - for_each_vma(vmi, vma) { > > - if (signal_pending(current)) > > - goto out_unlock; > > - vma_start_write(vma); > > - } > > - > > - vma_iter_init(&vmi, mm, 0); > > - for_each_vma(vmi, vma) { > > - if (signal_pending(current)) > > - goto out_unlock; > > - if (vma->vm_file && vma->vm_file->f_mapping && > > - is_vm_hugetlb_page(vma)) > > - vm_lock_mapping(mm, vma->vm_file->f_mapping); > > - } > > - > > - vma_iter_init(&vmi, mm, 0); > > - for_each_vma(vmi, vma) { > > - if (signal_pending(current)) > > - goto out_unlock; > > - if (vma->vm_file && vma->vm_file->f_mapping && > > - !is_vm_hugetlb_page(vma)) > > - vm_lock_mapping(mm, vma->vm_file->f_mapping); > > - } > > - > > - vma_iter_init(&vmi, mm, 0); > > - for_each_vma(vmi, vma) { > > - if (signal_pending(current)) > > - goto out_unlock; > > - if (vma->anon_vma) > > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > - vm_lock_anon_vma(mm, avc->anon_vma); > > - } > > - > > - return 0; > > - > > -out_unlock: > > - mm_drop_all_locks(mm); > > - return -EINTR; > > -} > > - > > -static void vm_unlock_anon_vma(struct anon_vma *anon_vma) > > -{ > > - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { > > - /* > > - * The LSB of head.next can't change to 0 from under > > - * us because we hold the mm_all_locks_mutex. > > - * > > - * We must however clear the bitflag before unlocking > > - * the vma so the users using the anon_vma->rb_root will > > - * never see our bitflag. > > - * > > - * No need of atomic instructions here, head.next > > - * can't change from under us until we release the > > - * anon_vma->root->rwsem. 
> > - */ > > - if (!__test_and_clear_bit(0, (unsigned long *) > > - &anon_vma->root->rb_root.rb_root.rb_node)) > > - BUG(); > > - anon_vma_unlock_write(anon_vma); > > - } > > -} > > - > > -static void vm_unlock_mapping(struct address_space *mapping) > > -{ > > - if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { > > - /* > > - * AS_MM_ALL_LOCKS can't change to 0 from under us > > - * because we hold the mm_all_locks_mutex. > > - */ > > - i_mmap_unlock_write(mapping); > > - if (!test_and_clear_bit(AS_MM_ALL_LOCKS, > > - &mapping->flags)) > > - BUG(); > > - } > > -} > > - > > -/* > > - * The mmap_lock cannot be released by the caller until > > - * mm_drop_all_locks() returns. > > - */ > > -void mm_drop_all_locks(struct mm_struct *mm) > > -{ > > - struct vm_area_struct *vma; > > - struct anon_vma_chain *avc; > > - VMA_ITERATOR(vmi, mm, 0); > > - > > - mmap_assert_write_locked(mm); > > - BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); > > - > > - for_each_vma(vmi, vma) { > > - if (vma->anon_vma) > > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > - vm_unlock_anon_vma(avc->anon_vma); > > - if (vma->vm_file && vma->vm_file->f_mapping) > > - vm_unlock_mapping(vma->vm_file->f_mapping); > > - } > > - > > - mutex_unlock(&mm_all_locks_mutex); > > -} > > - > > /* > > * vma_expand_bottom() - Expands the bottom of a VMA downwards. An error will > > * arise if there is another VMA in the expanded range, or > > diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c > > index 8982e6139d07..fc18fe274505 100644 > > --- a/mm/mmu_notifier.c > > +++ b/mm/mmu_notifier.c > > @@ -19,6 +19,8 @@ > > #include <linux/sched/mm.h> > > #include <linux/slab.h> > > > > +#include "vma.h" > > + > > /* global SRCU for all MMs */ > > DEFINE_STATIC_SRCU(srcu); > > > > diff --git a/mm/mprotect.c b/mm/mprotect.c > > index 222ab434da54..77951e2d0863 100644 > > --- a/mm/mprotect.c > > +++ b/mm/mprotect.c > > @@ -39,6 +39,7 @@ > > #include <asm/tlb.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, > > pte_t pte) > > diff --git a/mm/mremap.c b/mm/mremap.c > > index e7ae140fc640..09ef3eb31fbf 100644 > > --- a/mm/mremap.c > > +++ b/mm/mremap.c > > @@ -31,6 +31,7 @@ > > #include <asm/pgalloc.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) > > { > > diff --git a/mm/mseal.c b/mm/mseal.c > > index bf783bba8ed0..7bcceda42a1a 100644 > > --- a/mm/mseal.c > > +++ b/mm/mseal.c > > @@ -14,7 +14,9 @@ > > #include <linux/mmu_context.h> > > #include <linux/syscalls.h> > > #include <linux/sched.h> > > + > > #include "internal.h" > > +#include "vma.h" > > > > static inline bool vma_is_sealed(struct vm_area_struct *vma) > > { > > diff --git a/mm/rmap.c b/mm/rmap.c > > index 8616308610b9..4dec7ab3638c 100644 > > --- a/mm/rmap.c > > +++ b/mm/rmap.c > > @@ -83,6 +83,7 @@ > > #include <trace/events/migrate.h> > > > > #include "internal.h" > > +#include "vma.h" > > > > static struct kmem_cache *anon_vma_cachep; > > static struct kmem_cache *anon_vma_chain_cachep; > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c > > index 950fe6b2f0f7..30be083788be 100644 > > --- a/mm/userfaultfd.c > > +++ b/mm/userfaultfd.c > > @@ -17,7 +17,9 @@ > > #include <linux/shmem_fs.h> > > #include <asm/tlbflush.h> > > #include <asm/tlb.h> > > + > > #include "internal.h" > > +#include "vma.h" > > > > static __always_inline > > bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long 
dst_end) > > diff --git a/mm/vma.c b/mm/vma.c > > new file mode 100644 > > index 000000000000..bf0546fe6eab > > --- /dev/null > > +++ b/mm/vma.c > > @@ -0,0 +1,1766 @@ > > +// SPDX-License-Identifier: GPL-2.0-or-later > > + > > +/* > > + * VMA-specific functions. > > + */ > > + > > +#include "vma_internal.h" > > +#include "vma.h" > > + > > +/* > > + * If the vma has a ->close operation then the driver probably needs to release > > + * per-vma resources, so we don't attempt to merge those if the caller indicates > > + * the current vma may be removed as part of the merge. > > + */ > > +static inline bool is_mergeable_vma(struct vm_area_struct *vma, > > + struct file *file, unsigned long vm_flags, > > + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > + struct anon_vma_name *anon_name, bool may_remove_vma) > > +{ > > + /* > > + * VM_SOFTDIRTY should not prevent from VMA merging, if we > > + * match the flags but dirty bit -- the caller should mark > > + * merged VMA as dirty. If dirty bit won't be excluded from > > + * comparison, we increase pressure on the memory system forcing > > + * the kernel to generate new VMAs when old one could be > > + * extended instead. > > + */ > > + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) > > + return false; > > + if (vma->vm_file != file) > > + return false; > > + if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) > > + return false; > > + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) > > + return false; > > + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) > > + return false; > > + return true; > > +} > > + > > +static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, > > + struct anon_vma *anon_vma2, struct vm_area_struct *vma) > > +{ > > + /* > > + * The list_is_singular() test is to avoid merging VMA cloned from > > + * parents. This can improve scalability caused by anon_vma lock. > > + */ > > + if ((!anon_vma1 || !anon_vma2) && (!vma || > > + list_is_singular(&vma->anon_vma_chain))) > > + return true; > > + return anon_vma1 == anon_vma2; > > +} > > + > > +/* > > + * init_multi_vma_prep() - Initializer for struct vma_prepare > > + * @vp: The vma_prepare struct > > + * @vma: The vma that will be altered once locked > > + * @next: The next vma if it is to be adjusted > > + * @remove: The first vma to be removed > > + * @remove2: The second vma to be removed > > + */ > > +static void init_multi_vma_prep(struct vma_prepare *vp, > > + struct vm_area_struct *vma, > > + struct vm_area_struct *next, > > + struct vm_area_struct *remove, > > + struct vm_area_struct *remove2) > > +{ > > + memset(vp, 0, sizeof(struct vma_prepare)); > > + vp->vma = vma; > > + vp->anon_vma = vma->anon_vma; > > + vp->remove = remove; > > + vp->remove2 = remove2; > > + vp->adj_next = next; > > + if (!vp->anon_vma && next) > > + vp->anon_vma = next->anon_vma; > > + > > + vp->file = vma->vm_file; > > + if (vp->file) > > + vp->mapping = vma->vm_file->f_mapping; > > + > > +} > > + > > +/* > > + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) > > + * in front of (at a lower virtual address and file offset than) the vma. > > + * > > + * We cannot merge two vmas if they have differently assigned (non-NULL) > > + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. > > + * > > + * We don't check here for the merged mmap wrapping around the end of pagecache > > + * indices (16TB on ia32) because do_mmap() does not permit mmap's which > > + * wrap, nor mmaps which cover the final page at index -1UL. 
> > + * > > + * We assume the vma may be removed as part of the merge. > > + */ > > +bool > > +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, > > + struct anon_vma *anon_vma, struct file *file, > > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > + struct anon_vma_name *anon_name) > > +{ > > + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && > > + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { > > + if (vma->vm_pgoff == vm_pgoff) > > + return true; > > + } > > + return false; > > +} > > + > > +/* > > + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) > > + * beyond (at a higher virtual address and file offset than) the vma. > > + * > > + * We cannot merge two vmas if they have differently assigned (non-NULL) > > + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. > > + * > > + * We assume that vma is not removed as part of the merge. > > + */ > > +bool > > +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, > > + struct anon_vma *anon_vma, struct file *file, > > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > + struct anon_vma_name *anon_name) > > +{ > > + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && > > + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { > > + pgoff_t vm_pglen; > > + > > + vm_pglen = vma_pages(vma); > > + if (vma->vm_pgoff + vm_pglen == vm_pgoff) > > + return true; > > + } > > + return false; > > +} > > + > > +/* > > + * Close a vm structure and free it. > > + */ > > +void remove_vma(struct vm_area_struct *vma, bool unreachable) > > +{ > > + might_sleep(); > > + if (vma->vm_ops && vma->vm_ops->close) > > + vma->vm_ops->close(vma); > > + if (vma->vm_file) > > + fput(vma->vm_file); > > + mpol_put(vma_policy(vma)); > > + if (unreachable) > > + __vm_area_free(vma); > > + else > > + vm_area_free(vma); > > +} > > + > > +/* > > + * Get rid of page table information in the indicated region. > > + * > > + * Called with the mm semaphore held. > > + */ > > +void unmap_region(struct mm_struct *mm, struct ma_state *mas, > > + struct vm_area_struct *vma, struct vm_area_struct *prev, > > + struct vm_area_struct *next, unsigned long start, > > + unsigned long end, unsigned long tree_end, bool mm_wr_locked) > > +{ > > + struct mmu_gather tlb; > > + unsigned long mt_start = mas->index; > > + > > + lru_add_drain(); > > + tlb_gather_mmu(&tlb, mm); > > + update_hiwater_rss(mm); > > + unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); > > + mas_set(mas, mt_start); > > + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, > > + next ? next->vm_start : USER_PGTABLES_CEILING, > > + mm_wr_locked); > > + tlb_finish_mmu(&tlb); > > +} > > + > > +/* > > + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it > > + * has already been checked or doesn't make sense to fail. > > + * VMA Iterator will point to the end VMA. 
> > + */ > > +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + unsigned long addr, int new_below) > > +{ > > + struct vma_prepare vp; > > + struct vm_area_struct *new; > > + int err; > > + > > + WARN_ON(vma->vm_start >= addr); > > + WARN_ON(vma->vm_end <= addr); > > + > > + if (vma->vm_ops && vma->vm_ops->may_split) { > > + err = vma->vm_ops->may_split(vma, addr); > > + if (err) > > + return err; > > + } > > + > > + new = vm_area_dup(vma); > > + if (!new) > > + return -ENOMEM; > > + > > + if (new_below) { > > + new->vm_end = addr; > > + } else { > > + new->vm_start = addr; > > + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); > > + } > > + > > + err = -ENOMEM; > > + vma_iter_config(vmi, new->vm_start, new->vm_end); > > + if (vma_iter_prealloc(vmi, new)) > > + goto out_free_vma; > > + > > + err = vma_dup_policy(vma, new); > > + if (err) > > + goto out_free_vmi; > > + > > + err = anon_vma_clone(new, vma); > > + if (err) > > + goto out_free_mpol; > > + > > + if (new->vm_file) > > + get_file(new->vm_file); > > + > > + if (new->vm_ops && new->vm_ops->open) > > + new->vm_ops->open(new); > > + > > + vma_start_write(vma); > > + vma_start_write(new); > > + > > + init_vma_prep(&vp, vma); > > + vp.insert = new; > > + vma_prepare(&vp); > > + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); > > + > > + if (new_below) { > > + vma->vm_start = addr; > > + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; > > + } else { > > + vma->vm_end = addr; > > + } > > + > > + /* vma_complete stores the new vma */ > > + vma_complete(&vp, vmi, vma->vm_mm); > > + > > + /* Success. */ > > + if (new_below) > > + vma_next(vmi); > > + return 0; > > + > > +out_free_mpol: > > + mpol_put(vma_policy(new)); > > +out_free_vmi: > > + vma_iter_free(vmi); > > +out_free_vma: > > + vm_area_free(new); > > + return err; > > +} > > + > > +/* > > + * Split a vma into two pieces at address 'addr', a new vma is allocated > > + * either for the first part or the tail. > > + */ > > +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + unsigned long addr, int new_below) > > +{ > > + if (vma->vm_mm->map_count >= sysctl_max_map_count) > > + return -ENOMEM; > > + > > + return __split_vma(vmi, vma, addr, new_below); > > +} > > + > > +/* > > + * Ok - we have the memory areas we should free on a maple tree so release them, > > + * and do the vma updates. > > + * > > + * Called with the mm semaphore held. 
> > + */ > > +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > > +{ > > + unsigned long nr_accounted = 0; > > + struct vm_area_struct *vma; > > + > > + /* Update high watermark before we lower total_vm */ > > + update_hiwater_vm(mm); > > + mas_for_each(mas, vma, ULONG_MAX) { > > + long nrpages = vma_pages(vma); > > + > > + if (vma->vm_flags & VM_ACCOUNT) > > + nr_accounted += nrpages; > > + vm_stat_account(mm, vma->vm_flags, -nrpages); > > + remove_vma(vma, false); > > + } > > + vm_unacct_memory(nr_accounted); > > +} > > + > > +/* > > + * init_vma_prep() - Initializer wrapper for vma_prepare struct > > + * @vp: The vma_prepare struct > > + * @vma: The vma that will be altered once locked > > + */ > > +void init_vma_prep(struct vma_prepare *vp, > > + struct vm_area_struct *vma) > > +{ > > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > +} > > + > > +/* > > + * Requires inode->i_mapping->i_mmap_rwsem > > + */ > > +static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_unmap_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_remove(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * vma has some anon_vma assigned, and is already inserted on that > > + * anon_vma's interval trees. > > + * > > + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the > > + * vma must be removed from the anon_vma's interval trees using > > + * anon_vma_interval_tree_pre_update_vma(). > > + * > > + * After the update, the vma will be reinserted using > > + * anon_vma_interval_tree_post_update_vma(). > > + * > > + * The entire update must be protected by exclusive mmap_lock and by > > + * the root anon_vma's mutex. > > + */ > > +void > > +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) > > +{ > > + struct anon_vma_chain *avc; > > + > > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); > > +} > > + > > +void > > +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > > +{ > > + struct anon_vma_chain *avc; > > + > > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > > +} > > + > > +static void __vma_link_file(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_allow_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_insert(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * vma_prepare() - Helper function for handling locking VMAs prior to altering > > + * @vp: The initialized vma_prepare struct > > + */ > > +void vma_prepare(struct vma_prepare *vp) > > +{ > > + if (vp->file) { > > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > + > > + if (vp->adj_next) > > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > + vp->adj_next->vm_end); > > + > > + i_mmap_lock_write(vp->mapping); > > + if (vp->insert && vp->insert->vm_file) { > > + /* > > + * Put into interval tree now, so instantiated pages > > + * are visible to arm/parisc __flush_dcache_page > > + * throughout; but we cannot insert into address > > + * space until vma start or end is updated. 
> > + */ > > + __vma_link_file(vp->insert, > > + vp->insert->vm_file->f_mapping); > > + } > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_lock_write(vp->anon_vma); > > + anon_vma_interval_tree_pre_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > + } > > + > > + if (vp->file) { > > + flush_dcache_mmap_lock(vp->mapping); > > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > + if (vp->adj_next) > > + vma_interval_tree_remove(vp->adj_next, > > + &vp->mapping->i_mmap); > > + } > > + > > +} > > + > > +/* > > + * dup_anon_vma() - Helper function to duplicate anon_vma > > + * @dst: The destination VMA > > + * @src: The source VMA > > + * @dup: Pointer to the destination VMA when successful. > > + * > > + * Returns: 0 on success. > > + */ > > +static int dup_anon_vma(struct vm_area_struct *dst, > > + struct vm_area_struct *src, struct vm_area_struct **dup) > > +{ > > + /* > > + * Easily overlooked: when mprotect shifts the boundary, make sure the > > + * expanding vma has anon_vma set if the shrinking vma had, to cover any > > + * anon pages imported. > > + */ > > + if (src->anon_vma && !dst->anon_vma) { > > + int ret; > > + > > + vma_assert_write_locked(dst); > > + dst->anon_vma = src->anon_vma; > > + ret = anon_vma_clone(dst, src); > > + if (ret) > > + return ret; > > + > > + *dup = dst; > > + } > > + > > + return 0; > > +} > > + > > +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE > > +void validate_mm(struct mm_struct *mm) > > +{ > > + int bug = 0; > > + int i = 0; > > + struct vm_area_struct *vma; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + mt_validate(&mm->mm_mt); > > + for_each_vma(vmi, vma) { > > +#ifdef CONFIG_DEBUG_VM_RB > > + struct anon_vma *anon_vma = vma->anon_vma; > > + struct anon_vma_chain *avc; > > +#endif > > + unsigned long vmi_start, vmi_end; > > + bool warn = 0; > > + > > + vmi_start = vma_iter_addr(&vmi); > > + vmi_end = vma_iter_end(&vmi); > > + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) > > + warn = 1; > > + > > + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) > > + warn = 1; > > + > > + if (warn) { > > + pr_emerg("issue in %s\n", current->comm); > > + dump_stack(); > > + dump_vma(vma); > > + pr_emerg("tree range: %px start %lx end %lx\n", vma, > > + vmi_start, vmi_end - 1); > > + vma_iter_dump_tree(&vmi); > > + } > > + > > +#ifdef CONFIG_DEBUG_VM_RB > > + if (anon_vma) { > > + anon_vma_lock_read(anon_vma); > > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > + anon_vma_interval_tree_verify(avc); > > + anon_vma_unlock_read(anon_vma); > > + } > > +#endif > > + i++; > > + } > > + if (i != mm->map_count) { > > + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); > > + bug = 1; > > + } > > + VM_BUG_ON_MM(bug, mm); > > +} > > +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > + > > +/* > > + * vma_expand - Expand an existing VMA > > + * > > + * @vmi: The vma iterator > > + * @vma: The vma to expand > > + * @start: The start of the vma > > + * @end: The exclusive end of the vma > > + * @pgoff: The page offset of vma > > + * @next: The current of next vma. > > + * > > + * Expand @vma to @start and @end. Can expand off the start and end. Will > > + * expand over @next if it's different from @vma and @end == @next->vm_end. > > + * Checking if the @vma can expand and merge with @next needs to be handled by > > + * the caller. 
> > + * > > + * Returns: 0 on success > > + */ > > +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, pgoff_t pgoff, > > + struct vm_area_struct *next) > > +{ > > + struct vm_area_struct *anon_dup = NULL; > > + bool remove_next = false; > > + struct vma_prepare vp; > > + > > + vma_start_write(vma); > > + if (next && (vma != next) && (end == next->vm_end)) { > > + int ret; > > + > > + remove_next = true; > > + vma_start_write(next); > > + ret = dup_anon_vma(vma, next, &anon_dup); > > + if (ret) > > + return ret; > > + } > > + > > + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); > > + /* Not merging but overwriting any part of next is not handled. */ > > + VM_WARN_ON(next && !vp.remove && > > + next != vma && end > next->vm_start); > > + /* Only handles expanding */ > > + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); > > + > > + /* Note: vma iterator must be pointing to 'start' */ > > + vma_iter_config(vmi, start, end); > > + if (vma_iter_prealloc(vmi, vma)) > > + goto nomem; > > + > > + vma_prepare(&vp); > > + vma_adjust_trans_huge(vma, start, end, 0); > > + vma_set_range(vma, start, end, pgoff); > > + vma_iter_store(vmi, vma); > > + > > + vma_complete(&vp, vmi, vma->vm_mm); > > + return 0; > > + > > +nomem: > > + if (anon_dup) > > + unlink_anon_vmas(anon_dup); > > + return -ENOMEM; > > +} > > + > > +/* > > + * vma_shrink() - Reduce an existing VMAs memory area > > + * @vmi: The vma iterator > > + * @vma: The VMA to modify > > + * @start: The new start > > + * @end: The new end > > + * > > + * Returns: 0 on success, -ENOMEM otherwise > > + */ > > +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, pgoff_t pgoff) > > +{ > > + struct vma_prepare vp; > > + > > + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); > > + > > + if (vma->vm_start < start) > > + vma_iter_config(vmi, vma->vm_start, start); > > + else > > + vma_iter_config(vmi, end, vma->vm_end); > > + > > + if (vma_iter_prealloc(vmi, NULL)) > > + return -ENOMEM; > > + > > + vma_start_write(vma); > > + > > + init_vma_prep(&vp, vma); > > + vma_prepare(&vp); > > + vma_adjust_trans_huge(vma, start, end, 0); > > + > > + vma_iter_clear(vmi); > > + vma_set_range(vma, start, end, pgoff); > > + vma_complete(&vp, vmi, vma->vm_mm); > > + return 0; > > +} > > + > > +/* > > + * vma_complete- Helper function for handling the unlocking after altering VMAs, > > + * or for inserting a VMA. > > + * > > + * @vp: The vma_prepare struct > > + * @vmi: The vma iterator > > + * @mm: The mm_struct > > + */ > > +void vma_complete(struct vma_prepare *vp, > > + struct vma_iterator *vmi, struct mm_struct *mm) > > +{ > > + if (vp->file) { > > + if (vp->adj_next) > > + vma_interval_tree_insert(vp->adj_next, > > + &vp->mapping->i_mmap); > > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > + flush_dcache_mmap_unlock(vp->mapping); > > + } > > + > > + if (vp->remove && vp->file) { > > + __remove_shared_vm_struct(vp->remove, vp->mapping); > > + if (vp->remove2) > > + __remove_shared_vm_struct(vp->remove2, vp->mapping); > > + } else if (vp->insert) { > > + /* > > + * split_vma has split insert from vma, and needs > > + * us to insert it before dropping the locks > > + * (it may either follow vma or precede it). 
> > + */ > > + vma_iter_store(vmi, vp->insert); > > + mm->map_count++; > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_interval_tree_post_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_post_update_vma(vp->adj_next); > > + anon_vma_unlock_write(vp->anon_vma); > > + } > > + > > + if (vp->file) { > > + i_mmap_unlock_write(vp->mapping); > > + uprobe_mmap(vp->vma); > > + > > + if (vp->adj_next) > > + uprobe_mmap(vp->adj_next); > > + } > > + > > + if (vp->remove) { > > +again: > > + vma_mark_detached(vp->remove, true); > > + if (vp->file) { > > + uprobe_munmap(vp->remove, vp->remove->vm_start, > > + vp->remove->vm_end); > > + fput(vp->file); > > + } > > + if (vp->remove->anon_vma) > > + anon_vma_merge(vp->vma, vp->remove); > > + mm->map_count--; > > + mpol_put(vma_policy(vp->remove)); > > + if (!vp->remove2) > > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > + vm_area_free(vp->remove); > > + > > + /* > > + * In mprotect's case 6 (see comments on vma_merge), > > + * we are removing both mid and next vmas > > + */ > > + if (vp->remove2) { > > + vp->remove = vp->remove2; > > + vp->remove2 = NULL; > > + goto again; > > + } > > + } > > + if (vp->insert && vp->file) > > + uprobe_mmap(vp->insert); > > + validate_mm(mm); > > +} > > + > > +/* > > + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. > > + * @vmi: The vma iterator > > + * @vma: The starting vm_area_struct > > + * @mm: The mm_struct > > + * @start: The aligned start address to munmap. > > + * @end: The aligned end address to munmap. > > + * @uf: The userfaultfd list_head > > + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on > > + * success. > > + * > > + * Return: 0 on success and drops the lock if so directed, error and leaves the > > + * lock held otherwise. > > + */ > > +int > > +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + struct mm_struct *mm, unsigned long start, > > + unsigned long end, struct list_head *uf, bool unlock) > > +{ > > + struct vm_area_struct *prev, *next = NULL; > > + struct maple_tree mt_detach; > > + int count = 0; > > + int error = -ENOMEM; > > + unsigned long locked_vm = 0; > > + MA_STATE(mas_detach, &mt_detach, 0, 0); > > + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); > > + mt_on_stack(mt_detach); > > + > > + /* > > + * If we need to split any vma, do it now to save pain later. > > + * > > + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially > > + * unmapped vm_area_struct will remain in use: so lower split_vma > > + * places tmp vma above, and higher split_vma places tmp vma below. > > + */ > > + > > + /* Does it split the first one? */ > > + if (start > vma->vm_start) { > > + > > + /* > > + * Make sure that map_count on return from munmap() will > > + * not exceed its limit; but let map_count go just above > > + * its limit temporarily, to help free resources as expected. > > + */ > > + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) > > + goto map_count_exceeded; > > + > > + error = __split_vma(vmi, vma, start, 1); > > + if (error) > > + goto start_split_failed; > > + } > > + > > + /* > > + * Detach a range of VMAs from the mm. Using next as a temp variable as > > + * it is always overwritten. > > + */ > > + next = vma; > > + do { > > + /* Does it split the end? 
*/ > > + if (next->vm_end > end) { > > + error = __split_vma(vmi, next, end, 0); > > + if (error) > > + goto end_split_failed; > > + } > > + vma_start_write(next); > > + mas_set(&mas_detach, count); > > + error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); > > + if (error) > > + goto munmap_gather_failed; > > + vma_mark_detached(next, true); > > + if (next->vm_flags & VM_LOCKED) > > + locked_vm += vma_pages(next); > > + > > + count++; > > + if (unlikely(uf)) { > > + /* > > + * If userfaultfd_unmap_prep returns an error the vmas > > + * will remain split, but userland will get a > > + * highly unexpected error anyway. This is no > > + * different than the case where the first of the two > > + * __split_vma fails, but we don't undo the first > > + * split, despite we could. This is unlikely enough > > + * failure that it's not worth optimizing it for. > > + */ > > + error = userfaultfd_unmap_prep(next, start, end, uf); > > + > > + if (error) > > + goto userfaultfd_error; > > + } > > +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE > > + BUG_ON(next->vm_start < start); > > + BUG_ON(next->vm_start > end); > > +#endif > > + } for_each_vma_range(*vmi, next, end); > > + > > +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) > > + /* Make sure no VMAs are about to be lost. */ > > + { > > + MA_STATE(test, &mt_detach, 0, 0); > > + struct vm_area_struct *vma_mas, *vma_test; > > + int test_count = 0; > > + > > + vma_iter_set(vmi, start); > > + rcu_read_lock(); > > + vma_test = mas_find(&test, count - 1); > > + for_each_vma_range(*vmi, vma_mas, end) { > > + BUG_ON(vma_mas != vma_test); > > + test_count++; > > + vma_test = mas_next(&test, count - 1); > > + } > > + rcu_read_unlock(); > > + BUG_ON(count != test_count); > > + } > > +#endif > > + > > + while (vma_iter_addr(vmi) > start) > > + vma_iter_prev_range(vmi); > > + > > + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); > > + if (error) > > + goto clear_tree_failed; > > + > > + /* Point of no return */ > > + mm->locked_vm -= locked_vm; > > + mm->map_count -= count; > > + if (unlock) > > + mmap_write_downgrade(mm); > > + > > + prev = vma_iter_prev_range(vmi); > > + next = vma_next(vmi); > > + if (next) > > + vma_iter_prev_range(vmi); > > + > > + /* > > + * We can free page tables without write-locking mmap_lock because VMAs > > + * were isolated before we downgraded mmap_lock. > > + */ > > + mas_set(&mas_detach, 1); > > + unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, > > + !unlock); > > + /* Statistics and freeing VMAs */ > > + mas_set(&mas_detach, 0); > > + remove_mt(mm, &mas_detach); > > + validate_mm(mm); > > + if (unlock) > > + mmap_read_unlock(mm); > > + > > + __mt_destroy(&mt_detach); > > + return 0; > > + > > +clear_tree_failed: > > +userfaultfd_error: > > +munmap_gather_failed: > > +end_split_failed: > > + mas_set(&mas_detach, 0); > > + mas_for_each(&mas_detach, next, end) > > + vma_mark_detached(next, false); > > + > > + __mt_destroy(&mt_detach); > > +start_split_failed: > > +map_count_exceeded: > > + validate_mm(mm); > > + return error; > > +} > > + > > +/* > > + * do_vmi_munmap() - munmap a given range. > > + * @vmi: The vma iterator > > + * @mm: The mm_struct > > + * @start: The start address to munmap > > + * @len: The length of the range to munmap > > + * @uf: The userfaultfd list_head > > + * @unlock: set to true if the user wants to drop the mmap_lock on success > > + * > > + * This function takes a @mas that is either pointing to the previous VMA or set > > + * to MA_START and sets it up to remove the mapping(s). 
The @len will be > > + * aligned and any arch_unmap work will be preformed. > > + * > > + * Return: 0 on success and drops the lock if so directed, error and leaves the > > + * lock held otherwise. > > + */ > > +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, > > + unsigned long start, size_t len, struct list_head *uf, > > + bool unlock) > > +{ > > + unsigned long end; > > + struct vm_area_struct *vma; > > + > > + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) > > + return -EINVAL; > > + > > + end = start + PAGE_ALIGN(len); > > + if (end == start) > > + return -EINVAL; > > + > > + /* > > + * Check if memory is sealed before arch_unmap. > > + * Prevent unmapping a sealed VMA. > > + * can_modify_mm assumes we have acquired the lock on MM. > > + */ > > + if (unlikely(!can_modify_mm(mm, start, end))) > > + return -EPERM; > > + > > + /* arch_unmap() might do unmaps itself. */ > > + arch_unmap(mm, start, end); > > + > > + /* Find the first overlapping VMA */ > > + vma = vma_find(vmi, end); > > + if (!vma) { > > + if (unlock) > > + mmap_write_unlock(mm); > > + return 0; > > + } > > + > > + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); > > +} > > + > > +/* > > + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), > > + * figure out whether that can be merged with its predecessor or its > > + * successor. Or both (it neatly fills a hole). > > + * > > + * In most cases - when called for mmap, brk or mremap - [addr,end) is > > + * certain not to be mapped by the time vma_merge is called; but when > > + * called for mprotect, it is certain to be already mapped (either at > > + * an offset within prev, or at the start of next), and the flags of > > + * this area are about to be changed to vm_flags - and the no-change > > + * case has already been eliminated. > > + * > > + * The following mprotect cases have to be considered, where **** is > > + * the area passed down from mprotect_fixup, never extending beyond one > > + * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts > > + * at the same address as **** and is of the same or larger span, and > > + * NNNN the next vma after ****: > > + * > > + * **** **** **** > > + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC > > + * cannot merge might become might become > > + * PPNNNNNNNNNN PPPPPPPPPPCC > > + * mmap, brk or case 4 below case 5 below > > + * mremap move: > > + * **** **** > > + * PPPP NNNN PPPPCCCCNNNN > > + * might become might become > > + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or > > + * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or > > + * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 > > + * > > + * It is important for case 8 that the vma CCCC overlapping the > > + * region **** is never going to extended over NNNN. Instead NNNN must > > + * be extended in region **** and CCCC must be removed. This way in > > + * all cases where vma_merge succeeds, the moment vma_merge drops the > > + * rmap_locks, the properties of the merged vma will be already > > + * correct for the whole merged range. Some of those properties like > > + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must > > + * be correct for the whole merged range immediately after the > > + * rmap_locks are released. 
Otherwise if NNNN would be removed and > > + * CCCC would be extended over the NNNN range, remove_migration_ptes > > + * or other rmap walkers (if working on addresses beyond the "end" > > + * parameter) may establish ptes with the wrong permissions of CCCC > > + * instead of the right permissions of NNNN. > > + * > > + * In the code below: > > + * PPPP is represented by *prev > > + * CCCC is represented by *curr or not represented at all (NULL) > > + * NNNN is represented by *next or not represented at all (NULL) > > + * **** is not represented - it will be merged and the vma containing the > > + * area is returned, or the function will return NULL > > + */ > > +static struct vm_area_struct > > +*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, > > + struct vm_area_struct *src, unsigned long addr, unsigned long end, > > + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, > > + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > + struct anon_vma_name *anon_name) > > +{ > > + struct mm_struct *mm = src->vm_mm; > > + struct anon_vma *anon_vma = src->anon_vma; > > + struct file *file = src->vm_file; > > + struct vm_area_struct *curr, *next, *res; > > + struct vm_area_struct *vma, *adjust, *remove, *remove2; > > + struct vm_area_struct *anon_dup = NULL; > > + struct vma_prepare vp; > > + pgoff_t vma_pgoff; > > + int err = 0; > > + bool merge_prev = false; > > + bool merge_next = false; > > + bool vma_expanded = false; > > + unsigned long vma_start = addr; > > + unsigned long vma_end = end; > > + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; > > + long adj_start = 0; > > + > > + /* > > + * We later require that vma->vm_flags == vm_flags, > > + * so this tests vma->vm_flags & VM_SPECIAL, too. > > + */ > > + if (vm_flags & VM_SPECIAL) > > + return NULL; > > + > > + /* Does the input range span an existing VMA? (cases 5 - 8) */ > > + curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); > > + > > + if (!curr || /* cases 1 - 4 */ > > + end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ > > + next = vma_lookup(mm, end); > > + else > > + next = NULL; /* case 5 */ > > + > > + if (prev) { > > + vma_start = prev->vm_start; > > + vma_pgoff = prev->vm_pgoff; > > + > > + /* Can we merge the predecessor? */ > > + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) > > + && can_vma_merge_after(prev, vm_flags, anon_vma, file, > > + pgoff, vm_userfaultfd_ctx, anon_name)) { > > + merge_prev = true; > > + vma_prev(vmi); > > + } > > + } > > + > > + /* Can we merge the successor? */ > > + if (next && mpol_equal(policy, vma_policy(next)) && > > + can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, > > + vm_userfaultfd_ctx, anon_name)) { > > + merge_next = true; > > + } > > + > > + /* Verify some invariant that must be enforced by the caller. */ > > + VM_WARN_ON(prev && addr <= prev->vm_start); > > + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); > > + VM_WARN_ON(addr >= end); > > + > > + if (!merge_prev && !merge_next) > > + return NULL; /* Not mergeable. */ > > + > > + if (merge_prev) > > + vma_start_write(prev); > > + > > + res = vma = prev; > > + remove = remove2 = adjust = NULL; > > + > > + /* Can we merge both the predecessor and the successor? 
*/ > > + if (merge_prev && merge_next && > > + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { > > + vma_start_write(next); > > + remove = next; /* case 1 */ > > + vma_end = next->vm_end; > > + err = dup_anon_vma(prev, next, &anon_dup); > > + if (curr) { /* case 6 */ > > + vma_start_write(curr); > > + remove = curr; > > + remove2 = next; > > + /* > > + * Note that the dup_anon_vma below cannot overwrite err > > + * since the first caller would do nothing unless next > > + * has an anon_vma. > > + */ > > + if (!next->anon_vma) > > + err = dup_anon_vma(prev, curr, &anon_dup); > > + } > > + } else if (merge_prev) { /* case 2 */ > > + if (curr) { > > + vma_start_write(curr); > > + if (end == curr->vm_end) { /* case 7 */ > > + /* > > + * can_vma_merge_after() assumed we would not be > > + * removing prev vma, so it skipped the check > > + * for vm_ops->close, but we are removing curr > > + */ > > + if (curr->vm_ops && curr->vm_ops->close) > > + err = -EINVAL; > > + remove = curr; > > + } else { /* case 5 */ > > + adjust = curr; > > + adj_start = (end - curr->vm_start); > > + } > > + if (!err) > > + err = dup_anon_vma(prev, curr, &anon_dup); > > + } > > + } else { /* merge_next */ > > + vma_start_write(next); > > + res = next; > > + if (prev && addr < prev->vm_end) { /* case 4 */ > > + vma_start_write(prev); > > + vma_end = addr; > > + adjust = next; > > + adj_start = -(prev->vm_end - addr); > > + err = dup_anon_vma(next, prev, &anon_dup); > > + } else { > > + /* > > + * Note that cases 3 and 8 are the ONLY ones where prev > > + * is permitted to be (but is not necessarily) NULL. > > + */ > > + vma = next; /* case 3 */ > > + vma_start = addr; > > + vma_end = next->vm_end; > > + vma_pgoff = next->vm_pgoff - pglen; > > + if (curr) { /* case 8 */ > > + vma_pgoff = curr->vm_pgoff; > > + vma_start_write(curr); > > + remove = curr; > > + err = dup_anon_vma(next, curr, &anon_dup); > > + } > > + } > > + } > > + > > + /* Error in anon_vma clone. */ > > + if (err) > > + goto anon_vma_fail; > > + > > + if (vma_start < vma->vm_start || vma_end > vma->vm_end) > > + vma_expanded = true; > > + > > + if (vma_expanded) { > > + vma_iter_config(vmi, vma_start, vma_end); > > + } else { > > + vma_iter_config(vmi, adjust->vm_start + adj_start, > > + adjust->vm_end); > > + } > > + > > + if (vma_iter_prealloc(vmi, vma)) > > + goto prealloc_fail; > > + > > + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); > > + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && > > + vp.anon_vma != adjust->anon_vma); > > + > > + vma_prepare(&vp); > > + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); > > + vma_set_range(vma, vma_start, vma_end, vma_pgoff); > > + > > + if (vma_expanded) > > + vma_iter_store(vmi, vma); > > + > > + if (adj_start) { > > + adjust->vm_start += adj_start; > > + adjust->vm_pgoff += adj_start >> PAGE_SHIFT; > > + if (adj_start < 0) { > > + WARN_ON(vma_expanded); > > + vma_iter_store(vmi, next); > > + } > > + } > > + > > + vma_complete(&vp, vmi, mm); > > + khugepaged_enter_vma(res, vm_flags); > > + return res; > > + > > +prealloc_fail: > > + if (anon_dup) > > + unlink_anon_vmas(anon_dup); > > + > > +anon_vma_fail: > > + vma_iter_set(vmi, addr); > > + vma_iter_load(vmi); > > + return NULL; > > +} > > + > > +/* > > + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd > > + * context and anonymous VMA name within the range [start, end). 
> > + * > > + * As a result, we might be able to merge the newly modified VMA range with an > > + * adjacent VMA with identical properties. > > + * > > + * If no merge is possible and the range does not span the entirety of the VMA, > > + * we then need to split the VMA to accommodate the change. > > + * > > + * The function returns either the merged VMA, the original VMA if a split was > > + * required instead, or an error if the split failed. > > + */ > > +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, > > + unsigned long vm_flags, > > + struct mempolicy *policy, > > + struct vm_userfaultfd_ctx uffd_ctx, > > + struct anon_vma_name *anon_name) > > +{ > > + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); > > + struct vm_area_struct *merged; > > + > > + merged = vma_merge(vmi, prev, vma, start, end, vm_flags, > > + pgoff, policy, uffd_ctx, anon_name); > > + if (merged) > > + return merged; > > + > > + if (vma->vm_start < start) { > > + int err = split_vma(vmi, vma, start, 1); > > + > > + if (err) > > + return ERR_PTR(err); > > + } > > + > > + if (vma->vm_end > end) { > > + int err = split_vma(vmi, vma, end, 0); > > + > > + if (err) > > + return ERR_PTR(err); > > + } > > + > > + return vma; > > +} > > + > > +/* > > + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > > + * must ensure that [start, end) does not overlap any existing VMA. > > + */ > > +struct vm_area_struct > > +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, > > + struct vm_area_struct *vma, unsigned long start, > > + unsigned long end, pgoff_t pgoff) > > +{ > > + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, > > + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); > > +} > > + > > +/* > > + * Expand vma by delta bytes, potentially merging with an immediately adjacent > > + * VMA with identical properties. > > + */ > > +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > + struct vm_area_struct *vma, > > + unsigned long delta) > > +{ > > + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); > > + > > + /* vma is specified as prev, so case 1 or 2 will apply. 
*/ > > + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, > > + vma->vm_flags, pgoff, vma_policy(vma), > > + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); > > +} > > + > > +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > > +{ > > + vb->count = 0; > > +} > > + > > +static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) > > +{ > > + struct address_space *mapping; > > + int i; > > + > > + mapping = vb->vmas[0]->vm_file->f_mapping; > > + i_mmap_lock_write(mapping); > > + for (i = 0; i < vb->count; i++) { > > + VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); > > + __remove_shared_vm_struct(vb->vmas[i], mapping); > > + } > > + i_mmap_unlock_write(mapping); > > + > > + unlink_file_vma_batch_init(vb); > > +} > > + > > +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, > > + struct vm_area_struct *vma) > > +{ > > + if (vma->vm_file == NULL) > > + return; > > + > > + if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || > > + vb->count == ARRAY_SIZE(vb->vmas)) > > + unlink_file_vma_batch_process(vb); > > + > > + vb->vmas[vb->count] = vma; > > + vb->count++; > > +} > > + > > +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) > > +{ > > + if (vb->count > 0) > > + unlink_file_vma_batch_process(vb); > > +} > > + > > +/* > > + * Unlink a file-based vm structure from its interval tree, to hide > > + * vma from rmap and vmtruncate before freeing its page tables. > > + */ > > +void unlink_file_vma(struct vm_area_struct *vma) > > +{ > > + struct file *file = vma->vm_file; > > + > > + if (file) { > > + struct address_space *mapping = file->f_mapping; > > + > > + i_mmap_lock_write(mapping); > > + __remove_shared_vm_struct(vma, mapping); > > + i_mmap_unlock_write(mapping); > > + } > > +} > > + > > +void vma_link_file(struct vm_area_struct *vma) > > +{ > > + struct file *file = vma->vm_file; > > + struct address_space *mapping; > > + > > + if (file) { > > + mapping = file->f_mapping; > > + i_mmap_lock_write(mapping); > > + __vma_link_file(vma, mapping); > > + i_mmap_unlock_write(mapping); > > + } > > +} > > + > > +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) > > +{ > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); > > + if (vma_iter_prealloc(&vmi, vma)) > > + return -ENOMEM; > > + > > + vma_start_write(vma); > > + vma_iter_store(&vmi, vma); > > + vma_link_file(vma); > > + mm->map_count++; > > + validate_mm(mm); > > + return 0; > > +} > > + > > +/* > > + * Copy the vma structure to a new location in the same mm, > > + * prior to moving page table entries, to effect an mremap move. > > + */ > > +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > > + unsigned long addr, unsigned long len, pgoff_t pgoff, > > + bool *need_rmap_locks) > > +{ > > + struct vm_area_struct *vma = *vmap; > > + unsigned long vma_start = vma->vm_start; > > + struct mm_struct *mm = vma->vm_mm; > > + struct vm_area_struct *new_vma, *prev; > > + bool faulted_in_anon_vma = true; > > + VMA_ITERATOR(vmi, mm, addr); > > + > > + /* > > + * If anonymous vma has not yet been faulted, update new pgoff > > + * to match new location, to increase its chance of merging. 
> > + */ > > + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > > + pgoff = addr >> PAGE_SHIFT; > > + faulted_in_anon_vma = false; > > + } > > + > > + new_vma = find_vma_prev(mm, addr, &prev); > > + if (new_vma && new_vma->vm_start < addr + len) > > + return NULL; /* should never get here */ > > + > > + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); > > + if (new_vma) { > > + /* > > + * Source vma may have been merged into new_vma > > + */ > > + if (unlikely(vma_start >= new_vma->vm_start && > > + vma_start < new_vma->vm_end)) { > > + /* > > + * The only way we can get a vma_merge with > > + * self during an mremap is if the vma hasn't > > + * been faulted in yet and we were allowed to > > + * reset the dst vma->vm_pgoff to the > > + * destination address of the mremap to allow > > + * the merge to happen. mremap must change the > > + * vm_pgoff linearity between src and dst vmas > > + * (in turn preventing a vma_merge) to be > > + * safe. It is only safe to keep the vm_pgoff > > + * linear if there are no pages mapped yet. > > + */ > > + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); > > + *vmap = vma = new_vma; > > + } > > + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); > > + } else { > > + new_vma = vm_area_dup(vma); > > + if (!new_vma) > > + goto out; > > + vma_set_range(new_vma, addr, addr + len, pgoff); > > + if (vma_dup_policy(vma, new_vma)) > > + goto out_free_vma; > > + if (anon_vma_clone(new_vma, vma)) > > + goto out_free_mempol; > > + if (new_vma->vm_file) > > + get_file(new_vma->vm_file); > > + if (new_vma->vm_ops && new_vma->vm_ops->open) > > + new_vma->vm_ops->open(new_vma); > > + if (vma_link(mm, new_vma)) > > + goto out_vma_link; > > + *need_rmap_locks = false; > > + } > > + return new_vma; > > + > > +out_vma_link: > > + if (new_vma->vm_ops && new_vma->vm_ops->close) > > + new_vma->vm_ops->close(new_vma); > > + > > + if (new_vma->vm_file) > > + fput(new_vma->vm_file); > > + > > + unlink_anon_vmas(new_vma); > > +out_free_mempol: > > + mpol_put(vma_policy(new_vma)); > > +out_free_vma: > > + vm_area_free(new_vma); > > +out: > > + return NULL; > > +} > > + > > +/* > > + * Rough compatibility check to quickly see if it's even worth looking > > + * at sharing an anon_vma. > > + * > > + * They need to have the same vm_file, and the flags can only differ > > + * in things that mprotect may change. > > + * > > + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that > > + * we can merge the two vma's. For example, we refuse to merge a vma if > > + * there is a vm_ops->close() function, because that indicates that the > > + * driver is doing some kind of reference counting. But that doesn't > > + * really matter for the anon_vma sharing case. > > + */ > > +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) > > +{ > > + return a->vm_end == b->vm_start && > > + mpol_equal(vma_policy(a), vma_policy(b)) && > > + a->vm_file == b->vm_file && > > + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && > > + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); > > +} > > + > > +/* > > + * Do some basic sanity checking to see if we can re-use the anon_vma > > + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be > > + * the same as 'old', the other will be the new one that is trying > > + * to share the anon_vma. > > + * > > + * NOTE! 
This runs with mmap_lock held for reading, so it is possible that > > + * the anon_vma of 'old' is concurrently in the process of being set up > > + * by another page fault trying to merge _that_. But that's ok: if it > > + * is being set up, that automatically means that it will be a singleton > > + * acceptable for merging, so we can do all of this optimistically. But > > + * we do that READ_ONCE() to make sure that we never re-load the pointer. > > + * > > + * IOW: that the "list_is_singular()" test on the anon_vma_chain only > > + * matters for the 'stable anon_vma' case (ie the thing we want to avoid > > + * is to return an anon_vma that is "complex" due to having gone through > > + * a fork). > > + * > > + * We also make sure that the two vma's are compatible (adjacent, > > + * and with the same memory policies). That's all stable, even with just > > + * a read lock on the mmap_lock. > > + */ > > +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, > > + struct vm_area_struct *a, > > + struct vm_area_struct *b) > > +{ > > + if (anon_vma_compatible(a, b)) { > > + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); > > + > > + if (anon_vma && list_is_singular(&old->anon_vma_chain)) > > + return anon_vma; > > + } > > + return NULL; > > +} > > + > > +/* > > + * find_mergeable_anon_vma is used by anon_vma_prepare, to check > > + * neighbouring vmas for a suitable anon_vma, before it goes off > > + * to allocate a new anon_vma. It checks because a repetitive > > + * sequence of mprotects and faults may otherwise lead to distinct > > + * anon_vmas being allocated, preventing vma merge in subsequent > > + * mprotect. > > + */ > > +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) > > +{ > > + struct anon_vma *anon_vma = NULL; > > + struct vm_area_struct *prev, *next; > > + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); > > + > > + /* Try next first. */ > > + next = vma_iter_load(&vmi); > > + if (next) { > > + anon_vma = reusable_anon_vma(next, vma, next); > > + if (anon_vma) > > + return anon_vma; > > + } > > + > > + prev = vma_prev(&vmi); > > + VM_BUG_ON_VMA(prev != vma, vma); > > + prev = vma_prev(&vmi); > > + /* Try prev next. */ > > + if (prev) > > + anon_vma = reusable_anon_vma(prev, prev, vma); > > + > > + /* > > + * We might reach here with anon_vma == NULL if we can't find > > + * any reusable anon_vma. > > + * There's no absolute need to look only at touching neighbours: > > + * we could search further afield for "compatible" anon_vmas. > > + * But it would probably just be a waste of time searching, > > + * or lead to too many vmas hanging off the same anon_vma. > > + * We're trying to allow mprotect remerging later on, > > + * not trying to minimize memory used for anon_vmas. > > + */ > > + return anon_vma; > > +} > > + > > +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) > > +{ > > + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); > > +} > > + > > +static bool vma_is_shared_writable(struct vm_area_struct *vma) > > +{ > > + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == > > + (VM_WRITE | VM_SHARED); > > +} > > + > > +static bool vma_fs_can_writeback(struct vm_area_struct *vma) > > +{ > > + /* No managed pages to writeback. 
*/ > > + if (vma->vm_flags & VM_PFNMAP) > > + return false; > > + > > + return vma->vm_file && vma->vm_file->f_mapping && > > + mapping_can_writeback(vma->vm_file->f_mapping); > > +} > > + > > +/* > > + * Does this VMA require the underlying folios to have their dirty state > > + * tracked? > > + */ > > +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) > > +{ > > + /* Only shared, writable VMAs require dirty tracking. */ > > + if (!vma_is_shared_writable(vma)) > > + return false; > > + > > + /* Does the filesystem need to be notified? */ > > + if (vm_ops_needs_writenotify(vma->vm_ops)) > > + return true; > > + > > + /* > > + * Even if the filesystem doesn't indicate a need for writenotify, if it > > + * can writeback, dirty tracking is still required. > > + */ > > + return vma_fs_can_writeback(vma); > > +} > > + > > +/* > > + * Some shared mappings will want the pages marked read-only > > + * to track write events. If so, we'll downgrade vm_page_prot > > + * to the private version (using protection_map[] without the > > + * VM_SHARED bit). > > + */ > > +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) > > +{ > > + /* If it was private or non-writable, the write bit is already clear */ > > + if (!vma_is_shared_writable(vma)) > > + return false; > > + > > + /* The backer wishes to know when pages are first written to? */ > > + if (vm_ops_needs_writenotify(vma->vm_ops)) > > + return true; > > + > > + /* The open routine did something to the protections that pgprot_modify > > + * won't preserve? */ > > + if (pgprot_val(vm_page_prot) != > > + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) > > + return false; > > + > > + /* > > + * Do we need to track softdirty? hugetlb does not support softdirty > > + * tracking yet. > > + */ > > + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) > > + return true; > > + > > + /* Do we need write faults for uffd-wp tracking? */ > > + if (userfaultfd_wp(vma)) > > + return true; > > + > > + /* Can the mapping track the dirty pages? */ > > + return vma_fs_can_writeback(vma); > > +} > > + > > +unsigned long count_vma_pages_range(struct mm_struct *mm, > > + unsigned long addr, unsigned long end) > > +{ > > + VMA_ITERATOR(vmi, mm, addr); > > + struct vm_area_struct *vma; > > + unsigned long nr_pages = 0; > > + > > + for_each_vma_range(vmi, vma, end) { > > + unsigned long vm_start = max(addr, vma->vm_start); > > + unsigned long vm_end = min(end, vma->vm_end); > > + > > + nr_pages += PHYS_PFN(vm_end - vm_start); > > + } > > + > > + return nr_pages; > > +} > > + > > +static DEFINE_MUTEX(mm_all_locks_mutex); > > + > > +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) > > +{ > > + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { > > + /* > > + * The LSB of head.next can't change from under us > > + * because we hold the mm_all_locks_mutex. > > + */ > > + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); > > + /* > > + * We can safely modify head.next after taking the > > + * anon_vma->root->rwsem. If some other vma in this mm shares > > + * the same anon_vma we won't take it again. > > + * > > + * No need of atomic instructions here, head.next > > + * can't change from under us thanks to the > > + * anon_vma->root->rwsem. 
> > + */ > > + if (__test_and_set_bit(0, (unsigned long *) > > + &anon_vma->root->rb_root.rb_root.rb_node)) > > + BUG(); > > + } > > +} > > + > > +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) > > +{ > > + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { > > + /* > > + * AS_MM_ALL_LOCKS can't change from under us because > > + * we hold the mm_all_locks_mutex. > > + * > > + * Operations on ->flags have to be atomic because > > + * even if AS_MM_ALL_LOCKS is stable thanks to the > > + * mm_all_locks_mutex, there may be other cpus > > + * changing other bitflags in parallel to us. > > + */ > > + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) > > + BUG(); > > + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); > > + } > > +} > > + > > +/* > > + * This operation locks against the VM for all pte/vma/mm related > > + * operations that could ever happen on a certain mm. This includes > > + * vmtruncate, try_to_unmap, and all page faults. > > + * > > + * The caller must take the mmap_lock in write mode before calling > > + * mm_take_all_locks(). The caller isn't allowed to release the > > + * mmap_lock until mm_drop_all_locks() returns. > > + * > > + * mmap_lock in write mode is required in order to block all operations > > + * that could modify pagetables and free pages without need of > > + * altering the vma layout. It's also needed in write mode to avoid new > > + * anon_vmas to be associated with existing vmas. > > + * > > + * A single task can't take more than one mm_take_all_locks() in a row > > + * or it would deadlock. > > + * > > + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in > > + * mapping->flags avoid to take the same lock twice, if more than one > > + * vma in this mm is backed by the same anon_vma or address_space. > > + * > > + * We take locks in following order, accordingly to comment at beginning > > + * of mm/rmap.c: > > + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for > > + * hugetlb mapping); > > + * - all vmas marked locked > > + * - all i_mmap_rwsem locks; > > + * - all anon_vma->rwseml > > + * > > + * We can take all locks within these types randomly because the VM code > > + * doesn't nest them and we protected from parallel mm_take_all_locks() by > > + * mm_all_locks_mutex. > > + * > > + * mm_take_all_locks() and mm_drop_all_locks are expensive operations > > + * that may have to take thousand of locks. > > + * > > + * mm_take_all_locks() can fail if it's interrupted by signals. > > + */ > > +int mm_take_all_locks(struct mm_struct *mm) > > +{ > > + struct vm_area_struct *vma; > > + struct anon_vma_chain *avc; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + mmap_assert_write_locked(mm); > > + > > + mutex_lock(&mm_all_locks_mutex); > > + > > + /* > > + * vma_start_write() does not have a complement in mm_drop_all_locks() > > + * because vma_start_write() is always asymmetrical; it marks a VMA as > > + * being written to until mmap_write_unlock() or mmap_write_downgrade() > > + * is reached. 
> > + */ > > + for_each_vma(vmi, vma) { > > + if (signal_pending(current)) > > + goto out_unlock; > > + vma_start_write(vma); > > + } > > + > > + vma_iter_init(&vmi, mm, 0); > > + for_each_vma(vmi, vma) { > > + if (signal_pending(current)) > > + goto out_unlock; > > + if (vma->vm_file && vma->vm_file->f_mapping && > > + is_vm_hugetlb_page(vma)) > > + vm_lock_mapping(mm, vma->vm_file->f_mapping); > > + } > > + > > + vma_iter_init(&vmi, mm, 0); > > + for_each_vma(vmi, vma) { > > + if (signal_pending(current)) > > + goto out_unlock; > > + if (vma->vm_file && vma->vm_file->f_mapping && > > + !is_vm_hugetlb_page(vma)) > > + vm_lock_mapping(mm, vma->vm_file->f_mapping); > > + } > > + > > + vma_iter_init(&vmi, mm, 0); > > + for_each_vma(vmi, vma) { > > + if (signal_pending(current)) > > + goto out_unlock; > > + if (vma->anon_vma) > > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > + vm_lock_anon_vma(mm, avc->anon_vma); > > + } > > + > > + return 0; > > + > > +out_unlock: > > + mm_drop_all_locks(mm); > > + return -EINTR; > > +} > > + > > +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) > > +{ > > + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { > > + /* > > + * The LSB of head.next can't change to 0 from under > > + * us because we hold the mm_all_locks_mutex. > > + * > > + * We must however clear the bitflag before unlocking > > + * the vma so the users using the anon_vma->rb_root will > > + * never see our bitflag. > > + * > > + * No need of atomic instructions here, head.next > > + * can't change from under us until we release the > > + * anon_vma->root->rwsem. > > + */ > > + if (!__test_and_clear_bit(0, (unsigned long *) > > + &anon_vma->root->rb_root.rb_root.rb_node)) > > + BUG(); > > + anon_vma_unlock_write(anon_vma); > > + } > > +} > > + > > +static void vm_unlock_mapping(struct address_space *mapping) > > +{ > > + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { > > + /* > > + * AS_MM_ALL_LOCKS can't change to 0 from under us > > + * because we hold the mm_all_locks_mutex. > > + */ > > + i_mmap_unlock_write(mapping); > > + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, > > + &mapping->flags)) > > + BUG(); > > + } > > +} > > + > > +/* > > + * The mmap_lock cannot be released by the caller until > > + * mm_drop_all_locks() returns. > > + */ > > +void mm_drop_all_locks(struct mm_struct *mm) > > +{ > > + struct vm_area_struct *vma; > > + struct anon_vma_chain *avc; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + mmap_assert_write_locked(mm); > > + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); > > + > > + for_each_vma(vmi, vma) { > > + if (vma->anon_vma) > > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) > > + vm_unlock_anon_vma(avc->anon_vma); > > + if (vma->vm_file && vma->vm_file->f_mapping) > > + vm_unlock_mapping(vma->vm_file->f_mapping); > > + } > > + > > + mutex_unlock(&mm_all_locks_mutex); > > +} > > diff --git a/mm/vma.h b/mm/vma.h > > new file mode 100644 > > index 000000000000..15d82dbb7213 > > --- /dev/null > > +++ b/mm/vma.h > > @@ -0,0 +1,356 @@ > > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > > +/* > > + * vma.h > > + * > > + * Core VMA manipulation API implemented in vma.c. 
> > + */ > > +#ifndef __MM_VMA_H > > +#define __MM_VMA_H > > + > > +/* > > + * VMA lock generalization > > + */ > > +struct vma_prepare { > > + struct vm_area_struct *vma; > > + struct vm_area_struct *adj_next; > > + struct file *file; > > + struct address_space *mapping; > > + struct anon_vma *anon_vma; > > + struct vm_area_struct *insert; > > + struct vm_area_struct *remove; > > + struct vm_area_struct *remove2; > > +}; > > + > > +struct unlink_vma_file_batch { > > + int count; > > + struct vm_area_struct *vmas[8]; > > +}; > > + > > +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE > > +void validate_mm(struct mm_struct *mm); > > +#else > > +#define validate_mm(mm) do { } while (0) > > +#endif > > + > > +/* Required for expand_downwards(). */ > > +void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); > > + > > +/* Required for expand_downwards(). */ > > +void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); > > + > > +/* Required for do_brk_flags(). */ > > +void vma_prepare(struct vma_prepare *vp); > > + > > +/* Required for do_brk_flags(). */ > > +void init_vma_prep(struct vma_prepare *vp, > > + struct vm_area_struct *vma); > > + > > +/* Required for do_brk_flags(). */ > > +void vma_complete(struct vma_prepare *vp, > > + struct vma_iterator *vmi, struct mm_struct *mm); > > + > > +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, pgoff_t pgoff, > > + struct vm_area_struct *next); > > + > > +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, pgoff_t pgoff); > > + > > +int > > +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, > > + struct mm_struct *mm, unsigned long start, > > + unsigned long end, struct list_head *uf, bool unlock); > > + > > +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, > > + unsigned long start, size_t len, struct list_head *uf, > > + bool unlock); > > + > > +void remove_vma(struct vm_area_struct *vma, bool unreachable); > > + > > +void unmap_region(struct mm_struct *mm, struct ma_state *mas, > > + struct vm_area_struct *vma, struct vm_area_struct *prev, > > + struct vm_area_struct *next, unsigned long start, > > + unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > + > > +/* Required by mmap_region(). */ > > +bool > > +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, > > + struct anon_vma *anon_vma, struct file *file, > > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > + struct anon_vma_name *anon_name); > > + > > +/* Required by mmap_region() and do_brk_flags(). */ > > +bool > > +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, > > + struct anon_vma *anon_vma, struct file *file, > > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, > > + struct anon_vma_name *anon_name); > > + > > +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, > > + unsigned long vm_flags, > > + struct mempolicy *policy, > > + struct vm_userfaultfd_ctx uffd_ctx, > > + struct anon_vma_name *anon_name); > > + > > +/* We are about to modify the VMA's flags. 
*/ > > +static inline struct vm_area_struct > > +*vma_modify_flags(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, > > + unsigned long new_flags) > > +{ > > + return vma_modify(vmi, prev, vma, start, end, new_flags, > > + vma_policy(vma), vma->vm_userfaultfd_ctx, > > + anon_vma_name(vma)); > > +} > > + > > +/* We are about to modify the VMA's flags and/or anon_name. */ > > +static inline struct vm_area_struct > > +*vma_modify_flags_name(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, > > + unsigned long end, > > + unsigned long new_flags, > > + struct anon_vma_name *new_name) > > +{ > > + return vma_modify(vmi, prev, vma, start, end, new_flags, > > + vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); > > +} > > + > > +/* We are about to modify the VMA's memory policy. */ > > +static inline struct vm_area_struct > > +*vma_modify_policy(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, > > + struct mempolicy *new_pol) > > +{ > > + return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, > > + new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); > > +} > > + > > +/* We are about to modify the VMA's flags and/or uffd context. */ > > +static inline struct vm_area_struct > > +*vma_modify_flags_uffd(struct vma_iterator *vmi, > > + struct vm_area_struct *prev, > > + struct vm_area_struct *vma, > > + unsigned long start, unsigned long end, > > + unsigned long new_flags, > > + struct vm_userfaultfd_ctx new_ctx) > > +{ > > + return vma_modify(vmi, prev, vma, start, end, new_flags, > > + vma_policy(vma), new_ctx, anon_vma_name(vma)); > > +} > > + > > +struct vm_area_struct > > +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, > > + struct vm_area_struct *vma, unsigned long start, > > + unsigned long end, pgoff_t pgoff); > > + > > +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > + struct vm_area_struct *vma, > > + unsigned long delta); > > + > > +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); > > + > > +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); > > + > > +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, > > + struct vm_area_struct *vma); > > + > > +void unlink_file_vma(struct vm_area_struct *vma); > > + > > +void vma_link_file(struct vm_area_struct *vma); > > + > > +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); > > + > > +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > > + unsigned long addr, unsigned long len, pgoff_t pgoff, > > + bool *need_rmap_locks); > > + > > +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); > > + > > +bool vma_needs_dirty_tracking(struct vm_area_struct *vma); > > +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); > > + > > +int mm_take_all_locks(struct mm_struct *mm); > > +void mm_drop_all_locks(struct mm_struct *mm); > > +unsigned long count_vma_pages_range(struct mm_struct *mm, > > + unsigned long addr, unsigned long end); > > + > > +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) > > +{ > > + /* > > + * We want to check manually if we can change individual PTEs writable > > + * if we can't do that automatically for all PTEs in a mapping. 
For > > + * private mappings, that's always the case when we have write > > + * permissions as we properly have to handle COW. > > + */ > > + if (vma->vm_flags & VM_SHARED) > > + return vma_wants_writenotify(vma, vma->vm_page_prot); > > + return !!(vma->vm_flags & VM_WRITE); > > +} > > + > > +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) > > +{ > > + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); > > +} > > + > > +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, > > + unsigned long min) > > +{ > > + return mas_prev(&vmi->mas, min); > > +} > > + > > +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, > > + struct vm_area_struct *vma, gfp_t gfp) > > +{ > > + if (vmi->mas.status != ma_start && > > + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) > > + vma_iter_invalidate(vmi); > > + > > + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); > > + mas_store_gfp(&vmi->mas, vma, gfp); > > + if (unlikely(mas_is_err(&vmi->mas))) > > + return -ENOMEM; > > + > > + return 0; > > +} > > + > > + > > +/* > > + * These three helpers classifies VMAs for virtual memory accounting. > > + */ > > + > > +/* > > + * Executable code area - executable, not writable, not stack > > + */ > > +static inline bool is_exec_mapping(vm_flags_t flags) > > +{ > > + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; > > +} > > + > > +/* > > + * Stack area (including shadow stacks) > > + * > > + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: > > + * do_mmap() forbids all other combinations. > > + */ > > +static inline bool is_stack_mapping(vm_flags_t flags) > > +{ > > + return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); > > +} > > + > > +/* > > + * Data area - private, writable, not stack > > + */ > > +static inline bool is_data_mapping(vm_flags_t flags) > > +{ > > + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; > > +} > > + > > + > > +static inline void vma_iter_config(struct vma_iterator *vmi, > > + unsigned long index, unsigned long last) > > +{ > > + __mas_set_range(&vmi->mas, index, last - 1); > > +} > > + > > +static inline void vma_iter_reset(struct vma_iterator *vmi) > > +{ > > + mas_reset(&vmi->mas); > > +} > > + > > +static inline > > +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) > > +{ > > + return mas_prev_range(&vmi->mas, min); > > +} > > + > > +static inline > > +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) > > +{ > > + return mas_next_range(&vmi->mas, max); > > +} > > + > > +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, > > + unsigned long max, unsigned long size) > > +{ > > + return mas_empty_area(&vmi->mas, min, max - 1, size); > > +} > > + > > +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, > > + unsigned long max, unsigned long size) > > +{ > > + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); > > +} > > + > > +/* > > + * VMA Iterator functions shared between nommu and mmap > > + */ > > +static inline int vma_iter_prealloc(struct vma_iterator *vmi, > > + struct vm_area_struct *vma) > > +{ > > + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); > > +} > > + > > +static inline void vma_iter_clear(struct vma_iterator *vmi) > > +{ > > + mas_store_prealloc(&vmi->mas, NULL); > > +} > > + > > +static inline struct vm_area_struct *vma_iter_load(struct 
vma_iterator *vmi) > > +{ > > + return mas_walk(&vmi->mas); > > +} > > + > > +/* Store a VMA with preallocated memory */ > > +static inline void vma_iter_store(struct vma_iterator *vmi, > > + struct vm_area_struct *vma) > > +{ > > + > > +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) > > + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && > > + vmi->mas.index > vma->vm_start)) { > > + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", > > + vmi->mas.index, vma->vm_start, vma->vm_start, > > + vma->vm_end, vmi->mas.index, vmi->mas.last); > > + } > > + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && > > + vmi->mas.last < vma->vm_start)) { > > + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", > > + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, > > + vmi->mas.index, vmi->mas.last); > > + } > > +#endif > > + > > + if (vmi->mas.status != ma_start && > > + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) > > + vma_iter_invalidate(vmi); > > + > > + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); > > + mas_store_prealloc(&vmi->mas, vma); > > +} > > + > > +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) > > +{ > > + return vmi->mas.index; > > +} > > + > > +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) > > +{ > > + return vmi->mas.last + 1; > > +} > > + > > +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, > > + unsigned long count) > > +{ > > + return mas_expected_entries(&vmi->mas, count); > > +} > > + > > +#endif /* __MM_VMA_H */ > > diff --git a/mm/vma_internal.h b/mm/vma_internal.h > > new file mode 100644 > > index 000000000000..51b2010f30c0 > > --- /dev/null > > +++ b/mm/vma_internal.h > > @@ -0,0 +1,143 @@ > > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > > +/* > > + * vma_internal.h > > + * > > + * Headers required by vma.c, which can be substituted accordingly when testing > > + * VMA functionality. > > + */ > > + > > This should probably have header guards for the testing side? > > > +/* For fundamental mm types and VMA_ITERATOR(), > > + * tlb_gather_mmu(), tlb_finish_mmu(). > > + */ > > +#include <linux/mm_types.h> > > + > > +/* For mapping_can_writeback(). */ > > +#include <linux/backing-dev.h> > > + > > +/* > > + * For test_bit(), test_and_set_bit(), __test_and_set_bit(), > > + * test_and_clear_bit(). > > + */ > > +#include <linux/bitops.h> > > + > > +/* For WARN_ON(), WARN_ON_ONCE(), BUG_ON(). */ > > +#include <linux/bug.h> > > + > > +/* For ERR_PTR(). */ > > +#include <linux/err.h> > > + > > +/* For fput(). */ > > +#include <linux/file.h> > > + > > +/* > > + * For get_file(), mapping_unmap_writable(), i_mmap_lock_write(), > > + * i_mmap_unlock_write(). > > + */ > > +#include <linux/fs.h> > > + > > +/* For vma_adjust_trans_huge(). */ > > +#include <linux/huge_mm.h> > > + > > +/* For is_vm_hugetlb_page(). */ > > +#include <linux/hugetlb_inline.h> > > + > > +/* For might_sleep(). */ > > +#include <linux/kernel.h> > > + > > +/* For khugepaged_enter_vma(). */ > > +#include <linux/khugepaged.h> > > + > > +/* For maple tree operations. */ > > +#include <linux/maple_tree.h> > > + > > +/* For mpol_put(), vma_policy(), vma_dup_policy(), mpol_equal(). 
*/ > > +#include <linux/mempolicy.h> > > + > > +/* > > + * For VM flags, update_hiwater_rss(), __vm_area_free(), vm_area_free(), > > + * vm_area_dup(), unmap_vmas(), vma_start_write(), vma_prev(), vma_next(), > > + * vma_iter_free(), vm_area_free(), vm_stat_account(), vma_is_shared_maywrite(), > > + * vma_interval_tree_remove(), anon_vma_interval_tree_remove(), > > + * anon_vma_interval_tree_insert(), vma_assert_write_locked(), for_each_vma(), > > + * vma_mark_detached(), for_each_vma_range(), vma_iter_set(), > > + * vma_iter_prev_range(), vma_iter_clear_gfp(), PAGE_ALIGN(), vma_find(), > > + * find_vma_intersection(), vma_lookup(), vma_is_anonymous(), find_vma_prev(). > > + */ > > +#include <linux/mm.h> > > + > > +/* For VM_WARN_ON(), VM_WARN_ON_ONCE_MM(), VM_BUG_ON_VMA(). */ > > +#include <linux/mmdebug.h> > > + > > +/* For list_is_singular(), list_for_each_entry(). */ > > +#include <linux/list.h> > > + > > +/* For anon_vma_name_eq(). */ > > +#include <linux/mm_inline.h> > > + > > +/* For vm_unacct_memory(), vma_pages(). */ > > +#include <linux/mman.h> > > + > > +/* For mmap_write_unlock(), mmap_write_downgrade(), mmap_assert_write_locked(). */ > > +#include <linux/mmap_lock.h> > > + > > +/* For DEFINE_MUTEX(), mutex_lock(), mutex_unlock(). */ > > +#include <linux/mutex.h> > > + > > +/* For AS_MM_ALL_LOCKS. */ > > +#include <linux/pagemap.h> > > + > > +/* For PHYS_PFN(). */ > > +#include <linux/pfn.h> > > + > > +/* For rcu_read_lock(), rcu_read_unlock(). */ > > +#include <linux/rcupdate.h> > > + > > +/* > > + * For anon_vma_clone(), anon_vma_lock_write(), anon_vma_clone(), > > + * unlink_anon_vmas(), anon_vma_unlock_write(), anon_vma_merge(). > > + */ > > +#include <linux/rmap.h> > > + > > +/* For down_write_nest_lock(). */ > > +#include <linux/rwsem.h> > > + > > +/* For signal_pending(). */ > > +#include <linux/sched/signal.h> > > + > > +/* For lru_add_drain(). */ > > +#include <linux/swap.h> > > + > > +/* For uprobe_mmap(), uprobe_munmap(). */ > > +#include <linux/uprobes.h> > > +/* > > + * For struct vm_userfaultfd_ctx, is_mergeable_vm_userfaultfd_ctx(), > > + * userfaultfd_unmap_prep, userfaultfd_wp(). > > + */ > > +#include <linux/userfaultfd_k.h> > > + > > +/* For BUG(). */ > > +#include <linux/bug.h> > > + > > +/* For flush_dcache_mmap_lock(), flush_dcache_mmap_unlock(). */ > > +#include <linux/cacheflush.h> > > + > > +/* For current. */ > > +#include <asm/current.h> > > + > > +/* For PAGE_SHIFT, etc. */ > > +#include <asm/page_types.h> > > + > > +/* For pgprot_val(). */ > > +#include <asm/pgtable_types.h> > > + > > +/* For struct mmu_gather. */ > > +#include <asm/tlb.h> > > + > > +/* For arch_unmap(). */ > > +#include <linux/mmu_context.h> > > + > > +/* > > + * For free_pgtables(), vma_set_range(), can_modify_mm(), > > + * vma_soft_dirty_enabled(). > > + */ > > +#include "internal.h" > > -- > > 2.45.1 > >
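
On the header guard question for the testing side of vma_internal.h: agreed. To make sure we mean the same trick, below is a tiny, self-contained userspace illustration of the duplicate-guard approach. Nothing in it comes from the patch itself - the guard name and helper are invented, and the "two headers" are inlined into one file purely for brevity. The idea is that the test harness claims the real header's include guard up front and supplies stubs, so the real definitions compile away:

/* What the test harness would do first: pre-claim the real header's
 * guard and provide stub versions of whatever the code under test needs.
 */
#define FAKE_VMA_HELPERS_H
static inline int helper_under_test(int x)
{
	(void)x;
	return 42;		/* stub behaviour for the test build */
}

/* What the "real" header would contain; because the guard is already
 * defined above, this whole block is skipped at compile time.
 */
#ifndef FAKE_VMA_HELPERS_H
#define FAKE_VMA_HELPERS_H
static inline int helper_under_test(int x)
{
	return x * 2;		/* "real" implementation, never seen here */
}
#endif

#include <stdio.h>

int main(void)
{
	/* Prints 42: the stub displaced the real implementation. */
	printf("%d\n", helper_under_test(7));
	return 0;
}

In the real layout the stub half would of course live in a separate test-only header that gets pulled in ahead of the kernel one, rather than being pasted into the same file.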
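
Separately, for the small pure-logic helpers that end up in mm/vma.h (the is_exec_mapping()/is_stack_mapping()/is_data_mapping() classifiers and the like), this is roughly the kind of trivially-runnable userspace check the split is meant to enable. It is only a sketch: the VM_* bit values below are invented so it compiles standalone and do not match the kernel's definitions.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long vm_flags_t;

/* Invented bit values, purely so this builds in userspace. */
#define VM_WRITE	0x1UL
#define VM_EXEC		0x2UL
#define VM_SHARED	0x4UL
#define VM_GROWSDOWN	0x8UL
#define VM_SHADOW_STACK	0x10UL
#define VM_STACK	VM_GROWSDOWN

/* Same logic as the accounting classifiers added to mm/vma.h. */
static bool is_exec_mapping(vm_flags_t flags)
{
	return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
}

static bool is_stack_mapping(vm_flags_t flags)
{
	return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
}

static bool is_data_mapping(vm_flags_t flags)
{
	return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
}

int main(void)
{
	assert(is_exec_mapping(VM_EXEC));		/* text mapping       */
	assert(!is_exec_mapping(VM_EXEC | VM_WRITE));	/* writable: excluded */
	assert(is_stack_mapping(VM_STACK));		/* ordinary stack     */
	assert(is_stack_mapping(VM_SHADOW_STACK));	/* shadow stack       */
	assert(is_data_mapping(VM_WRITE));		/* private data       */
	assert(!is_data_mapping(VM_WRITE | VM_SHARED));	/* shared: excluded   */
	printf("vma helper checks passed\n");
	return 0;
}

The more interesting targets are obviously the iterator and merge paths, which is where the vma_internal.h stubbing above comes in.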