On Fri, Aug 09, 2024 at 11:23:30AM GMT, Liam R. Howlett wrote: > * Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> [240805 08:14]: > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > same logic for all cases where we add new VMAs to, rather than ultimately > > use vma_merge(), rather use vma_expand(). > > > > We implement this by replacing vma_merge_new_vma() with this newly > > abstracted logic. > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > laying the ground for us to eliminate the merging of new VMAs in > > vma_merge() altogether. > > > > This makes it far easier to understand what is happening in these cases > > avoiding confusion, bugs and allowing for future optimisation. > > > > As a result of this change we are also able to make vma_prepare(), > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > can_vma_merge_after() static and internal to vma.c. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > > --- > > mm/mmap.c | 79 ++--- > > mm/vma.c | 482 +++++++++++++++++++------------ > > mm/vma.h | 51 +--- > > tools/testing/vma/vma_internal.h | 6 + > > 4 files changed, 324 insertions(+), 294 deletions(-) > > > > diff --git a/mm/mmap.c b/mm/mmap.c > > index f6593a81f73d..c03f50f46396 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > { > > struct mm_struct *mm = current->mm; > > struct vm_area_struct *vma = NULL; > > - struct vm_area_struct *next, *prev, *merge; > > - pgoff_t pglen = len >> PAGE_SHIFT; > > + struct vm_area_struct *merge; > > unsigned long charged = 0; > > unsigned long end = addr + len; > > bool writable_file_mapping = false; > > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > vm_flags |= VM_ACCOUNT; > > } > > > > - next = vmg.next = vma_next(&vmi); > > - prev = vmg.prev = vma_prev(&vmi); > > - if (vm_flags & VM_SPECIAL) { > > - if (prev) > > - vma_iter_next_range(&vmi); > > - goto cannot_expand; > > - } > > - > > - /* Attempt to expand an old mapping */ > > - /* Check next */ > > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > > - vmg.end = next->vm_end; > > - vma = vmg.vma = next; > > - vmg.pgoff = next->vm_pgoff - pglen; > > - > > - /* We may merge our NULL anon_vma with non-NULL in next. */ > > - vmg.anon_vma = vma->anon_vma; > > - } > > - > > - /* Check prev */ > > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > > - vmg.start = prev->vm_start; > > - vma = vmg.vma = prev; > > - vmg.pgoff = prev->vm_pgoff; > > - } else if (prev) { > > - vma_iter_next_range(&vmi); > > - } > > - > > - /* Actually expand, if possible */ > > - if (vma && !vma_expand(&vmg)) { > > - khugepaged_enter_vma(vma, vm_flags); > > + vma = vma_merge_new_vma(&vmg); > > + if (vma) > > goto expanded; > > - } > > - > > - if (vma == prev) > > - vma_iter_set(&vmi, addr); > > -cannot_expand: > > > > /* > > * Determine the object being mapped and call the appropriate > > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > * If vm_flags changed after call_mmap(), we should try merge > > * vma again as we may succeed this time. 
> > */ > > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > > - vma->vm_start, vma->vm_end, > > - vma->vm_pgoff); > > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > > + merge = vma_merge_new_vma(&vmg); > > + > > if (merge) { > > /* > > * ->mmap() can change vma->vm_file and fput > > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > > > vma_iter_set(&vmi, vma->vm_end); > > /* Undo any partial mapping done by a device driver. */ > > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, > > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start, > > vma->vm_end, vma->vm_end, true); > > } > > if (writable_file_mapping) > > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > unsigned long addr, unsigned long len, unsigned long flags) > > { > > struct mm_struct *mm = current->mm; > > - struct vma_prepare vp; > > > > /* > > * Check against address space limits by the changed size > > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > */ > > if (vma && vma->vm_end == addr) { > > struct vma_merge_struct vmg = { > > + .vmi = vmi, > > .prev = vma, > > + .next = NULL, > > + .start = addr, > > + .end = addr + len, > > .flags = flags, > > .pgoff = addr >> PAGE_SHIFT, > > + .file = vma->vm_file, > > + .anon_vma = vma->anon_vma, > > + .policy = vma_policy(vma), > > + .uffd_ctx = vma->vm_userfaultfd_ctx, > > + .anon_name = anon_vma_name(vma), > > }; > > > > - if (can_vma_merge_after(&vmg)) { > > - vma_iter_config(vmi, vma->vm_start, addr + len); > > - if (vma_iter_prealloc(vmi, vma)) > > - goto unacct_fail; > > - > > - vma_start_write(vma); > > - > > - init_vma_prep(&vp, vma); > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); > > - vma->vm_end = addr + len; > > - vm_flags_set(vma, VM_SOFTDIRTY); > > - vma_iter_store(vmi, vma); > > - > > - vma_complete(&vp, vmi, mm); > > - khugepaged_enter_vma(vma, flags); > > + if (vma_merge_new_vma(&vmg)) > > goto out; > > This is very convoluted to follow. It seems vma_merge_new_vma() will do > what is necessary by finding out that it can merge after, then call > vma_expand() which calls commit merge(), which sets the iterator to > vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr > here.. it's actually set to prev->vm_start in vma_merge_new_vma(). Sorry, it's kind of hard to make a change like this all that lovely to follow. The only extra checks before it checks mergeability are prev - which we set to vma, so is not NULL (except in the case of first vma, which is wasteful, but a one-off) and an is_special and next check. So this isn't _hugely_ terrible. As to the vmi positioning... I thought there might be some things that we could improve on this :) However, we set prev == vma here, so vmg->start = vma->vm_start, vmg->end = addr + len which is the same as before right? I do notice that we've incorrectly removed the vm_flags_set(VM_SOFTDIRTY) though... will add that back in. Ugh. Again, so frustrating to not have these functions testable. I'd like to find a way to move things around if possible at some point. But if we're worried about call stack maybe not feasible. > > This is getting really hard to trace what happens. I'm also concerned > that the overhead of following all these checks will cost performance on > the brk system call? 
I'll take a look at mm-tests results. > > Maybe we can have a way to set up the vmg and call the right function to > just make the above happen? We know with the can_vma_merge_after() that > it is going to work, so could we just call vma_start_write() and > commit_merge()? I'm happy to add an enum or something to set a specific mode if we want, but maybe worth looking at scalability results first to see if there's really a regression? I mean from our discussions on irc, it sounds like this is very possible so we could figure something out. > > Also, vma_merge_new_vma() could fail because it's out of memory so it > should goto unacct_fail.. but we now don't know if it's because the > merge wasn't allowed or if we are out of memory.. Yes this is a mistake, damn it. Will fix. Grumble about untestability of these functions x2. As per your comment below I think simplest way may be to have an error or outcome field or some such that we can check to see _why_ things failed. > > > - } > > } > > > > if (vma) > > diff --git a/mm/vma.c b/mm/vma.c > > index 55615392e8d2..a404cf718f9e 100644 > > --- a/mm/vma.c > > +++ b/mm/vma.c > > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, > > * > > * We assume the vma may be removed as part of the merge. > > */ > > -bool > > -can_vma_merge_before(struct vma_merge_struct *vmg) > > +static bool can_vma_merge_before(struct vma_merge_struct *vmg) > > { > > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); > > > > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg) > > * > > * We assume that vma is not removed as part of the merge. > > */ > > -bool can_vma_merge_after(struct vma_merge_struct *vmg) > > +static bool can_vma_merge_after(struct vma_merge_struct *vmg) > > { > > if (is_mergeable_vma(vmg, false) && > > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { > > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) > > return false; > > } > > > > +static void __vma_link_file(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_allow_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_insert(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * Requires inode->i_mapping->i_mmap_rwsem > > + */ > > +static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_unmap_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_remove(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * vma_prepare() - Helper function for handling locking VMAs prior to altering > > + * @vp: The initialized vma_prepare struct > > + */ > > +static void vma_prepare(struct vma_prepare *vp) > > +{ > > + if (vp->file) { > > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > + > > + if (vp->adj_next) > > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > + vp->adj_next->vm_end); > > + > > + i_mmap_lock_write(vp->mapping); > > + if (vp->insert && vp->insert->vm_file) { > > + /* > > + * Put into interval tree now, so instantiated pages > > + * are visible to arm/parisc __flush_dcache_page > > + * throughout; but we cannot insert into address > > + * space until vma start or end is updated. 
> > + */ > > + __vma_link_file(vp->insert, > > + vp->insert->vm_file->f_mapping); > > + } > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_lock_write(vp->anon_vma); > > + anon_vma_interval_tree_pre_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > + } > > + > > + if (vp->file) { > > + flush_dcache_mmap_lock(vp->mapping); > > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > + if (vp->adj_next) > > + vma_interval_tree_remove(vp->adj_next, > > + &vp->mapping->i_mmap); > > + } > > + > > +} > > + > > +/* > > + * vma_complete- Helper function for handling the unlocking after altering VMAs, > > + * or for inserting a VMA. > > + * > > + * @vp: The vma_prepare struct > > + * @vmi: The vma iterator > > + * @mm: The mm_struct > > + */ > > +static void vma_complete(struct vma_prepare *vp, > > + struct vma_iterator *vmi, struct mm_struct *mm) > > +{ > > + if (vp->file) { > > + if (vp->adj_next) > > + vma_interval_tree_insert(vp->adj_next, > > + &vp->mapping->i_mmap); > > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > + flush_dcache_mmap_unlock(vp->mapping); > > + } > > + > > + if (vp->remove && vp->file) { > > + __remove_shared_vm_struct(vp->remove, vp->mapping); > > + if (vp->remove2) > > + __remove_shared_vm_struct(vp->remove2, vp->mapping); > > + } else if (vp->insert) { > > + /* > > + * split_vma has split insert from vma, and needs > > + * us to insert it before dropping the locks > > + * (it may either follow vma or precede it). > > + */ > > + vma_iter_store(vmi, vp->insert); > > + mm->map_count++; > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_interval_tree_post_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_post_update_vma(vp->adj_next); > > + anon_vma_unlock_write(vp->anon_vma); > > + } > > + > > + if (vp->file) { > > + i_mmap_unlock_write(vp->mapping); > > + uprobe_mmap(vp->vma); > > + > > + if (vp->adj_next) > > + uprobe_mmap(vp->adj_next); > > + } > > + > > + if (vp->remove) { > > +again: > > + vma_mark_detached(vp->remove, true); > > + if (vp->file) { > > + uprobe_munmap(vp->remove, vp->remove->vm_start, > > + vp->remove->vm_end); > > + fput(vp->file); > > + } > > + if (vp->remove->anon_vma) > > + anon_vma_merge(vp->vma, vp->remove); > > + mm->map_count--; > > + mpol_put(vma_policy(vp->remove)); > > + if (!vp->remove2) > > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > + vm_area_free(vp->remove); > > + > > + /* > > + * In mprotect's case 6 (see comments on vma_merge), > > + * we are removing both mid and next vmas > > + */ > > + if (vp->remove2) { > > + vp->remove = vp->remove2; > > + vp->remove2 = NULL; > > + goto again; > > + } > > + } > > + if (vp->insert && vp->file) > > + uprobe_mmap(vp->insert); > > + validate_mm(mm); > > +} > > + > > +/* > > + * init_vma_prep() - Initializer wrapper for vma_prepare struct > > + * @vp: The vma_prepare struct > > + * @vma: The vma that will be altered once locked > > + */ > > +static void init_vma_prep(struct vma_prepare *vp, > > + struct vm_area_struct *vma) > > +{ > > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > +} > > + > > /* > > * Close a vm structure and free it. 
> > */ > > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > > vm_unacct_memory(nr_accounted); > > } > > > > -/* > > - * init_vma_prep() - Initializer wrapper for vma_prepare struct > > - * @vp: The vma_prepare struct > > - * @vma: The vma that will be altered once locked > > - */ > > -void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma) > > -{ > > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > -} > > - > > -/* > > - * Requires inode->i_mapping->i_mmap_rwsem > > - */ > > -static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_unmap_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_remove(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > /* > > * vma has some anon_vma assigned, and is already inserted on that > > * anon_vma's interval trees. > > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > > } > > > > -static void __vma_link_file(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_allow_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_insert(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > -/* > > - * vma_prepare() - Helper function for handling locking VMAs prior to altering > > - * @vp: The initialized vma_prepare struct > > - */ > > -void vma_prepare(struct vma_prepare *vp) > > -{ > > - if (vp->file) { > > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > - > > - if (vp->adj_next) > > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > - vp->adj_next->vm_end); > > - > > - i_mmap_lock_write(vp->mapping); > > - if (vp->insert && vp->insert->vm_file) { > > - /* > > - * Put into interval tree now, so instantiated pages > > - * are visible to arm/parisc __flush_dcache_page > > - * throughout; but we cannot insert into address > > - * space until vma start or end is updated. > > - */ > > - __vma_link_file(vp->insert, > > - vp->insert->vm_file->f_mapping); > > - } > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_lock_write(vp->anon_vma); > > - anon_vma_interval_tree_pre_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > - } > > - > > - if (vp->file) { > > - flush_dcache_mmap_lock(vp->mapping); > > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > - if (vp->adj_next) > > - vma_interval_tree_remove(vp->adj_next, > > - &vp->mapping->i_mmap); > > - } > > - > > -} > > - > > /* > > * dup_anon_vma() - Helper function to duplicate anon_vma > > * @dst: The destination VMA > > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm) > > } > > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > > > +/* > > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > > + * > > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > > + * > > + * We are about to add a VMA to the address space starting at @vmg->start and > > + * ending at @vmg->end. There are three different possible scenarios: > > + * > > + * 1. 
There is a VMA with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > > + * EXPAND that VMA: > > + * > > + * Proposed: |-----| or |-----| > > + * Existing: |----| |----| > > + * > > + * 2. There are VMAs with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > > + * EXPAND the former and REMOVE the latter: > > + * > > + * Proposed: |-----| > > + * Existing: |----| |----| > > + * > > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. > > We still have diagrams, that's too bad. But they're cute ones! Upgrade right? > > > + * > > + * In instances where we can merge, this function returns the expanded VMA which > > + * will have its range adjusted accordingly and the underlying maple tree also > > + * adjusted. > > + * > > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > > + * to the VMA we expanded. > > + * > > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > > + * expanded range. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. > > + */ > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > +{ > > + bool is_special = vmg->flags & VM_SPECIAL; > > + struct vm_area_struct *prev = vmg->prev; > > + struct vm_area_struct *next = vmg->next; > > + unsigned long start = vmg->start; > > + unsigned long end = vmg->end; > > + pgoff_t pgoff = vmg->pgoff; > > + pgoff_t pglen = PHYS_PFN(end - start); > > + > > + VM_WARN_ON(vmg->vma); > > + > > + if (!prev && !next) { > > + /* > > + * Since the caller must have determined that the requested > > + * range is empty, vmg->vmi will be left pointing at the VMA > > + * immediately prior. > > + */ > > + next = vmg->next = vma_next(vmg->vmi); > > + prev = vmg->prev = vma_prev(vmg->vmi); > > + > > + /* Avoid maple tree re-walk. */ > > + if (is_special && prev) > > + vma_iter_next_range(vmg->vmi); > > + } > > + > > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > > + if (is_special || (!prev && !next)) > > + return NULL; > > + > > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > > + /* > > + * We can adjust this here as can_vma_merge_after() doesn't > > + * touch vmg->end. > > + */ > > + vmg->end = next->vm_end; > > + vmg->vma = next; > > + vmg->pgoff = next->vm_pgoff - pglen; > > + > > + vmg->anon_vma = next->anon_vma; > > + } > > + > > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > > + vmg->start = prev->vm_start; > > + vmg->vma = prev; > > + vmg->pgoff = prev->vm_pgoff; > > + } else if (prev) { > > + vma_iter_next_range(vmg->vmi); > > + } > > + > > + /* > > + * Now try to expand adjacent VMA(s). This takes care of removing the > > + * following VMA if we have VMAs on both sides. > > + */ > > + if (vmg->vma && !vma_expand(vmg)) { > > + khugepaged_enter_vma(vmg->vma, vmg->flags); > > + return vmg->vma; > > + } > > + > > + /* If expansion failed, reset state. Allows us to retry merge later. 
*/ > > + vmg->vma = NULL; > > + vmg->anon_vma = NULL; > > + vmg->start = start; > > + vmg->end = end; > > + vmg->pgoff = pgoff; > > + if (vmg->vma == prev) > > + vma_iter_set(vmg->vmi, start); > > + > > + return NULL; > > +} > > Can we split this up a bit? I was thinking that, for the brk() case, we > need to know if we can merge prev and if that merge fails. I was > thinking something that you create a vmg with whatever, then call > can_merge_prev, and that'd do the block above and change the vmg as > required. We could have a can_merge_next that does the same, then we > need to prepare the change (dup anon vma, preallocate for maple tree, > locking, whatever), then commit. Yeah that's not a bad idea, that could actually help really help clarify what's going on. Then could have a sort of state machine that indicates that we've already adjusted vmg parameters for the merge. I'm thinking though of a vma_merge_new_vma() / vma_merge_modified_vma() that invokes different code to figure out how to expand. I will have a fiddle around and see what I can figure out that makes sense. > > There could still be the function above, but with smaller widgets to do > what we need so we gain flexibility in what we decide to check - prev > only in brk(). > > I'm not sure if we'd need one for expanding vs existing or if we could > check !vmg->vma to figure that out.. > > This would also have the effect of self-documenting what is going on. > For brk, it would look like this: > > if (vmg_expand_prev()) { > if (vmg_prepare()) > goto no_mem; > vmg_commit(); > } > > I think this would change your exposed interface, at least for brk() - > or a wrapper for this, but small widgets may gain us some > self-documented code? > > If you really don't like the exposure of the interface, the vmg could > have a return so we can see if we ran out of memory? I really don't like can_vma_merge_xxx() being exposed, it's very clearly an internal interface. As mentioned above we can have some kind of way of passing back an error code. Obviously if testing indicates stack size/perf is a problem we can begrudgingly accept the interface leak :'(. Will check that. > > > + > > /* > > * vma_expand - Expand an existing VMA > > * > > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm) > > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with > > * vmg->next needs to be handled by the caller. > > * > > - * Returns: 0 on success > > + * Returns: 0 on success. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. > > + * - The caller must have set @vmg->prev and @vmg->next. > > */ > > int vma_expand(struct vma_merge_struct *vmg) > > { > > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg) > > return 0; > > } > > > > -/* > > - * vma_complete- Helper function for handling the unlocking after altering VMAs, > > - * or for inserting a VMA. 
> > - * > > - * @vp: The vma_prepare struct > > - * @vmi: The vma iterator > > - * @mm: The mm_struct > > - */ > > -void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm) > > -{ > > - if (vp->file) { > > - if (vp->adj_next) > > - vma_interval_tree_insert(vp->adj_next, > > - &vp->mapping->i_mmap); > > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > - flush_dcache_mmap_unlock(vp->mapping); > > - } > > - > > - if (vp->remove && vp->file) { > > - __remove_shared_vm_struct(vp->remove, vp->mapping); > > - if (vp->remove2) > > - __remove_shared_vm_struct(vp->remove2, vp->mapping); > > - } else if (vp->insert) { > > - /* > > - * split_vma has split insert from vma, and needs > > - * us to insert it before dropping the locks > > - * (it may either follow vma or precede it). > > - */ > > - vma_iter_store(vmi, vp->insert); > > - mm->map_count++; > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_interval_tree_post_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_post_update_vma(vp->adj_next); > > - anon_vma_unlock_write(vp->anon_vma); > > - } > > - > > - if (vp->file) { > > - i_mmap_unlock_write(vp->mapping); > > - uprobe_mmap(vp->vma); > > - > > - if (vp->adj_next) > > - uprobe_mmap(vp->adj_next); > > - } > > - > > - if (vp->remove) { > > -again: > > - vma_mark_detached(vp->remove, true); > > - if (vp->file) { > > - uprobe_munmap(vp->remove, vp->remove->vm_start, > > - vp->remove->vm_end); > > - fput(vp->file); > > - } > > - if (vp->remove->anon_vma) > > - anon_vma_merge(vp->vma, vp->remove); > > - mm->map_count--; > > - mpol_put(vma_policy(vp->remove)); > > - if (!vp->remove2) > > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > - vm_area_free(vp->remove); > > - > > - /* > > - * In mprotect's case 6 (see comments on vma_merge), > > - * we are removing both mid and next vmas > > - */ > > - if (vp->remove2) { > > - vp->remove = vp->remove2; > > - vp->remove2 = NULL; > > - goto again; > > - } > > - } > > - if (vp->insert && vp->file) > > - uprobe_mmap(vp->insert); > > - validate_mm(mm); > > -} > > - > > /* > > * do_vmi_align_munmap() - munmap the aligned region from @start to @end. > > * @vmi: The vma iterator > > @@ -1261,20 +1378,6 @@ struct vm_area_struct > > return vma_modify(&vmg); > > } > > > > -/* > > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > > - * must ensure that [start, end) does not overlap any existing VMA. > > - */ > > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > -{ > > - if (!vmg->prev) { > > - vmg->prev = vma_prev(vmg->vmi); > > - vma_iter_set(vmg->vmi, vmg->start); > > - } > > - > > - return vma_merge(vmg); > > -} > > - > > /* > > * Expand vma by delta bytes, potentially merging with an immediately adjacent > > * VMA with identical properties. > > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > .anon_name = anon_vma_name(vma), > > }; > > > > - /* vma is specified as prev, so case 1 or 2 will apply. 
*/ > > - return vma_merge(&vmg); > > + return vma_merge_new_vma(&vmg); > > } > > > > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > > struct vm_area_struct *vma = *vmap; > > unsigned long vma_start = vma->vm_start; > > struct mm_struct *mm = vma->vm_mm; > > - struct vm_area_struct *new_vma, *prev; > > + struct vm_area_struct *new_vma; > > bool faulted_in_anon_vma = true; > > VMA_ITERATOR(vmi, mm, addr); > > + struct vma_merge_struct vmg = { > > + .vmi = &vmi, > > + .start = addr, > > + .end = addr + len, > > + .flags = vma->vm_flags, > > + .pgoff = pgoff, > > + .file = vma->vm_file, > > + .anon_vma = vma->anon_vma, > > + .policy = vma_policy(vma), > > + .uffd_ctx = vma->vm_userfaultfd_ctx, > > + .anon_name = anon_vma_name(vma), > > + }; > > > > /* > > * If anonymous vma has not yet been faulted, update new pgoff > > * to match new location, to increase its chance of merging. > > */ > > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > > - pgoff = addr >> PAGE_SHIFT; > > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT; > > faulted_in_anon_vma = false; > > } > > > > - new_vma = find_vma_prev(mm, addr, &prev); > > + new_vma = find_vma_prev(mm, addr, &vmg.prev); > > if (new_vma && new_vma->vm_start < addr + len) > > return NULL; /* should never get here */ > > > > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff); > > + vmg.next = vma_next(&vmi); > > + vma_prev(&vmi); > > + > > + new_vma = vma_merge_new_vma(&vmg); > > + > > if (new_vma) { > > /* > > * Source vma may have been merged into new_vma > > diff --git a/mm/vma.h b/mm/vma.h > > index 50459f9e4c7f..bbb173053f34 100644 > > --- a/mm/vma.h > > +++ b/mm/vma.h > > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); > > /* Required for expand_downwards(). */ > > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); > > > > -/* Required for do_brk_flags(). */ > > -void vma_prepare(struct vma_prepare *vp); > > - > > -/* Required for do_brk_flags(). */ > > -void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma); > > - > > -/* Required for do_brk_flags(). */ > > -void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm); > > - > > int vma_expand(struct vma_merge_struct *vmg); > > int vma_shrink(struct vma_merge_struct *vmg); > > > > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas, > > struct vm_area_struct *next, unsigned long start, > > unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > > > -/* > > - * Can we merge the VMA described by vmg into the following VMA vmg->next? > > - * > > - * Required by mmap_region(). > > - */ > > -bool can_vma_merge_before(struct vma_merge_struct *vmg); > > - > > -/* > > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? > > - * > > - * Required by mmap_region() and do_brk_flags(). > > - */ > > -bool can_vma_merge_after(struct vma_merge_struct *vmg); > > - > > /* We are about to modify the VMA's flags. */ > > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, > > struct vm_area_struct *prev, > > @@ -133,31 +108,7 @@ struct vm_area_struct > > unsigned long new_flags, > > struct vm_userfaultfd_ctx new_ctx); > > > > -struct vm_area_struct > > -*vma_merge_new_vma(struct vma_merge_struct *vmg); > > - > > -/* Temporary convenience wrapper. 
*/ > > -static inline struct vm_area_struct > > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev, > > - struct vm_area_struct *vma, unsigned long start, > > - unsigned long end, pgoff_t pgoff) > > -{ > > - struct vma_merge_struct vmg = { > > - .vmi = vmi, > > - .prev = prev, > > - .start = start, > > - .end = end, > > - .flags = vma->vm_flags, > > - .file = vma->vm_file, > > - .anon_vma = vma->anon_vma, > > - .pgoff = pgoff, > > - .policy = vma_policy(vma), > > - .uffd_ctx = vma->vm_userfaultfd_ctx, > > - .anon_name = anon_vma_name(vma), > > - }; > > - > > - return vma_merge_new_vma(&vmg); > > -} > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg); > > > > /* > > * Temporary wrapper around vma_merge() so we can have a common interface for > > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > > index 40797a819d3d..a39a734282d0 100644 > > --- a/tools/testing/vma/vma_internal.h > > +++ b/tools/testing/vma/vma_internal.h > > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) > > mas_destroy(&vmi->mas); > > } > > > > +static inline > > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) > > +{ > > + return mas_next_range(&vmi->mas, ULONG_MAX); > > +} > > + > > static inline void vm_acct_memory(long pages) > > { > > } > > -- > > 2.45.2 > >
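P.S. To make the "why did it fail" idea above a bit more concrete, the rough shape I have in mind is an outcome field on the vmg - something like the below. All names here are invented and this is completely untested, it's just to illustrate the distinction between "not mergeable" and "mergeable but we hit -ENOMEM":

	/* Recorded by vma_merge_new_vma() so callers can see why NULL was returned. */
	enum vma_merge_state {
		VMA_MERGE_NOMERGE,	/* adjacent VMAs were not mergeable */
		VMA_MERGE_SUCCESS,	/* merge/expansion completed */
		VMA_MERGE_ERROR_NOMEM,	/* merge was possible but allocation failed */
	};

	/* Then in do_brk_flags(), roughly: */
	if (vma_merge_new_vma(&vmg))
		goto out;
	if (vmg.state == VMA_MERGE_ERROR_NOMEM)
		goto unacct_fail;
	/* VMA_MERGE_NOMERGE - fall through and allocate a new VMA as before. */

That would keep can_vma_merge_before()/after() internal to vma.c while still letting do_brk_flags() bail out to unacct_fail only on a real allocation failure rather than on any failed merge attempt.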