On 12/1/21 15:29, Liam Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>
>
> Avoid allocating a new VMA when a vma modification can occur.  When a
> brk() can expand or contract a VMA, then the single store operation will
> only modify one index of the maple tree instead of causing a node to
> split or coalesce.  This avoids unnecessary allocations/frees of maple
> tree nodes and VMAs.
>
> Use the advanced API for the maple tree to avoid unnecessary walks of
> the tree.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
> ---
>  mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 207 insertions(+), 51 deletions(-)
>
> +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
> +		unsigned long addr, unsigned long len,
> +		unsigned long flags)
>  {
>  	struct mm_struct *mm = current->mm;
> -	struct vm_area_struct *vma, *prev;
> -	pgoff_t pgoff = addr >> PAGE_SHIFT;
> +	struct vm_area_struct *prev = NULL;
>  	int error;
>  	unsigned long mapped_addr;
>  	validate_mm_mt(mm);
> @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>  	if (error)
>  		return error;
>
> -	/* Clear old maps, set up prev and uf */
> -	if (munmap_vma_range(mm, addr, len, &prev, uf))
> -		return -ENOMEM;
> -
> -	/* Check against address space limits *after* clearing old maps... */
> +	/* Check against address space limits by the changed size */

Can that cause spurious ENOMEM because now the check assumes 'len' worth
of purely new pages and no reuse?

>  	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
>  		return -ENOMEM;
>
> @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>  	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
>  		return -ENOMEM;
>
> -	/* Can we just expand an old private anonymous mapping? */
> -	vma = vma_merge(mm, prev, addr, addr + len, flags,
> -			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
> -	if (vma)
> -		goto out;
> +	mas->last = addr + len - 1;
> +	if (vma) {
> +		/* Expand the existing vma if possible; almost never a singular
> +		 * list, so this will almost always fail. */
>
> -	/*
> -	 * create a vma struct for an anonymous mapping
> -	 */
> -	vma = vm_area_alloc(mm);
> -	if (!vma) {
> -		vm_unacct_memory(len >> PAGE_SHIFT);
> -		return -ENOMEM;
> +		if ((!vma->anon_vma ||
> +		     list_is_singular(&vma->anon_vma_chain)) &&

Hmm, I feel uneasy about this part that mimics what vma_merge() does.
It looks like something we could easily forget to adjust when changing
vma_merge() itself. Is this optimization worth the trouble, given the
comment above says "so this will almost always fail"?
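If the fast path is kept, perhaps the eligibility test could live in a
small shared helper so vma_merge() and this open-coded check can't
silently drift apart. A rough, untested sketch; the helper name
can_expand_anon_vma() is made up here, not something the patch
introduces:

static inline bool can_expand_anon_vma(struct vm_area_struct *vma,
				       unsigned long vm_flags)
{
	/*
	 * Same checks the brk() fast path above open-codes: either no
	 * anon_vma at all, or an anon_vma_chain with a single entry,
	 * and identical flags modulo VM_SOFTDIRTY.
	 */
	if (vma->anon_vma && !list_is_singular(&vma->anon_vma_chain))
		return false;

	return (vma->vm_flags & ~VM_SOFTDIRTY) == vm_flags;
}

Then a future change to the merge rules has one place to update instead
of two.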
> +		    ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
> +			mas->index = vma->vm_start;
> +
> +			vma_adjust_trans_huge(vma, addr, addr + len, 0);
> +			if (vma->anon_vma) {
> +				anon_vma_lock_write(vma->anon_vma);
> +				anon_vma_interval_tree_pre_update_vma(vma);
> +			}
> +			vma->vm_end = addr + len;
> +			vma->vm_flags |= VM_SOFTDIRTY;
> +			if (mas_store_gfp(mas, vma, GFP_KERNEL))
> +				goto mas_mod_fail;
> +
> +			if (vma->anon_vma) {
> +				anon_vma_interval_tree_post_update_vma(vma);
> +				anon_vma_unlock_write(vma->anon_vma);
> +			}
> +			khugepaged_enter_vma_merge(vma, flags);
> +			goto out;
> +		}
> +		prev = vma;
>  	}
> +	mas->index = addr;
> +	mas_walk(mas);
> +
> +	/* create a vma struct for an anonymous mapping */
> +	vma = vm_area_alloc(mm);
> +	if (!vma)
> +		goto vma_alloc_fail;
>
>  	vma_set_anonymous(vma);
>  	vma->vm_start = addr;
>  	vma->vm_end = addr + len;
> -	vma->vm_pgoff = pgoff;
> +	vma->vm_pgoff = addr >> PAGE_SHIFT;
>  	vma->vm_flags = flags;
>  	vma->vm_page_prot = vm_get_page_prot(flags);
> -	vma_link(mm, vma, prev);
> +	if (vma_mas_store(vma, mas))
> +		goto mas_store_fail;
> +
> +	if (!prev)
> +		prev = mas_prev(mas, 0);
> +
> +	__vma_link_list(mm, vma, prev);
> +	mm->map_count++;
>  out:
>  	perf_event_mmap(vma);
>  	mm->total_vm += len >> PAGE_SHIFT;
> @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
>  	vma->vm_flags |= VM_SOFTDIRTY;
>  	validate_mm_mt(mm);
>  	return 0;
> +
> +mas_store_fail:
> +	vm_area_free(vma);
> +vma_alloc_fail:
> +	vm_unacct_memory(len >> PAGE_SHIFT);
> +	return -ENOMEM;
> +
> +mas_mod_fail:
> +	vma->vm_end = addr;
> +	if (vma->anon_vma) {
> +		anon_vma_interval_tree_post_update_vma(vma);
> +		anon_vma_unlock_write(vma->anon_vma);
> +	}
> +	return -ENOMEM;
> +
>  }
>
>  int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
>  {
>  	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *vma = NULL;
>  	unsigned long len;
>  	int ret;
>  	bool populate;
> -	LIST_HEAD(uf);
> +	MA_STATE(mas, &mm->mm_mt, addr, addr);
>
>  	len = PAGE_ALIGN(request);
>  	if (len < request)
> @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
>  	if (mmap_write_lock_killable(mm))
>  		return -EINTR;
>
> -	ret = do_brk_flags(addr, len, flags, &uf);
> +	// This vma left intentionally blank.

This comment is using unintentionally bad syntax (// instead of /* */).

Also, if we leave it blank it means this path won't ever expand an
existing vma, while previously it could succeed in vma_merge(), no? Or
are all callers of vm_brk_flags() in a scenario where there's no expand
anyway? Maybe just have a more verbose comment...

> +	mas_walk(&mas);
> +	ret = do_brk_flags(&mas, vma, addr, len, flags);
>  	populate = ((mm->def_flags & VM_LOCKED) != 0);
>  	mmap_write_unlock(mm);
> -	userfaultfd_unmap_complete(mm, &uf);

Looks like this part is removed completely from the vm_brk_flags()
paths? OK, it seems the whole patch makes the assumption that
vm_brk_flags() never has to unmap a pre-existing area; in the brk()
syscall this is now delegated to do_brk_munmap(), and do_brk_flags()
loses the support. While it might be safe, it should be discussed in the
patch that vm_brk_flags() didn't actually need to support the unmap
part, because x y z. And best if there are some DEBUG_VM based
assertions supporting that (a rough sketch below). But then again, is
the optimized scenario happening often enough to warrant it?

>  	if (populate && !ret)
>  		mm_populate(addr, len);
>  	return ret;
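Re the "//" comment and the missing assertion: maybe something like this
in vm_brk_flags(), after taking the mmap lock, would both fix the
comment style and make the no-preexisting-mapping assumption checkable.
Untested sketch; find_vma_intersection() is the existing helper, the
comment wording is mine:

	/*
	 * vma is intentionally left NULL: callers of vm_brk_flags() are
	 * expected to pass a range with nothing currently mapped in it,
	 * so there is never an existing vma to expand here and nothing
	 * for the old munmap/userfaultfd handling to do.
	 */
	VM_WARN_ON(find_vma_intersection(mm, addr, addr + len) != NULL);

That would at least document why dropping the unmap support from
do_brk_flags() is safe for these callers.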