Thanks for taking a look! :) I know this one is a bit painful... On Mon, Mar 03, 2025 at 12:12:07PM -0500, Liam R. Howlett wrote: > * Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> [250303 06:08]: > > Place checks into a separate function so the mremap() system call is less > > egregiously long, remove unnecessary mremap_to() offset_in_page() check and > > just check that earlier so we keep all such basic checks together. > > > > Separate out the VMA in-place expansion, hugetlb and expand/move logic into > > separate, readable functions. > > > > De-duplicate code where possible, add comments and ensure that all error > > handling explicitly specifies the error at the point of it occurring rather > > than setting a prefixed error value and implicitly setting (which is bug > > prone). > > > > This lays the groundwork for subsequent patches further simplifying and > > extending the mremap() implementation. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > > --- > > mm/mremap.c | 405 ++++++++++++++++++++++++++++++++-------------------- > > 1 file changed, 251 insertions(+), 154 deletions(-) > > > > diff --git a/mm/mremap.c b/mm/mremap.c > > index c3e4c86d0b8d..c4abda8dfc57 100644 > > --- a/mm/mremap.c > > +++ b/mm/mremap.c > > @@ -942,33 +942,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, > > unsigned long ret; > > unsigned long map_flags = 0; > > > > - if (offset_in_page(new_addr)) > > - return -EINVAL; > > - > > + /* Is the new length or address silly? */ > > if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) > > return -EINVAL; > > > > - /* Ensure the old/new locations do not overlap */ > > + /* Ensure the old/new locations do not overlap. */ > > if (addr + old_len > new_addr && new_addr + new_len > addr) > > return -EINVAL; > > > > - /* > > - * move_vma() need us to stay 4 maps below the threshold, otherwise > > - * it will bail out at the very beginning. > > - * That is a problem if we have already unmaped the regions here > > - * (new_addr, and old_addr), because userspace will not know the > > - * state of the vma's after it gets -ENOMEM. > > - * So, to avoid such scenario we can pre-compute if the whole > > - * operation has high chances to success map-wise. > > - * Worst-scenario case is when both vma's (new_addr and old_addr) get > > - * split in 3 before unmapping it. > > - * That means 2 more maps (1 for each) to the ones we already hold. > > - * Check whether current map count plus 2 still leads us to 4 maps below > > - * the threshold, otherwise return -ENOMEM here to be more safe. > > - */ > > - if ((mm->map_count + 2) >= sysctl_max_map_count - 3) > > - return -ENOMEM; > > - > > if (flags & MREMAP_FIXED) { > > /* > > * In mremap_to(). > > @@ -1035,6 +1016,218 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) > > return 1; > > } > > > > +/* Do the mremap() flags require that the new_addr parameter be specified? */ > > +static bool implies_new_addr(unsigned long flags) > > +{ > > + return flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); > > +} > > + > > +/* > > + * Are the parameters passed to mremap() valid? If so return 0, otherwise return > > + * error. > > + */ > > +static unsigned long check_mremap_params(unsigned long addr, > > + unsigned long flags, > > + unsigned long old_len, > > + unsigned long new_len, > > + unsigned long new_addr) > > If you use two tabs for the arguments this will be more compact and more > readable. Sure, but a later commits eliminates this :) > > > +{ > > + /* Ensure no unexpected flag values. */ > > + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) > > + return -EINVAL; > > + > > + /* Start address must be page-aligned. */ > > + if (offset_in_page(addr)) > > + return -EINVAL; > > + > > + /* > > + * We allow a zero old-len as a special case > > + * for DOS-emu "duplicate shm area" thing. But > > + * a zero new-len is nonsensical. > > + */ > > + if (!PAGE_ALIGN(new_len)) > > + return -EINVAL; > > + > > + /* Remainder of checks are for cases with specific new_addr. */ > > + if (!implies_new_addr(flags)) > > + return 0; > > + > > + /* The new address must be page-aligned. */ > > + if (offset_in_page(new_addr)) > > + return -EINVAL; > > + > > + /* A fixed address implies a move. */ > > + if (!(flags & MREMAP_MAYMOVE)) > > + return -EINVAL; > > + > > + /* MREMAP_DONTUNMAP does not allow resizing in the process. */ > > + if (flags & MREMAP_DONTUNMAP && old_len != new_len) > > + return -EINVAL; > > + > > + /* > > + * move_vma() need us to stay 4 maps below the threshold, otherwise > > + * it will bail out at the very beginning. > > + * That is a problem if we have already unmaped the regions here > > + * (new_addr, and old_addr), because userspace will not know the > > + * state of the vma's after it gets -ENOMEM. > > + * So, to avoid such scenario we can pre-compute if the whole > > + * operation has high chances to success map-wise. > > + * Worst-scenario case is when both vma's (new_addr and old_addr) get > > + * split in 3 before unmapping it. > > + * That means 2 more maps (1 for each) to the ones we already hold. > > + * Check whether current map count plus 2 still leads us to 4 maps below > > + * the threshold, otherwise return -ENOMEM here to be more safe. > > + */ > > + if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) > > + return -ENOMEM; > > + > > + return 0; > > +} > > + > > +/* > > + * We know we can expand the VMA in-place by delta pages, so do so. > > + * > > + * If we discover the VMA is locked, update mm_struct statistics accordingly and > > + * indicate so to the caller. > > + */ > > +static unsigned long expand_vma_inplace(struct vm_area_struct *vma, > > + unsigned long delta, bool *locked) > > +{ > > + struct mm_struct *mm = current->mm; > > + long pages = delta >> PAGE_SHIFT; > > + VMA_ITERATOR(vmi, mm, vma->vm_end); > > + long charged = 0; > > + > > + if (vma->vm_flags & VM_ACCOUNT) { > > + if (security_vm_enough_memory_mm(mm, pages)) > > + return -ENOMEM; > > + > > + charged = pages; > > + } > > + > > + /* > > + * Function vma_merge_extend() is called on the > > + * extension we are adding to the already existing vma, > > + * vma_merge_extend() will merge this extension with the > > + * already existing vma (expand operation itself) and > > + * possibly also with the next vma if it becomes > > + * adjacent to the expanded vma and otherwise > > + * compatible. > > + */ > > + vma = vma_merge_extend(&vmi, vma, delta); > > + if (!vma) { > > + vm_unacct_memory(charged); > > + return -ENOMEM; > > + } > > + > > + vm_stat_account(mm, vma->vm_flags, pages); > > + if (vma->vm_flags & VM_LOCKED) { > > + mm->locked_vm += pages; > > + *locked = true; > > + } > > + > > + return 0; > > +} > > + > > +static bool align_hugetlb(struct vm_area_struct *vma, > > + unsigned long addr, > > + unsigned long new_addr, > > + unsigned long *old_len_ptr, > > + unsigned long *new_len_ptr, > > + unsigned long *delta_ptr) > > +{ > > + unsigned long old_len = *old_len_ptr; > > + unsigned long new_len = *new_len_ptr; > > + struct hstate *h __maybe_unused = hstate_vma(vma); > > + > > + old_len = ALIGN(old_len, huge_page_size(h)); > > + new_len = ALIGN(new_len, huge_page_size(h)); > > + > > + /* addrs must be huge page aligned */ > > + if (addr & ~huge_page_mask(h)) > > + return false; > > + if (new_addr & ~huge_page_mask(h)) > > + return false; > > + > > + /* > > + * Don't allow remap expansion, because the underlying hugetlb > > + * reservation is not yet capable to handle split reservation. > > + */ > > + if (new_len > old_len) > > + return false; > > + > > + *old_len_ptr = old_len; > > + *new_len_ptr = new_len; > > + *delta_ptr = abs_diff(old_len, new_len); > > + return true; > > +} > > + > > +/* > > + * We are mremap()'ing without specifying a fixed address to move to, but are > > + * requesting that the VMA's size be increased. > > + * > > + * Try to do so in-place, if this fails, then move the VMA to a new location to > > + * action the change. > > + */ > > +static unsigned long expand_vma(struct vm_area_struct *vma, > > + unsigned long addr, unsigned long old_len, > > + unsigned long new_len, unsigned long flags, > > + bool *locked_ptr, unsigned long *new_addr_ptr, > > + struct vm_userfaultfd_ctx *uf_ptr, > > + struct list_head *uf_unmap_ptr) > > +{ > > + unsigned long err; > > + unsigned long map_flags; > > + unsigned long new_addr; /* We ignore any user-supplied one. */ > > + pgoff_t pgoff; > > + > > + err = resize_is_valid(vma, addr, old_len, new_len, flags); > > + if (err) > > + return err; > > + > > + /* > > + * [addr, old_len) spans precisely to the end of the VMA, so try to > > + * expand it in-place. > > + */ > > + if (old_len == vma->vm_end - addr && > > + vma_expandable(vma, new_len - old_len)) { > > + err = expand_vma_inplace(vma, new_len - old_len, locked_ptr); > > You use delta twice here (new_len - old_len). I assume this is > different in the next patches. > > > + if (IS_ERR_VALUE(err)) > > + return err; > > Doesn't expand_vma_inplace() return 0 on success, error otherwise? Yeah, this is copying some already dubious logic from the original (trying to _somewhat_ minimise the delta here). At any rate, a later commit addresses this! > > > + > > + /* > > + * We want to populate the newly expanded portion of the VMA to > > + * satisfy the expectation that mlock()'ing a VMA maintains all > > + * of its pages in memory. > > + */ > > + if (*locked_ptr) > > + *new_addr_ptr = addr; > > + > > + /* OK we're done! */ > > + return addr; > > + } > > + > > + /* > > + * We weren't able to just expand or shrink the area, > > + * we need to create a new one and move it. > > + */ > > + > > So it's more of an expand_or_move_vma()? I think that's misleading, because it would be expand_or_move_and_expand_vma() or expand_in_place_or_move_and_expand()... Unavoidably we have to abbreviate, I tried to differentiate between the two cases with the 'in place' stuff. So we _try_ to do it in place, if not we have to expand elsewhere. > > > + /* We're not allowed to move the VMA, so error out. */ > > + if (!(flags & MREMAP_MAYMOVE)) > > + return -ENOMEM; > > and by flags we mean... the flags from the syscall. This gets confusing > with map_flags. At least there's only two and not six flags. 3 flags (MREMAP_FIXED, MREMAP_MAYMOVE, MREMAP_DONTUNMAP) :>) This becomes clearer later, I'm not sure saying mremap_flags really adds anything but noise though. > > > + > > + /* Find a new location to move the VMA to. */ > > + map_flags = (vma->vm_flags & VM_MAYSHARE) ? MAP_SHARED : 0; > > + pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); > > + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, pgoff, map_flags); > > + if (IS_ERR_VALUE(new_addr)) > > + return new_addr; > > + *new_addr_ptr = new_addr; > > + > > + return move_vma(vma, addr, old_len, new_len, new_addr, > > + locked_ptr, flags, uf_ptr, uf_unmap_ptr); > > +} > > + > > /* > > * Expand (or shrink) an existing mapping, potentially moving it at the > > * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) > > @@ -1048,7 +1241,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, > > { > > struct mm_struct *mm = current->mm; > > struct vm_area_struct *vma; > > - unsigned long ret = -EINVAL; > > + unsigned long ret; > > + unsigned long delta; > > bool locked = false; > > struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; > > LIST_HEAD(uf_unmap_early); > > @@ -1067,70 +1261,38 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, > > */ > > addr = untagged_addr(addr); > > > > - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) > > - return ret; > > - > > - if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) > > - return ret; > > - > > - /* > > - * MREMAP_DONTUNMAP is always a move and it does not allow resizing > > - * in the process. > > - */ > > - if (flags & MREMAP_DONTUNMAP && > > - (!(flags & MREMAP_MAYMOVE) || old_len != new_len)) > > - return ret; > > - > > - > > - if (offset_in_page(addr)) > > + ret = check_mremap_params(addr, flags, old_len, new_len, new_addr); > > + if (ret) > > return ret; > > > > old_len = PAGE_ALIGN(old_len); > > new_len = PAGE_ALIGN(new_len); > > + delta = abs_diff(old_len, new_len); > > > > - /* > > - * We allow a zero old-len as a special case > > - * for DOS-emu "duplicate shm area" thing. But > > - * a zero new-len is nonsensical. > > - */ > > - if (!new_len) > > - return ret; > > - > > - if (mmap_write_lock_killable(current->mm)) > > + if (mmap_write_lock_killable(mm)) > > return -EINTR; > > + > > vma = vma_lookup(mm, addr); > > if (!vma) { > > ret = -EFAULT; > > goto out; > > } > > > > - /* Don't allow remapping vmas when they have already been sealed */ > > + /* If mseal()'d, mremap() is prohibited. */ > > if (!can_modify_vma(vma)) { > > ret = -EPERM; > > goto out; > > } > > This could be delayed to the munmap() call, which will also check to > ensure this would fail. > > It doesn't really cost anything though so I don't think it's worth it > here. Maybe something we can keep in mind for the future... Happy to address but I think would be better in a later commit, as this one is more like 'keep things the same but restructure', later commits do deeper changes. > > > > > - if (is_vm_hugetlb_page(vma)) { > > - struct hstate *h __maybe_unused = hstate_vma(vma); > > - > > - old_len = ALIGN(old_len, huge_page_size(h)); > > - new_len = ALIGN(new_len, huge_page_size(h)); > > - > > - /* addrs must be huge page aligned */ > > - if (addr & ~huge_page_mask(h)) > > - goto out; > > - if (new_addr & ~huge_page_mask(h)) > > - goto out; > > - > > - /* > > - * Don't allow remap expansion, because the underlying hugetlb > > - * reservation is not yet capable to handle split reservation. > > - */ > > - if (new_len > old_len) > > - goto out; > > + /* Align to hugetlb page size, if required. */ > > + if (is_vm_hugetlb_page(vma) && > > + !align_hugetlb(vma, addr, new_addr, &old_len, &new_len, &delta)) { > > + ret = -EINVAL; > > + goto out; > > } > > > > - if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { > > + /* Are we RELOCATING the VMA to a SPECIFIC address? */ > > + if (implies_new_addr(flags)) { > > ret = mremap_to(addr, old_len, new_addr, new_len, > > &locked, flags, &uf, &uf_unmap_early, > > &uf_unmap); > > @@ -1138,109 +1300,44 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, > > } > > > > /* > > - * Always allow a shrinking remap: that just unmaps > > - * the unnecessary pages.. > > - * do_vmi_munmap does all the needed commit accounting, and > > - * unlocks the mmap_lock if so directed. > > + * From here on in we are only RESIZING the VMA, attempting to do so > > + * in-place, moving the VMA if we cannot. > > */ > > - if (old_len >= new_len) { > > - VMA_ITERATOR(vmi, mm, addr + new_len); > > > > - if (old_len == new_len) { > > - ret = addr; > > - goto out; > > - } > > + /* NO-OP CASE - resizing to the same size. */ > > + if (new_len == old_len) { > > + ret = addr; > > + goto out; > > + } > > + > > + /* SHRINK CASE. Can always be done in-place. */ > > + if (new_len < old_len) { > > + VMA_ITERATOR(vmi, mm, addr + new_len); > > > > - ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, > > + /* > > + * Simply unmap the shrunken portion of the VMA. This does all > > + * the needed commit accounting, unlocking the mmap lock. > > + */ > > + ret = do_vmi_munmap(&vmi, mm, addr + new_len, delta, > > &uf_unmap, true); > > if (ret) > > goto out; > > > > + /* We succeeded, mmap lock released for us. */ > > ret = addr; > > goto out_unlocked; > > } > > > > - /* > > - * Ok, we need to grow.. > > - */ > > - ret = resize_is_valid(vma, addr, old_len, new_len, flags); > > - if (ret) > > - goto out; > > - > > - /* old_len exactly to the end of the area.. > > - */ > > - if (old_len == vma->vm_end - addr) { > > - unsigned long delta = new_len - old_len; > > - > > - /* can we just expand the current mapping? */ > > - if (vma_expandable(vma, delta)) { > > - long pages = delta >> PAGE_SHIFT; > > - VMA_ITERATOR(vmi, mm, vma->vm_end); > > - long charged = 0; > > - > > - if (vma->vm_flags & VM_ACCOUNT) { > > - if (security_vm_enough_memory_mm(mm, pages)) { > > - ret = -ENOMEM; > > - goto out; > > - } > > - charged = pages; > > - } > > - > > - /* > > - * Function vma_merge_extend() is called on the > > - * extension we are adding to the already existing vma, > > - * vma_merge_extend() will merge this extension with the > > - * already existing vma (expand operation itself) and > > - * possibly also with the next vma if it becomes > > - * adjacent to the expanded vma and otherwise > > - * compatible. > > - */ > > - vma = vma_merge_extend(&vmi, vma, delta); > > - if (!vma) { > > - vm_unacct_memory(charged); > > - ret = -ENOMEM; > > - goto out; > > - } > > + /* EXPAND case. We try to do in-place, if we can't, then we move it. */ > > + ret = expand_vma(vma, addr, old_len, new_len, flags, &locked, &new_addr, > > + &uf, &uf_unmap); > > > > - vm_stat_account(mm, vma->vm_flags, pages); > > - if (vma->vm_flags & VM_LOCKED) { > > - mm->locked_vm += pages; > > - locked = true; > > - new_addr = addr; > > - } > > - ret = addr; > > - goto out; > > - } > > - } > > - > > - /* > > - * We weren't able to just expand or shrink the area, > > - * we need to create a new one and move it.. > > - */ > > - ret = -ENOMEM; > > - if (flags & MREMAP_MAYMOVE) { > > - unsigned long map_flags = 0; > > - if (vma->vm_flags & VM_MAYSHARE) > > - map_flags |= MAP_SHARED; > > - > > - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, > > - vma->vm_pgoff + > > - ((addr - vma->vm_start) >> PAGE_SHIFT), > > - map_flags); > > - if (IS_ERR_VALUE(new_addr)) { > > - ret = new_addr; > > - goto out; > > - } > > - > > - ret = move_vma(vma, addr, old_len, new_len, new_addr, > > - &locked, flags, &uf, &uf_unmap); > > - } > > out: > > if (offset_in_page(ret)) > > locked = false; > > - mmap_write_unlock(current->mm); > > + mmap_write_unlock(mm); > > if (locked && new_len > old_len) > > - mm_populate(new_addr + old_len, new_len - old_len); > > + mm_populate(new_addr + old_len, delta); > > out_unlocked: > > userfaultfd_unmap_complete(mm, &uf_unmap_early); > > mremap_userfaultfd_complete(&uf, addr, ret, old_len); > > -- > > 2.48.1 > >