I did some experiments on the rss counter bug. The next patch is made for linux-next-20240613 with commit 1c29a32ce65f4cd0f1c0f9 reverted. Then I simply inlined the code of do_vmi_unmap() and do_vmi_align_munmap() into mmap_region(). This version of the code works fine and does not show the rss counter bug. diff --git a/mm/mmap.c b/mm/mmap.c index f95af72ddc9f..0f020c535c83 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -733,7 +733,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; nomem: @@ -2911,6 +2910,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct vm_area_struct *next, *prev, *merge; pgoff_t pglen = len >> PAGE_SHIFT; unsigned long charged = 0; + struct vma_munmap_struct vms; + struct ma_state mas_detach; unsigned long end = addr + len; unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; @@ -2933,12 +2934,46 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Unmap any existing mapping in the area */ - error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); - if (error == -EPERM) - return error; - else if (error) - return -ENOMEM; + /* Find the first overlapping VMA */ + vma = vma_find(&vmi, end); + if (vma) { + struct maple_tree mt_detach; + + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, addr, end))) { + return -EPERM; + } + + /* arch_unmap() might do unmaps itself. */ + arch_unmap(mm, addr, end); + + mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + mas_init(&mas_detach, &mt_detach, 0); + + init_vma_munmap(&vms, &vmi, vma, addr, end, uf, false); + error = vms_gather_munmap_vmas(&vms, &mas_detach); + if (error) { + validate_mm(mm); + return -ENOMEM; + } + + vma = NULL; + error = vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL); + if (error) { + abort_munmap_vmas(&mas_detach); + return -ENOMEM; + } + + /* Point of no return */ + vms_complete_munmap_vmas(&vms, &mas_detach); + } else { + // TODO + } /* * Private writable mapping: check memory availability The next patch now moves the call to vms_complete_munmap_vmas() towards the end of mmap_region(). This code is also free of the rss counter bug. commit a4b24bb18dde627792297455befcc465e45be66d Author: Bert Karwatzki <spasswolf@xxxxxx> Date: Thu Jun 20 17:02:08 2024 +0200 mm: mmap: push back vms_complete_munmap_vmas() In order to to debug the rss counter bug we're going to push back vms_complete_munmap_vmas() in mmap_region. Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx> diff --git a/mm/mmap.c b/mm/mmap.c index 0f020c535c83..4fb9dd2e6d6e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2970,9 +2970,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Point of no return */ - vms_complete_munmap_vmas(&vms, &mas_detach); } else { - // TODO + vms.end = 0; + vms.nr_pages = 0; } /* @@ -3016,6 +3016,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_next_range(&vmi); } + if (vms.end) { + vms_complete_munmap_vmas(&vms, &mas_detach); + vms.end = 0; // avoid double unmap below + } + /* Actually expand, if possible */ if (vma && !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { @@ -3026,7 +3031,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (vma == prev) vma_iter_set(&vmi, addr); cannot_expand: - + if (vms.end) + vms_complete_munmap_vmas(&vms, &mas_detach); /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but The next patch move vms_complete_munmap_vmas() a little further beyond the call to vma_expand(). This code contain the rss counter bug. commit 02d6be2410fa503d008f4cc8dcd1518ca56f8793 Author: Bert Karwatzki <spasswolf@xxxxxx> Date: Thu Jun 20 20:07:13 2024 +0200 mm: mmap: push back vms_complete_munmap_vmas() This commit actually show the rss counter bug, while the previus does not! Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx> diff --git a/mm/mmap.c b/mm/mmap.c index 4fb9dd2e6d6e..c5f4b4b6fb84 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3016,14 +3016,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_next_range(&vmi); } - if (vms.end) { - vms_complete_munmap_vmas(&vms, &mas_detach); - vms.end = 0; // avoid double unmap below - } - /* Actually expand, if possible */ if (vma && !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { + if (vms.end) { + vms_complete_munmap_vmas(&vms, &mas_detach); + } khugepaged_enter_vma(vma, vm_flags); goto expanded; } So there might be some unwanted interaction between vms_complete_munmap_vmas though I've no yet figured out what exactly is happening. Hope this will be helpful in solving the problem. Bert Karwatzki