From: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx> Instead of zeroing the vma tree and then overwriting the area, let the area be overwritten and then clean up the gathered vmas using vms_complete_munmap_vmas(). If a driver is mapping over an existing vma, then clear the ptes before the call_mmap() invocation. If the vma has a vm_ops->close(), then call the close() function. This is done using the vms_clear_ptes() and vms_close_vmas() helpers. This has the side effect of needing to call open() on the vmas if the mmap_region() fails later on. Temporarily keep track of the number of pages that will be removed and reduce the charged amount. This commit drops the validate_mm() call in the vma_expand() function. It is necessary to drop the validate as it would fail since the mm map_count would be incorrect during a vma expansion, prior to the cleanup from vms_complete_munmap_vmas(). Clean up the error handing of the vms_gather_munmap_vmas() by calling the verification within the function. Note that before this change, a MAP_FIXED could fail and leave a gap in the vma tree. With this change, a MAP_FIXED failure will fail without creating a gap and leave *a* vma in the area (may have been split) and attept to restore them to an operational state (re-attached and vm_ops->open()'ed if they were vm_ops->close()'d). Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> --- mm/internal.h | 2 + mm/mmap.c | 119 +++++++++++++++++++++++++++++++------------------- 2 files changed, 76 insertions(+), 45 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index ec8441362c28..5bd60cb9fcbb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1503,6 +1503,8 @@ struct vma_munmap_struct { unsigned long stack_vm; unsigned long data_vm; bool unlock; /* Unlock after the munmap */ + bool clear_ptes; /* If there are outstanding PTE to be cleared */ + bool closed; /* vma->vm_ops->close() called already */ }; void __meminit __init_single_page(struct page *page, unsigned long pfn, diff --git a/mm/mmap.c b/mm/mmap.c index 20da0d039c95..0b7aa2c46cec 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -170,10 +170,11 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) /* * Close a vm structure and free it. */ -static void remove_vma(struct vm_area_struct *vma, bool unreachable) +static +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed) { might_sleep(); - if (vma->vm_ops && vma->vm_ops->close) + if (!closed && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) fput(vma->vm_file); @@ -401,17 +402,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) } static unsigned long count_vma_pages_range(struct mm_struct *mm, - unsigned long addr, unsigned long end) + unsigned long addr, unsigned long end, + unsigned long *nr_accounted) { VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; unsigned long nr_pages = 0; + *nr_accounted = 0; for_each_vma_range(vmi, vma, end) { unsigned long vm_start = max(addr, vma->vm_start); unsigned long vm_end = min(end, vma->vm_end); nr_pages += PHYS_PFN(vm_end - vm_start); + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += PHYS_PFN(vm_end - vm_start); } return nr_pages; @@ -527,6 +532,8 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms, vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; + vms->clear_ptes = false; /* No PTEs to clear yet */ + vms->closed = false; } /* @@ -735,7 +742,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; nomem: @@ -2597,23 +2603,31 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ -static inline void abort_munmap_vmas(struct ma_state *mas_detach) +static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed) { struct vm_area_struct *vma; mas_set(mas_detach, 0); - mas_for_each(mas_detach, vma, ULONG_MAX) + mas_for_each(mas_detach, vma, ULONG_MAX) { + if (closed && vma->vm_ops && vma->vm_ops->close && + vma->vm_ops->open) + vma->vm_ops->open(vma); + vma_mark_detached(vma, false); + } __mt_destroy(mas_detach->tree); } -static void vms_complete_pte_clear(struct vma_munmap_struct *vms, +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; + if (!vms->clear_ptes) + return; + /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. @@ -2629,6 +2643,23 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms, free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); tlb_finish_mmu(&tlb); + vms->clear_ptes = false; +} + +static inline void +vms_close_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + + if (!vms->vma_count) + return; + + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + + vms->closed = true; } /* @@ -2652,7 +2683,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, if (vms->unlock) mmap_write_downgrade(mm); - vms_complete_pte_clear(vms, mas_detach, !vms->unlock); + vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ @@ -2663,7 +2694,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, false); + remove_vma(vma, /* unreachable = */ false, vms->closed); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); @@ -2804,14 +2835,18 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); + /* There are now PTEs that need to be cleared */ + vms->clear_ptes = true; + return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: - abort_munmap_vmas(mas_detach); + abort_munmap_vmas(mas_detach, /* closed = */ false); start_split_failed: map_count_exceeded: + validate_mm(vms->mm); return error; } @@ -2855,9 +2890,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; clear_tree_failed: - abort_munmap_vmas(&mas_detach); -gather_failed: + abort_munmap_vmas(&mas_detach, /* closed = */ false); validate_mm(mm); +gather_failed: return error; } @@ -2945,24 +2980,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; pgoff_t vm_pgoff; - int error; + int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); + unsigned long nr_pages, nr_accounted; - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. - */ - nr_pages = count_vma_pages_range(mm, addr, end); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted); + /* + * Check against address space limit. + * MAP_FIXED may remove pages of mappings that intersects with requested + * mapping. Account for the pages it would unmap. + */ + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; if (unlikely(!can_modify_mm(mm, addr, end))) return -EPERM; @@ -2979,14 +3009,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, mas_init(&mas_detach, &mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ if (vms_gather_munmap_vmas(&vms, &mas_detach)) - goto gather_failed; - - /* Remove any existing mappings from the vma tree */ - if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL)) - goto clear_tree_failed; + return -ENOMEM; - /* Unmap any existing mapping in the area */ - vms_complete_munmap_vmas(&vms, &mas_detach); next = vms.next; prev = vms.prev; vma = NULL; @@ -3002,8 +3026,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; + charged -= nr_accounted; if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; + goto abort_munmap; + vms.nr_accounted = 0; vm_flags |= VM_ACCOUNT; } @@ -3052,10 +3078,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; + if (!vma) goto unacct_error; - } vma_iter_config(&vmi, addr, end); vma_set_range(vma, addr, end, pgoff); @@ -3064,6 +3088,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (file) { vma->vm_file = get_file(file); + /* call_mmap() may map PTE, so ensure there are no existing PTEs */ + vms_clear_ptes(&vms, &mas_detach, /* mm_wr_locked = */ true); + vms_close_vmas(&vms, &mas_detach); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; @@ -3154,6 +3181,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, expanded: perf_event_mmap(vma); + /* Unmap any existing mapping in the area */ + if (vms.nr_pages) + vms_complete_munmap_vmas(&vms, &mas_detach); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || @@ -3201,14 +3232,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); - validate_mm(mm); - return error; -clear_tree_failed: - abort_munmap_vmas(&mas_detach); -gather_failed: +abort_munmap: + if (vms.nr_pages) + abort_munmap_vmas(&mas_detach, vms.closed); validate_mm(mm); - return -ENOMEM; + return error; } static int __vm_munmap(unsigned long start, size_t len, bool unlock) @@ -3549,7 +3578,7 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, true); + remove_vma(vma, /* unreachable = */ true, /* closed = */ false); count++; cond_resched(); vma = vma_next(&vmi); -- 2.43.0