On 10/23/24 22:38, Lorenzo Stoakes wrote: > We have seen bugs and resource leaks arise from the complexity of the > __mmap_region() function. This, and the generally deeply fragile error > handling logic and complexity which makes understanding the function > difficult make it highly desirable to refactor it into something readable. > > Achieve this by separating the function into smaller logical parts which > are easier to understand and follow, and which importantly very > significantly simplify the error handling. > > Note that we now call vms_abort_munmap_vmas() in more error paths than we > used to, however in cases where no abort need occur, vms->nr_pages will be > equal to zero and we simply exit this function without doing more than we > would have done previously. > > Importantly, the invocation of the driver mmap hook via mmap_file() now has > very simple and obvious handling (this was previously the most problematic > part of the mmap() operation). > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> Reviewed-by: Vlastimil Babka <vbabka@xxxxxxx> Some nits: > +static int __mmap_prepare(struct mmap_state *map, struct vma_merge_struct *vmg, > + struct list_head *uf) > { > - struct mm_struct *mm = current->mm; > - struct vm_area_struct *vma = NULL; > - pgoff_t pglen = PHYS_PFN(len); > - unsigned long charged = 0; > - struct vma_munmap_struct vms; > - struct ma_state mas_detach; > - struct maple_tree mt_detach; > - unsigned long end = addr + len; > int error; > - VMA_ITERATOR(vmi, mm, addr); > - VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); > - > - vmg.file = file; > - /* Find the first overlapping VMA */ > - vma = vma_find(&vmi, end); > - init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); > - if (vma) { > - mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); > - mt_on_stack(mt_detach); > - mas_init(&mas_detach, &mt_detach, /* addr = */ 0); > + struct vma_iterator *vmi = map->vmi; > + struct vma_munmap_struct *vms = &map->vms; > + > + /* Find the first overlapping VMA and initialise unmap state. */ > + vms->vma = vma_find(vmi, map->end); > + init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, > + /* unlock = */ false); > + > + /* OK, we have overlapping VMAs - prepare to unmap them. */ > + if (vms->vma) { > + mt_init_flags(&map->mt_detach, > + vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); > + mt_on_stack(map->mt_detach); > + mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); > /* Prepare to unmap any existing mapping in the area */ > - error = vms_gather_munmap_vmas(&vms, &mas_detach); > - if (error) > - goto gather_failed; > + error = vms_gather_munmap_vmas(vms, &map->mas_detach); > + if (error) { > + /* On error VMAs will already have been reattached. */ > + vms->nr_pages = 0; > + return error; > + } > > - vmg.next = vms.next; > - vmg.prev = vms.prev; > - vma = NULL; > + map->next = vms->next; > + map->prev = vms->prev; > } else { > - vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); > + map->next = vma_iter_next_rewind(vmi, &map->prev); > } > > + /* Set up vmg for merge attempt. */ > + vmg->file = map->file; > + vmg->prev = map->prev; > + vmg->next = map->next; It seems arbitrary to do this here. vmg->flags are set in __mmap_region(). Why not all of this? We wouldn't need to pass vmg to __mmap_prepare() at all? Do any of the values chenge between here and returning and vmg needs to capture them as they are now? AFAICS not. > + > /* Check against address space limit. */ > - if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) { > - error = -ENOMEM; > - goto abort_munmap; > - } > + if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) > + return -ENOMEM; > > - /* > - * Private writable mapping: check memory availability > - */ > - if (accountable_mapping(file, vm_flags)) { > - charged = pglen; > - charged -= vms.nr_accounted; > - if (charged) { > - error = security_vm_enough_memory_mm(mm, charged); > + /* Private writable mapping: check memory availability. */ > + if (accountable_mapping(map->file, map->flags)) { > + map->charged = map->pglen; > + map->charged -= vms->nr_accounted; > + if (map->charged) { > + error = security_vm_enough_memory_mm(map->mm, map->charged); > if (error) > - goto abort_munmap; > + return error; > } > > - vms.nr_accounted = 0; > - vm_flags |= VM_ACCOUNT; > - vmg.flags = vm_flags; > + vms->nr_accounted = 0; > + map->flags |= VM_ACCOUNT; > } > > /* > - * clear PTEs while the vma is still in the tree so that rmap > + * Clear PTEs while the vma is still in the tree so that rmap > * cannot race with the freeing later in the truncate scenario. > * This is also needed for mmap_file(), which is why vm_ops > * close function is called. > */ > - vms_clean_up_area(&vms, &mas_detach); > - vma = vma_merge_new_range(&vmg); > - if (vma) > - goto expanded; > + vms_clean_up_area(vms, &map->mas_detach); > + > + return 0; > +} > + <snip> > +static int __mmap_new_vma(struct mmap_state *map, struct vma_merge_struct *vmg, > + struct vm_area_struct **vmap) > +{ > + struct vma_iterator *vmi = map->vmi; > + int error = 0; > + bool merged = false; > + struct vm_area_struct *vma; > + > /* > * Determine the object being mapped and call the appropriate > * specific mapper. the address has already been validated, but > * not unmapped, but the maps are removed from the list. > */ > - vma = vm_area_alloc(mm); > - if (!vma) { > - error = -ENOMEM; > - goto unacct_error; > - } > + vma = vm_area_alloc(map->mm); > + if (!vma) > + return -ENOMEM; > > - vma_iter_config(&vmi, addr, end); > - vma_set_range(vma, addr, end, pgoff); > - vm_flags_init(vma, vm_flags); > - vma->vm_page_prot = vm_get_page_prot(vm_flags); > + vma_iter_config(vmi, map->addr, map->end); > + vma_set_range(vma, map->addr, map->end, map->pgoff); > + vm_flags_init(vma, map->flags); > + vma->vm_page_prot = vm_get_page_prot(map->flags); > > - if (vma_iter_prealloc(&vmi, vma)) { > + if (vma_iter_prealloc(vmi, vma)) { > error = -ENOMEM; > goto free_vma; > } > > - if (file) { > - vma->vm_file = get_file(file); > - error = mmap_file(file, vma); > - if (error) > - goto unmap_and_free_file_vma; > - > - /* Drivers cannot alter the address of the VMA. */ > - WARN_ON_ONCE(addr != vma->vm_start); > - /* > - * Drivers should not permit writability when previously it was > - * disallowed. > - */ > - VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && > - !(vm_flags & VM_MAYWRITE) && > - (vma->vm_flags & VM_MAYWRITE)); > - > - vma_iter_config(&vmi, addr, end); > - /* > - * If vm_flags changed after mmap_file(), we should try merge > - * vma again as we may succeed this time. > - */ > - if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > - struct vm_area_struct *merge; > - > - vmg.flags = vma->vm_flags; > - /* If this fails, state is reset ready for a reattempt. */ > - merge = vma_merge_new_range(&vmg); > - > - if (merge) { > - /* > - * ->mmap() can change vma->vm_file and fput > - * the original file. So fput the vma->vm_file > - * here or we would add an extra fput for file > - * and cause general protection fault > - * ultimately. > - */ > - fput(vma->vm_file); > - vm_area_free(vma); > - vma = merge; > - /* Update vm_flags to pick up the change. */ > - vm_flags = vma->vm_flags; > - goto file_expanded; > - } > - vma_iter_config(&vmi, addr, end); > - } > - > - vm_flags = vma->vm_flags; > - } else if (vm_flags & VM_SHARED) { > + if (map->file) > + error = __mmap_new_file_vma(map, vmg, &vma, &merged); > + else if (map->flags & VM_SHARED) > error = shmem_zero_setup(vma); > - if (error) > - goto free_iter_vma; > - } else { > + else > vma_set_anonymous(vma); > - } > + > + if (error) > + goto free_iter_vma; > + > + if (merged) > + goto file_expanded; > > #ifdef CONFIG_SPARC64 > /* TODO: Fix SPARC ADI! */ > - WARN_ON_ONCE(!arch_validate_flags(vm_flags)); > + WARN_ON_ONCE(!arch_validate_flags(map->flags)); > #endif > > /* Lock the VMA since it is modified after insertion into VMA tree */ > vma_start_write(vma); > - vma_iter_store(&vmi, vma); > - mm->map_count++; > + vma_iter_store(vmi, vma); > + map->mm->map_count++; > vma_link_file(vma); > > /* > * vma_merge_new_range() calls khugepaged_enter_vma() too, the below > * call covers the non-merge case. > */ > - khugepaged_enter_vma(vma, vma->vm_flags); > + khugepaged_enter_vma(vma, map->flags); > > file_expanded: > - file = vma->vm_file; > ksm_add_vma(vma); I'm wondering if this could go to the "non file expanded" region above. If we expanded a vma, it was either in ksm and stays in ksm, or it was not eligible and that couldn't have changed by expanding? > -expanded: > + *vmap = vma; > + return 0; > + > +free_iter_vma: > + vma_iter_free(vmi); > +free_vma: > + vm_area_free(vma); > + return error; > +} > +