Re: [PATCH v3 13/16] mm/mmap: Avoid zeroing vma tree in mmap_region()

Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> · Mon, 8 Jul 2024 13:18:38 +0100

On Thu, Jul 04, 2024 at 02:27:15PM GMT, Liam R. Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>
>
> Instead of zeroing the vma tree and then overwriting the area, let the
> area be overwritten and then clean up the gathered vmas using
> vms_complete_munmap_vmas().
>
> In the case of a driver mapping over existing vmas, the PTEs are cleared
> using the helper vms_complete_pte_clear().
>
> Temporarily keep track of the number of pages that will be removed and
> reduce the charged amount.
>
> This also drops the validate_mm() call in the vma_expand() function.
> It is necessary to drop the validate as it would fail since the mm
> map_count would be incorrect during a vma expansion, prior to the
> cleanup from vms_complete_munmap_vmas().
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
> ---
>  mm/internal.h |  1 +
>  mm/mmap.c     | 61 ++++++++++++++++++++++++++++++---------------------
>  2 files changed, 37 insertions(+), 25 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 4c9f06669cc4..fae4a1bba732 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
>  	unsigned long stack_vm;
>  	unsigned long data_vm;
>  	bool unlock;			/* Unlock after the munmap */
> +	bool cleared_ptes;		/* If the PTE are cleared already */
>  };
>
>  void __meminit __init_single_page(struct page *page, unsigned long pfn,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5d458c5f080e..0c334eeae8cd 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
>  }
>
>  static unsigned long count_vma_pages_range(struct mm_struct *mm,
> -		unsigned long addr, unsigned long end)
> +		unsigned long addr, unsigned long end,
> +		unsigned long *nr_accounted)
>  {
>  	VMA_ITERATOR(vmi, mm, addr);
>  	struct vm_area_struct *vma;
>  	unsigned long nr_pages = 0;
>
> +	*nr_accounted = 0;
>  	for_each_vma_range(vmi, vma, end) {
>  		unsigned long vm_start = max(addr, vma->vm_start);
>  		unsigned long vm_end = min(end, vma->vm_end);
>
>  		nr_pages += PHYS_PFN(vm_end - vm_start);
> +		if (vma->vm_flags & VM_ACCOUNT)
> +			*nr_accounted += PHYS_PFN(vm_end - vm_start);

We're duplicating the PHYS_PFN(vm_end - vm_start) thing, probably worth
adding something like:

unsigned long num_pages = PHYS_PFN(vm_end - vm_start);

Side-note, but it'd be nice to sort out the inconsistency of PHYS_PFN()
vs. (end - start) >> PAGE_SHIFT. This is probably not a huge deal though...

>  	}
>
>  	return nr_pages;
> @@ -522,6 +526,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
>  	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
>  	vms->unmap_start = FIRST_USER_ADDRESS;
>  	vms->unmap_end = USER_PGTABLES_CEILING;
> +	vms->cleared_ptes = false;
>  }
>
>  /*
> @@ -730,7 +735,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	vma_iter_store(vmi, vma);
>
>  	vma_complete(&vp, vmi, vma->vm_mm);
> -	validate_mm(vma->vm_mm);

Since we're dropping this here, do we need to re-add this back somehwere
where we are confident the state will be consistent?

>  	return 0;
>
>  nomem:
> @@ -2612,6 +2616,9 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
>  {
>  	struct mmu_gather tlb;
>
> +	if (vms->cleared_ptes)
> +		return;
> +
>  	/*
>  	 * We can free page tables without write-locking mmap_lock because VMAs
>  	 * were isolated before we downgraded mmap_lock.
> @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
>  	mas_set(mas_detach, 1);
>  	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
>  	tlb_finish_mmu(&tlb);
> +	vms->cleared_ptes = true;
>  }
>
>  /*
> @@ -2936,24 +2944,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	unsigned long merge_start = addr, merge_end = end;
>  	bool writable_file_mapping = false;
>  	pgoff_t vm_pgoff;
> -	int error;
> +	int error = -ENOMEM;
>  	VMA_ITERATOR(vmi, mm, addr);
> +	unsigned long nr_pages, nr_accounted;
>
> -	/* Check against address space limit. */
> -	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> -		unsigned long nr_pages;
> -
> -		/*
> -		 * MAP_FIXED may remove pages of mappings that intersects with
> -		 * requested mapping. Account for the pages it would unmap.
> -		 */
> -		nr_pages = count_vma_pages_range(mm, addr, end);
> -
> -		if (!may_expand_vm(mm, vm_flags,
> -					(len >> PAGE_SHIFT) - nr_pages))
> -			return -ENOMEM;
> -	}
> +	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
>
> +	/* Check against address space limit. */
> +	/*
> +	 * MAP_FIXED may remove pages of mappings that intersects with requested
> +	 * mapping. Account for the pages it would unmap.
> +	 */

Utter pedantry, but could these comments be combined? Bit ugly to have one
after another like this.

> +	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> +		return -ENOMEM;
>
>  	if (unlikely(!can_modify_mm(mm, addr, end)))
>  		return -EPERM;
> @@ -2971,14 +2974,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		if (vms_gather_munmap_vmas(&vms, &mas_detach))
>  			return -ENOMEM;
>
> -		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> -			return -ENOMEM;
> -
> -		vms_complete_munmap_vmas(&vms, &mas_detach);
>  		next = vms.next;
>  		prev = vms.prev;
>  		vma = NULL;
>  	} else {
> +		/* Minimal setup of vms */
> +		vms.nr_pages = 0;

I'm not a huge fan of having vms be uninitialised other than this field and
then to rely on no further code change accidentally using an uninitialised
field. This is kind of asking for bugs.

Can we not find a way to sensibly initialise it somehow?

>  		next = vma_next(&vmi);
>  		prev = vma_prev(&vmi);
>  		if (prev)
> @@ -2990,8 +2991,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	 */
>  	if (accountable_mapping(file, vm_flags)) {
>  		charged = len >> PAGE_SHIFT;
> +		charged -= nr_accounted;
>  		if (security_vm_enough_memory_mm(mm, charged))
> -			return -ENOMEM;
> +			goto abort_munmap;
> +		vms.nr_accounted = 0;

This is kind of expanding the 'vms possibly unitialised apart from selected
fields' pattern, makes me worry.

>  		vm_flags |= VM_ACCOUNT;
>  	}
>
> @@ -3040,10 +3043,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	 * not unmapped, but the maps are removed from the list.
>  	 */
>  	vma = vm_area_alloc(mm);
> -	if (!vma) {
> -		error = -ENOMEM;
> +	if (!vma)
>  		goto unacct_error;
> -	}
>
>  	vma_iter_config(&vmi, addr, end);
>  	vma_set_range(vma, addr, end, pgoff);
> @@ -3052,6 +3053,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
>  	if (file) {
>  		vma->vm_file = get_file(file);
> +		/* call_mmap() map PTE, so ensure there are no existing PTEs */

Typo? Should this be 'call_mmap() maps PTEs, so ensure there are no
existing PTEs'? I feel like this could be reworded something like:

'call_map() may map PTEs, so clear any that may be pending unmap ahead of
time.'

> +		if (vms.nr_pages)
> +			vms_complete_pte_clear(&vms, &mas_detach, true);
>  		error = call_mmap(file, vma);
>  		if (error)
>  			goto unmap_and_free_vma;
> @@ -3142,6 +3146,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  expanded:
>  	perf_event_mmap(vma);
>
> +	if (vms.nr_pages)
> +		vms_complete_munmap_vmas(&vms, &mas_detach);
> +

Hang on, if we already did this in the if (file) branch above, might we end
up calling this twice? I didn't see vms.nr_pages get set to zero or
decremented anywhere (unless I missed it)?

>  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
>  	if (vm_flags & VM_LOCKED) {
>  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> @@ -3189,6 +3196,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  unacct_error:
>  	if (charged)
>  		vm_unacct_memory(charged);
> +
> +abort_munmap:
> +	if (vms.nr_pages)
> +		abort_munmap_vmas(&mas_detach);
>  	validate_mm(mm);
>  	return error;
>  }
> --
> 2.43.0
>

In general I like the approach and you've made it very clear how you've
altered this behaviour.

However I have a few concerns (as well some trivial comments) above. With
those cleared up we'll be good to go!