Re: [PATCH] mm: remove unnecessary page_table_lock on stack expansion

* Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> [241101 14:46]:
> Ever since commit 8d7071af8907 ("mm: always expand the stack with the mmap
> write lock held") we have been expanding the stack with the mmap write lock
> held.
> 
> This is true in all code paths:
> 
> get_arg_page()
>   -> expand_downwards()
> setup_arg_pages()
>   -> expand_stack_locked()
>     -> expand_downwards() / expand_upwards()
> lock_mm_and_find_vma()
>   -> expand_stack_locked()
>     -> expand_downwards() / expand_upwards()
> create_elf_tables()
>   -> find_extend_vma_locked()
>     -> expand_stack_locked()
> expand_stack()
>   -> vma_expand_down()
>     -> expand_downwards()
> expand_stack()
>   -> vma_expand_up()
>     -> expand_upwards()
> 
> Each of which acquires the mmap write lock before doing so. Despite this, we
> maintain code that acquires a page table lock in the expand_upwards() and
> expand_downwards() code, stating that we hold a shared mmap lock and thus
> this is necessary.
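> 
> For illustration, each of these paths reduces to the same caller pattern
> (a simplified sketch of the downward case, not verbatim kernel code;
> expand_upwards() is analogous):
> 
> 	mmap_write_lock(mm);			/* exclusive, not shared */
> 	vma = vma_lookup(mm, addr);
> 	if (vma && addr < vma->vm_start)
> 		error = expand_downwards(vma, addr);
> 	mmap_write_unlock(mm);
> 
> This is the invariant that the mmap_assert_write_locked() calls added
> below encode.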
> 
> That lock is not necessary; we do not have to worry about concurrent VMA
> expansions, so we can simply drop it and update the comments accordingly.
> 
> We do not even need to be concerned with racing page faults, as
> vma_start_write() is invoked in both cases.
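> 
> Concretely (an illustrative sketch only, not the patch itself), the
> resulting ordering inside expand_upwards()/expand_downwards() is:
> 
> 	mmap_assert_write_locked(mm);	/* added by this patch */
> 	vma_start_write(vma);		/* excludes per-VMA-lock faults */
> 	anon_vma_lock_write(vma->anon_vma);
> 	/* ...update vm_start/vm_end, vm_pgoff and the maple tree... */
> 	anon_vma_unlock_write(vma->anon_vma);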
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>

Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>

> ---
>  mm/mmap.c | 38 ++++++--------------------------------
>  1 file changed, 6 insertions(+), 32 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f904b3bba962..386429f7db5a 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1039,6 +1039,8 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  	if (!(vma->vm_flags & VM_GROWSUP))
>  		return -EFAULT;
>  
> +	mmap_assert_write_locked(mm);
> +
>  	/* Guard against exceeding limits of the address space. */
>  	address &= PAGE_MASK;
>  	if (address >= (TASK_SIZE & PAGE_MASK))
> @@ -1074,11 +1076,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  
>  	/* Lock the VMA before expanding to prevent concurrent page faults */
>  	vma_start_write(vma);
> -	/*
> -	 * vma->vm_start/vm_end cannot change under us because the caller
> -	 * is required to hold the mmap_lock in read mode.  We need the
> -	 * anon_vma lock to serialize against concurrent expand_stacks.
> -	 */
> +	/* We update the anon VMA tree. */
>  	anon_vma_lock_write(vma->anon_vma);
>  
>  	/* Somebody else might have raced and expanded it already */
> @@ -1092,16 +1090,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
>  			error = acct_stack_growth(vma, size, grow);
>  			if (!error) {
> -				/*
> -				 * We only hold a shared mmap_lock lock here, so
> -				 * we need to protect against concurrent vma
> -				 * expansions.  anon_vma_lock_write() doesn't
> -				 * help here, as we don't guarantee that all
> -				 * growable vmas in a mm share the same root
> -				 * anon vma.  So, we reuse mm->page_table_lock
> -				 * to guard against concurrent vma expansions.
> -				 */
> -				spin_lock(&mm->page_table_lock);
>  				if (vma->vm_flags & VM_LOCKED)
>  					mm->locked_vm += grow;
>  				vm_stat_account(mm, vma->vm_flags, grow);
> @@ -1110,7 +1098,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  				/* Overwrite old entry in mtree. */
>  				vma_iter_store(&vmi, vma);
>  				anon_vma_interval_tree_post_update_vma(vma);
> -				spin_unlock(&mm->page_table_lock);
>  
>  				perf_event_mmap(vma);
>  			}
> @@ -1137,6 +1124,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>  	if (!(vma->vm_flags & VM_GROWSDOWN))
>  		return -EFAULT;
>  
> +	mmap_assert_write_locked(mm);
> +
>  	address &= PAGE_MASK;
>  	if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
>  		return -EPERM;
> @@ -1166,11 +1155,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>  
>  	/* Lock the VMA before expanding to prevent concurrent page faults */
>  	vma_start_write(vma);
> -	/*
> -	 * vma->vm_start/vm_end cannot change under us because the caller
> -	 * is required to hold the mmap_lock in read mode.  We need the
> -	 * anon_vma lock to serialize against concurrent expand_stacks.
> -	 */
> +	/* We update the anon VMA tree. */
>  	anon_vma_lock_write(vma->anon_vma);
>  
>  	/* Somebody else might have raced and expanded it already */
> @@ -1184,16 +1169,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>  		if (grow <= vma->vm_pgoff) {
>  			error = acct_stack_growth(vma, size, grow);
>  			if (!error) {
> -				/*
> -				 * We only hold a shared mmap_lock lock here, so
> -				 * we need to protect against concurrent vma
> -				 * expansions.  anon_vma_lock_write() doesn't
> -				 * help here, as we don't guarantee that all
> -				 * growable vmas in a mm share the same root
> -				 * anon vma.  So, we reuse mm->page_table_lock
> -				 * to guard against concurrent vma expansions.
> -				 */
> -				spin_lock(&mm->page_table_lock);
>  				if (vma->vm_flags & VM_LOCKED)
>  					mm->locked_vm += grow;
>  				vm_stat_account(mm, vma->vm_flags, grow);
> @@ -1203,7 +1178,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>  				/* Overwrite old entry in mtree. */
>  				vma_iter_store(&vmi, vma);
>  				anon_vma_interval_tree_post_update_vma(vma);
> -				spin_unlock(&mm->page_table_lock);
>  
>  				perf_event_mmap(vma);
>  			}
> -- 
> 2.47.0
> 



