Re: [PATCH 5.15.y] mm: resolve faulty mmap_region() error path behaviour

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Nov 14, 2024 at 05:30:32PM +0000, Lorenzo Stoakes wrote:
> The mmap_region() function is somewhat terrifying, with spaghetti-like
> control flow and numerous ways in which issues can arise, leading to
> incomplete state, memory leaks and other unpleasantness.
>
> A large amount of the complexity arises from trying to handle errors late
> in the process of mapping a VMA, which forms the basis of recently
> observed issues with resource leaks and observable inconsistent state.
>
> Taking advantage of previous patches in this series we move a number of
> checks earlier in the code, simplifying things by moving the core of the
> logic into a static internal function __mmap_region().
>
> Doing this allows us to perform a number of checks up front before we do
> any real work, and allows us to unwind the writable unmap check
> unconditionally as required and to perform a CONFIG_DEBUG_VM_MAPLE_TREE
> validation unconditionally also.
>
> We move a number of things here:
>
> 1. We preallocate memory for the iterator before we call the file-backed
>    memory hook, allowing us to exit early and avoid having to perform
>    complicated and error-prone close/free logic. We carefully free
>    iterator state on both success and error paths.
>
> 2. The enclosing mmap_region() function handles the mapping_map_writable()
>    logic early. Previously the logic had the mapping_map_writable() at the
>    point of mapping a newly allocated file-backed VMA, and a matching
>    mapping_unmap_writable() on success and error paths.
>
>    We now do this unconditionally if this is a file-backed, shared writable
>    mapping. A driver may change the flags to eliminate VM_MAYWRITE; however,
>    doing so does not invalidate the seal check we just performed, and we in
>    any case always decrement the counter in the wrapper.
>
>    We perform a debug assert to ensure a driver does not attempt to do the
>    opposite.
>
> 3. We also move arch_validate_flags() up into the mmap_region()
>    function. This is only relevant on arm64 and sparc64, and the check is
>    only meaningful for SPARC with ADI enabled. We explicitly add a warning
>    for this arch if a driver invalidates this check, though the code ought
>    eventually to be fixed to eliminate the need for this.
>
> With all of these measures in place, we no longer need to explicitly close
> the VMA on error paths, as we place all checks which might fail prior to a
> call to any driver mmap hook.
>
> This eliminates an entire class of errors, makes the code easier to reason
> about and more robust.

For the avoidance of doubt: NACK to this patch and to the rest of the
5.15.y series; I will resend.

>
> Link: https://lkml.kernel.org/r/6e0becb36d2f5472053ac5d544c0edfe9b899e25.1730224667.git.lorenzo.stoakes@xxxxxxxxxx
> Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> Reported-by: Jann Horn <jannh@xxxxxxxxxx>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
> Reviewed-by: Vlastimil Babka <vbabka@xxxxxxx>
> Tested-by: Mark Brown <broonie@xxxxxxxxxx>
> Cc: Andreas Larsson <andreas@xxxxxxxxxxx>
> Cc: Catalin Marinas <catalin.marinas@xxxxxxx>
> Cc: David S. Miller <davem@xxxxxxxxxxxxx>
> Cc: Helge Deller <deller@xxxxxx>
> Cc: James E.J. Bottomley <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Cc: Peter Xu <peterx@xxxxxxxxxx>
> Cc: Will Deacon <will@xxxxxxxxxx>
> Cc: <stable@xxxxxxxxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> (cherry picked from commit 5de195060b2e251a835f622759550e6202167641)
> ---
>  mm/mmap.c | 73 +++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 47 insertions(+), 26 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index a766b1c1af32..f8a2f15fc5a2 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1716,7 +1716,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
>  	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
>  }
>
> -unsigned long mmap_region(struct file *file, unsigned long addr,
> +static unsigned long __mmap_region(struct file *file, unsigned long addr,
>  		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
>  		struct list_head *uf)
>  {
> @@ -1780,16 +1780,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	vma->vm_pgoff = pgoff;
>
>  	if (file) {
> -		if (vm_flags & VM_SHARED) {
> -			error = mapping_map_writable(file->f_mapping);
> -			if (error)
> -				goto free_vma;
> -		}
> -
>  		vma->vm_file = get_file(file);
>  		error = mmap_file(file, vma);
>  		if (error)
> -			goto unmap_and_free_vma;
> +			goto unmap_and_free_file_vma;
>
>  		/* Can addr have changed??
>  		 *
> @@ -1800,6 +1794,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		 */
>  		WARN_ON_ONCE(addr != vma->vm_start);
>
> +		/*
> +		 * Drivers should not permit writability when previously it was
> +		 * disallowed.
> +		 */
> +		VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
> +				!(vm_flags & VM_MAYWRITE) &&
> +				(vma->vm_flags & VM_MAYWRITE));
> +
>  		addr = vma->vm_start;
>
>  		/* If vm_flags changed after mmap_file(), we should try merge vma again
> @@ -1818,7 +1820,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  				vma = merge;
>  				/* Update vm_flags to pick up the change. */
>  				vm_flags = vma->vm_flags;
> -				goto unmap_writable;
> +				goto file_expanded;
>  			}
>  		}
>
> @@ -1831,20 +1833,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		vma_set_anonymous(vma);
>  	}
>
> -	/* Allow architectures to sanity-check the vm_flags */
> -	if (!arch_validate_flags(vma->vm_flags)) {
> -		error = -EINVAL;
> -		if (file)
> -			goto close_and_free_vma;
> -		else
> -			goto free_vma;
> -	}
> +#ifdef CONFIG_SPARC64
> +	/* TODO: Fix SPARC ADI! */
> +	WARN_ON_ONCE(!arch_validate_flags(vm_flags));
> +#endif
>
>  	vma_link(mm, vma, prev, rb_link, rb_parent);
> -	/* Once vma denies write, undo our temporary denial count */
> -unmap_writable:
> -	if (file && vm_flags & VM_SHARED)
> -		mapping_unmap_writable(file->f_mapping);
> +file_expanded:
>  	file = vma->vm_file;
>  out:
>  	perf_event_mmap(vma);
> @@ -1875,16 +1870,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
>  	return addr;
>
> -close_and_free_vma:
> -	vma_close(vma);
> -unmap_and_free_vma:
> +unmap_and_free_file_vma:
>  	fput(vma->vm_file);
>  	vma->vm_file = NULL;
>
>  	/* Undo any partial mapping done by a device driver. */
>  	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
> -	if (vm_flags & VM_SHARED)
> -		mapping_unmap_writable(file->f_mapping);
>  free_vma:
>  	vm_area_free(vma);
>  unacct_error:
> @@ -2907,6 +2898,36 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
>  	return __do_munmap(mm, start, len, uf, false);
>  }
>
> +unsigned long mmap_region(struct file *file, unsigned long addr,
> +			  unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> +			  struct list_head *uf)
> +{
> +	unsigned long ret;
> +	bool writable_file_mapping = false;
> +
> +	/* Allow architectures to sanity-check the vm_flags. */
> +	if (!arch_validate_flags(vm_flags))
> +		return -EINVAL;
> +
> +	/* Map writable and ensure this isn't a sealed memfd. */
> +	if (file && (vm_flags & VM_SHARED)) {
> +		int error = mapping_map_writable(file->f_mapping);
> +
> +		if (error)
> +			return error;
> +		writable_file_mapping = true;
> +	}
> +
> +	ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
> +
> +	/* Clear our write mapping regardless of error. */
> +	if (writable_file_mapping)
> +		mapping_unmap_writable(file->f_mapping);
> +
> +	validate_mm(current->mm);
> +	return ret;
> +}
> +
>  static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
>  {
>  	int ret;
> --
> 2.47.0
>




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux