On Thu, Nov 14, 2024 at 05:30:32PM +0000, Lorenzo Stoakes wrote: > The mmap_region() function is somewhat terrifying, with spaghetti-like > control flow and numerous means by which issues can arise and incomplete > state, memory leaks and other unpleasantness can occur. > > A large amount of the complexity arises from trying to handle errors late > in the process of mapping a VMA, which forms the basis of recently > observed issues with resource leaks and observable inconsistent state. > > Taking advantage of previous patches in this series we move a number of > checks earlier in the code, simplifying things by moving the core of the > logic into a static internal function __mmap_region(). > > Doing this allows us to perform a number of checks up front before we do > any real work, and allows us to unwind the writable unmap check > unconditionally as required and to perform a CONFIG_DEBUG_VM_MAPLE_TREE > validation unconditionally also. > > We move a number of things here: > > 1. We preallocate memory for the iterator before we call the file-backed > memory hook, allowing us to exit early and avoid having to perform > complicated and error-prone close/free logic. We carefully free > iterator state on both success and error paths. > > 2. The enclosing mmap_region() function handles the mapping_map_writable() > logic early. Previously the logic had the mapping_map_writable() at the > point of mapping a newly allocated file-backed VMA, and a matching > mapping_unmap_writable() on success and error paths. > > We now do this unconditionally if this is a file-backed, shared writable > mapping. A driver may change the flags to eliminate VM_MAYWRITE; however, > doing so does not invalidate the seal check we just performed, and we in > any case always decrement the counter in the wrapper. > > We perform a debug assert to ensure a driver does not attempt to do the > opposite. > > 3. We also move arch_validate_flags() up into the mmap_region() > function. 
This is only relevant on arm64 and sparc64, and the check is > only meaningful for SPARC with ADI enabled. We explicitly add a warning > for this arch if a driver invalidates this check, though the code ought > eventually to be fixed to eliminate the need for this. > > With all of these measures in place, we no longer need to explicitly close > the VMA on error paths, as we place all checks which might fail prior to a > call to any driver mmap hook. > > This eliminates an entire class of errors, makes the code easier to reason > about and more robust. For the avoidance of doubt: NACK this patch and the rest of the 5.15.y series; I will resend. > > Link: https://lkml.kernel.org/r/6e0becb36d2f5472053ac5d544c0edfe9b899e25.1730224667.git.lorenzo.stoakes@xxxxxxxxxx > Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > Reported-by: Jann Horn <jannh@xxxxxxxxxx> > Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> > Reviewed-by: Vlastimil Babka <vbabka@xxxxxxx> > Tested-by: Mark Brown <broonie@xxxxxxxxxx> > Cc: Andreas Larsson <andreas@xxxxxxxxxxx> > Cc: Catalin Marinas <catalin.marinas@xxxxxxx> > Cc: David S. Miller <davem@xxxxxxxxxxxxx> > Cc: Helge Deller <deller@xxxxxx> > Cc: James E.J. 
Bottomley <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx> > Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> > Cc: Peter Xu <peterx@xxxxxxxxxx> > Cc: Will Deacon <will@xxxxxxxxxx> > Cc: <stable@xxxxxxxxxxxxxxx> > Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > (cherry picked from commit 5de195060b2e251a835f622759550e6202167641) > --- > mm/mmap.c | 73 +++++++++++++++++++++++++++++++++++-------------------- > 1 file changed, 47 insertions(+), 26 deletions(-) > > diff --git a/mm/mmap.c b/mm/mmap.c > index a766b1c1af32..f8a2f15fc5a2 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1716,7 +1716,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) > return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; > } > > -unsigned long mmap_region(struct file *file, unsigned long addr, > +static unsigned long __mmap_region(struct file *file, unsigned long addr, > unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, > struct list_head *uf) > { > @@ -1780,16 +1780,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > vma->vm_pgoff = pgoff; > > if (file) { > - if (vm_flags & VM_SHARED) { > - error = mapping_map_writable(file->f_mapping); > - if (error) > - goto free_vma; > - } > - > vma->vm_file = get_file(file); > error = mmap_file(file, vma); > if (error) > - goto unmap_and_free_vma; > + goto unmap_and_free_file_vma; > > /* Can addr have changed?? > * > @@ -1800,6 +1794,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > */ > WARN_ON_ONCE(addr != vma->vm_start); > > + /* > + * Drivers should not permit writability when previously it was > + * disallowed. 
> + */ > + VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && > + !(vm_flags & VM_MAYWRITE) && > + (vma->vm_flags & VM_MAYWRITE)); > + > addr = vma->vm_start; > > /* If vm_flags changed after mmap_file(), we should try merge vma again > @@ -1818,7 +1820,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > vma = merge; > /* Update vm_flags to pick up the change. */ > vm_flags = vma->vm_flags; > - goto unmap_writable; > + goto file_expanded; > } > } > > @@ -1831,20 +1833,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > vma_set_anonymous(vma); > } > > - /* Allow architectures to sanity-check the vm_flags */ > - if (!arch_validate_flags(vma->vm_flags)) { > - error = -EINVAL; > - if (file) > - goto close_and_free_vma; > - else > - goto free_vma; > - } > +#ifdef CONFIG_SPARC64 > + /* TODO: Fix SPARC ADI! */ > + WARN_ON_ONCE(!arch_validate_flags(vm_flags)); > +#endif > > vma_link(mm, vma, prev, rb_link, rb_parent); > - /* Once vma denies write, undo our temporary denial count */ > -unmap_writable: > - if (file && vm_flags & VM_SHARED) > - mapping_unmap_writable(file->f_mapping); > +file_expanded: > file = vma->vm_file; > out: > perf_event_mmap(vma); > @@ -1875,16 +1870,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > return addr; > > -close_and_free_vma: > - vma_close(vma); > -unmap_and_free_vma: > +unmap_and_free_file_vma: > fput(vma->vm_file); > vma->vm_file = NULL; > > /* Undo any partial mapping done by a device driver. 
*/ > unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); > - if (vm_flags & VM_SHARED) > - mapping_unmap_writable(file->f_mapping); > free_vma: > vm_area_free(vma); > unacct_error: > @@ -2907,6 +2898,36 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, > return __do_munmap(mm, start, len, uf, false); > } > > +unsigned long mmap_region(struct file *file, unsigned long addr, > + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, > + struct list_head *uf) > +{ > + unsigned long ret; > + bool writable_file_mapping = false; > + > + /* Allow architectures to sanity-check the vm_flags. */ > + if (!arch_validate_flags(vm_flags)) > + return -EINVAL; > + > + /* Map writable and ensure this isn't a sealed memfd. */ > + if (file && (vm_flags & VM_SHARED)) { > + int error = mapping_map_writable(file->f_mapping); > + > + if (error) > + return error; > + writable_file_mapping = true; > + } > + > + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); > + > + /* Clear our write mapping regardless of error. */ > + if (writable_file_mapping) > + mapping_unmap_writable(file->f_mapping); > + > + validate_mm(current->mm); > + return ret; > +} > + > static int __vm_munmap(unsigned long start, size_t len, bool downgrade) > { > int ret; > -- > 2.47.0 >