* jeffxu@xxxxxxxxxxxx <jeffxu@xxxxxxxxxxxx> [240122 10:29]:
> From: Jeff Xu <jeffxu@xxxxxxxxxxxx>
>
> The new mseal() is a syscall on 64-bit CPUs, with the following
> signature:
>
> int mseal(void *addr, size_t len, unsigned long flags)
> addr/len: memory range.
> flags: reserved.
>
> mseal() blocks the following operations for the given memory range:
>
> 1> Unmapping, moving to another location, and shrinking the size,
> via munmap() and mremap(); these can leave an empty space which can
> then be filled by a VMA with a new set of attributes.
>
> 2> Moving or expanding a different VMA into the current location,
> via mremap().
>
> 3> Modifying a VMA via mmap(MAP_FIXED).
>
> 4> Size expansion via mremap(). This does not appear to pose any
> specific risk to sealed VMAs, but is included anyway because the use
> case is unclear. In any case, users can rely on merging to expand a
> sealed VMA.
>
> 5> mprotect() and pkey_mprotect().
>
> 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for
> anonymous memory, when the user does not have write permission to the
> memory. These behaviors can alter region contents by discarding pages,
> effectively a memset(0) for anonymous memory.
>
> In addition, mmap() has two related changes:
>
> The PROT_SEAL bit in the prot field of mmap(). When present, it marks
> the mapping as sealed since creation.
>
> The MAP_SEALABLE bit in the flags field of mmap(). When present, it
> marks the mapping as sealable. A mapping created without MAP_SEALABLE
> will not support sealing, i.e. mseal() will fail.
>
> Applications that don't care about sealing will see their behavior
> unchanged. Those that need sealing support opt in by adding
> MAP_SEALABLE to mmap().
>
> I would like to formally acknowledge the valuable contributions
> received during the RFC process, which were instrumental in shaping
> this patch:
>
> Jann Horn: raising awareness and providing valuable insights on the
> destructive madvise operations.
> Linus Torvalds: assisting in defining the system call signature and
> scope.
> Pedro Falcato: suggesting sealing support in mmap().
> Theo de Raadt: sharing the experience and insights gained from
> implementing mimmutable() in OpenBSD.
>
> Finally, the idea that inspired this patch comes from Stephen Röttger's
> work on Chrome V8 CFI.
>
> Signed-off-by: Jeff Xu <jeffxu@xxxxxxxxxxxx>
> ---
>  include/linux/mm.h                     |  48 ++++
>  include/linux/syscalls.h               |   1 +
>  include/uapi/asm-generic/mman-common.h |   8 +
>  mm/Makefile                            |   4 +
>  mm/madvise.c                           |  12 +
>  mm/mmap.c                              |  27 ++
>  mm/mprotect.c                          |  10 +
>  mm/mremap.c                            |  31 +++
>  mm/mseal.c                             | 343 +++++++++++++++++++++++++
>  9 files changed, 484 insertions(+)
>  create mode 100644 mm/mseal.c
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..bdd9a53e9291 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h

None of this can live in mm/internal.h ?
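Something along these lines is what I have in mind; just a rough,
untested sketch of moving the non-inline declarations out of the public
header (the names are copied from the declarations this patch adds
below, and all their callers in the series appear to be mm/ code):

	/* mm/internal.h (sketch only, not part of the posted patch) */
	#ifdef CONFIG_64BIT
	bool can_modify_mm(struct mm_struct *mm, unsigned long start,
			   unsigned long end);
	bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
				unsigned long end, int behavior);
	unsigned long get_mmap_seals(unsigned long prot, unsigned long flags);
	#else
	static inline bool can_modify_mm(struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
	{
		return true;
	}

	static inline bool can_modify_mm_madv(struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end, int behavior)
	{
		return true;
	}

	static inline unsigned long get_mmap_seals(unsigned long prot,
						   unsigned long flags)
	{
		return 0;
	}
	#endif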
> @@ -328,6 +328,14 @@ extern unsigned int kobjsize(const void *objp);
>  #define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
>  #endif	/* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>
> +#ifdef CONFIG_64BIT
> +/* VM is sealable, in vm_flags */
> +#define VM_SEALABLE	_BITUL(63)
> +
> +/* VM is sealed, in vm_flags */
> +#define VM_SEALED	_BITUL(62)
> +#endif
> +
>  #ifdef CONFIG_ARCH_HAS_PKEYS
>  # define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
>  # define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
> @@ -4182,4 +4190,44 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
>  	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
>  }
>
> +#ifdef CONFIG_64BIT
> +static inline int can_do_mseal(unsigned long flags)
> +{
> +	if (flags)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +bool can_modify_mm(struct mm_struct *mm, unsigned long start,
> +		unsigned long end);
> +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> +		unsigned long end, int behavior);
> +unsigned long get_mmap_seals(unsigned long prot,
> +		unsigned long flags);
> +#else
> +static inline int can_do_mseal(unsigned long flags)
> +{
> +	return -EPERM;
> +}
> +
> +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
> +		unsigned long end)
> +{
> +	return true;
> +}
> +
> +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> +		unsigned long end, int behavior)
> +{
> +	return true;
> +}
> +
> +static inline unsigned long get_mmap_seals(unsigned long prot,
> +		unsigned long flags)
> +{
> +	return 0;
> +}
> +#endif
> +
>  #endif /* _LINUX_MM_H */

...

> diff --git a/mm/mmap.c b/mm/mmap.c
> index b78e83d351d2..32bc2179aed0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1213,6 +1213,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  {
>  	struct mm_struct *mm = current->mm;
>  	int pkey = 0;
> +	unsigned long vm_seals;
>
>  	*populate = 0;
>
> @@ -1233,6 +1234,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	if (flags & MAP_FIXED_NOREPLACE)
>  		flags |= MAP_FIXED;
>
> +	vm_seals = get_mmap_seals(prot, flags);
> +
>  	if (!(flags & MAP_FIXED))
>  		addr = round_hint_to_min(addr);
>
> @@ -1261,6 +1264,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  			return -EEXIST;
>  	}
>
> +	/*
> +	 * Check if the address range is sealed for do_mmap().
> +	 * can_modify_mm assumes we have acquired the lock on MM.
> +	 */
> +	if (!can_modify_mm(mm, addr, addr + len))
> +		return -EPERM;
> +

This is called after get_unmapped_area(), so this area is either going
to be MAP_FIXED and return the "hint" addr, or it's going to be empty.

You can probably avoid walking the VMAs in the non-FIXED case.  This
would remove the overhead of your check in the most common case.
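Something like this is what I mean (a rough sketch, untested; note that
MAP_FIXED_NOREPLACE has already been folded into MAP_FIXED earlier in
do_mmap(), so the guard covers both):

	/*
	 * Only a MAP_FIXED request can land on top of existing (and
	 * possibly sealed) VMAs; otherwise get_unmapped_area() returned
	 * an empty range, so skip the VMA walk in the common case.
	 */
	if ((flags & MAP_FIXED) && !can_modify_mm(mm, addr, addr + len))
		return -EPERM;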
>  	if (prot == PROT_EXEC) {
>  		pkey = execute_only_pkey(mm);
>  		if (pkey < 0)
> @@ -1376,6 +1386,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  		vm_flags |= VM_NORESERVE;
>  	}
>
> +	vm_flags |= vm_seals;
>  	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
>  	if (!IS_ERR_VALUE(addr) &&
>  	    ((vm_flags & VM_LOCKED) ||
> @@ -2679,6 +2690,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  	if (end == start)
>  		return -EINVAL;
>
> +	/*
> +	 * Check if memory is sealed before arch_unmap.
> +	 * Prevent unmapping a sealed VMA.
> +	 * can_modify_mm assumes we have acquired the lock on MM.
> +	 */
> +	if (!can_modify_mm(mm, start, end))
> +		return -EPERM;
> +

This function is currently called from mmap_region(), so we are going
to run this check twice as you have it; once in do_mmap(), then again
in mmap_region() -> do_vmi_munmap().  This effectively doubles your
impact to MAP_FIXED calls.

>  	/* arch_unmap() might do unmaps itself.  */
>  	arch_unmap(mm, start, end);
>
> @@ -3102,6 +3121,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>
> +	/*
> +	 * Check if memory is sealed before arch_unmap.
> +	 * Prevent unmapping a sealed VMA.
> +	 * can_modify_mm assumes we have acquired the lock on MM.
> +	 */
> +	if (!can_modify_mm(mm, start, end))
> +		return -EPERM;
> +

I am sure you've looked at the callers, but from what I found there are
two:

The brk call uses this function, so it may check more than one VMA in
that path.  Will the brk VMAs potentially be msealed?  I guess someone
could do that?

The other place this is used is in ipc/shm.c, where the start/end is
just the vma start/end, so we only really need to check that one VMA.
Is there a way to avoid walking the tree for the single known VMA?

Does it make sense to deny mseal() on brk VMAs?

>  	arch_unmap(mm, start, end);
>  	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
>  }

...

Ah, I see them now.  Yes, this is what I expected to see.  Does this
not have any impact on mmap/munmap benchmarks?

> +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
> +{
> +	struct vm_area_struct *vma;
> +
> +	VMA_ITERATOR(vmi, mm, start);
> +
> +	/* going through each vma to check. */
> +	for_each_vma_range(vmi, vma, end) {
> +		if (!can_modify_vma(vma))
> +			return false;
> +	}
> +
> +	/* Allow by default. */
> +	return true;
> +}
> +
> +/*
> + * Check if the VMAs of a memory range are allowed to be modified by madvise.
> + * The memory range can have a gap (unallocated memory).
> + * Return true if it is allowed.
> + */
> +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
> +		int behavior)
> +{
> +	struct vm_area_struct *vma;
> +
> +	VMA_ITERATOR(vmi, mm, start);
> +
> +	if (!is_madv_discard(behavior))
> +		return true;
> +
> +	/* going through each vma to check. */
> +	for_each_vma_range(vmi, vma, end)
> +		if (is_ro_anon(vma) && !can_modify_vma(vma))
> +			return false;
> +
> +	/* Allow by default. */
> +	return true;
> +}
> +

...

> +static int check_mm_seal(unsigned long start, unsigned long end)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned long nstart = start;
> +
> +	VMA_ITERATOR(vmi, current->mm, start);
> +
> +	/* going through each vma to check. */
> +	for_each_vma_range(vmi, vma, end) {
> +		if (vma->vm_start > nstart)
> +			/* unallocated memory found. */
> +			return -ENOMEM;

Ah, another potential user for a contiguous iterator of VMAs.

> +
> +		if (!can_add_vma_seal(vma))
> +			return -EACCES;
> +
> +		if (vma->vm_end >= end)
> +			return 0;
> +
> +		nstart = vma->vm_end;
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +/*
> + * Apply sealing.
> + */
> +static int apply_mm_seal(unsigned long start, unsigned long end)
> +{
> +	unsigned long nstart;
> +	struct vm_area_struct *vma, *prev;
> +
> +	VMA_ITERATOR(vmi, current->mm, start);
> +
> +	vma = vma_iter_load(&vmi);
> +	/*
> +	 * Note: check_mm_seal() should have already checked the ENOMEM case,
> +	 * so vma should not be NULL; same for the other ENOMEM cases.

The start to end is contiguous, right?
> +	 */
> +	prev = vma_prev(&vmi);
> +	if (start > vma->vm_start)
> +		prev = vma;
> +
> +	nstart = start;
> +	for_each_vma_range(vmi, vma, end) {
> +		int error;
> +		unsigned long tmp;
> +		vm_flags_t newflags;
> +
> +		newflags = vma->vm_flags | VM_SEALED;
> +		tmp = vma->vm_end;
> +		if (tmp > end)
> +			tmp = end;
> +		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
> +		if (error)
> +			return error;
> +		tmp = vma_iter_end(&vmi);
> +		nstart = tmp;

tmp is unconditionally reassigned to vma->vm_end at the top of the next
iteration before it is used again, so the round-trip through tmp here
isn't needed; you can just set nstart = vma_iter_end(&vmi) directly.

But, also, we know the VMAs are contiguous from your check_mm_seal()
call, so nstart == vma->vm_start on the next loop.

...
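To be concrete about the nstart point above, something like this (a
rough sketch, untested) is roughly what I'd expect the loop to reduce
to:

	for_each_vma_range(vmi, vma, end) {
		vm_flags_t newflags = vma->vm_flags | VM_SEALED;
		int error;

		/*
		 * check_mm_seal() already ensured the range is contiguous,
		 * so each VMA's own boundaries, clamped to [start, end),
		 * give the fixup range without tracking nstart or tmp.
		 */
		error = mseal_fixup(&vmi, vma, &prev,
				    max(vma->vm_start, start),
				    min(vma->vm_end, end), newflags);
		if (error)
			return error;
	}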