* jeffxu@xxxxxxxxxxxx <jeffxu@xxxxxxxxxxxx> [240122 10:29]:
> From: Jeff Xu <jeffxu@xxxxxxxxxxxx>
>
> The new mseal() is a syscall on 64-bit CPUs, with the following
> signature:
>
> int mseal(void *addr, size_t len, unsigned long flags)
> addr/len: memory range.
> flags: reserved.
>
> mseal() blocks the following operations for the given memory range:
>
> 1> Unmapping, moving to another location, and shrinking the size,
> via munmap() and mremap(); these can leave an empty space which can
> then be filled by a VMA with a new set of attributes.
>
> 2> Moving or expanding a different VMA into the current location,
> via mremap().
>
> 3> Modifying a VMA via mmap(MAP_FIXED).
>
> 4> Size expansion via mremap(). This does not appear to pose any
> specific risk to sealed VMAs, but is included anyway because the use
> case is unclear. In any case, users can rely on merging to expand a
> sealed VMA.
>
> 5> mprotect() and pkey_mprotect().
>
> 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for
> anonymous memory, when the user does not have write permission to the
> memory. These behaviors can alter region contents by discarding pages,
> effectively a memset(0) for anonymous memory.
>
> In addition, mmap() has two related changes:
>
> The PROT_SEAL bit in the prot field of mmap(). When present, it marks
> the mapping as sealed since creation.
>
> The MAP_SEALABLE bit in the flags field of mmap(). When present, it
> marks the mapping as sealable. A mapping created without MAP_SEALABLE
> will not support sealing, i.e. mseal() will fail.
>
> Applications that don't care about sealing will see their behavior
> unchanged. Those that need sealing support opt in by adding
> MAP_SEALABLE to mmap().
>
> I would like to formally acknowledge the valuable contributions
> received during the RFC process, which were instrumental in shaping
> this patch:
>
> Jann Horn: raising awareness and providing valuable insights on the
> destructive madvise operations.
> Linus Torvalds: assisting in defining the system call signature and
> scope.
> Pedro Falcato: suggesting sealing support in mmap().
> Theo de Raadt: sharing the experience and insights gained from
> implementing mimmutable() in OpenBSD.
>
> Finally, the idea that inspired this patch comes from Stephen Röttger's
> work on Chrome V8 CFI.
>
> Signed-off-by: Jeff Xu <jeffxu@xxxxxxxxxxxx>
> ---
>  include/linux/mm.h                     |  48 ++++
>  include/linux/syscalls.h               |   1 +
>  include/uapi/asm-generic/mman-common.h |   8 +
>  mm/Makefile                            |   4 +
>  mm/madvise.c                           |  12 +
>  mm/mmap.c                              |  27 ++
>  mm/mprotect.c                          |  10 +
>  mm/mremap.c                            |  31 +++
>  mm/mseal.c                             | 343 +++++++++++++++++++++++++
>  9 files changed, 484 insertions(+)
>  create mode 100644 mm/mseal.c
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..bdd9a53e9291 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h

None of this can live in mm/internal.h ?
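Something along these lines is what I have in mind; just a rough,
untested sketch of moving the non-inline declarations out of the public
header (the names are copied from the declarations this patch adds
below, and all their callers in the series appear to be mm/ code):

	/* mm/internal.h (sketch only, not part of the posted patch) */
	#ifdef CONFIG_64BIT
	bool can_modify_mm(struct mm_struct *mm, unsigned long start,
			   unsigned long end);
	bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
				unsigned long end, int behavior);
	unsigned long get_mmap_seals(unsigned long prot, unsigned long flags);
	#else
	static inline bool can_modify_mm(struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
	{
		return true;
	}

	static inline bool can_modify_mm_madv(struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end, int behavior)
	{
		return true;
	}

	static inline unsigned long get_mmap_seals(unsigned long prot,
						   unsigned long flags)
	{
		return 0;
	}
	#endif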
> @@ -328,6 +328,14 @@ extern unsigned int kobjsize(const void *objp);
>  #define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
>  #endif	/* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>
> +#ifdef CONFIG_64BIT
> +/* VM is sealable, in vm_flags */
> +#define VM_SEALABLE	_BITUL(63)
> +
> +/* VM is sealed, in vm_flags */
> +#define VM_SEALED	_BITUL(62)
> +#endif
> +
>  #ifdef CONFIG_ARCH_HAS_PKEYS
>  # define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
>  # define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
> @@ -4182,4 +4190,44 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
>  	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
>  }
>
> +#ifdef CONFIG_64BIT
> +static inline int can_do_mseal(unsigned long flags)
> +{
> +	if (flags)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +bool can_modify_mm(struct mm_struct *mm, unsigned long start,
> +		unsigned long end);
> +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> +		unsigned long end, int behavior);
> +unsigned long get_mmap_seals(unsigned long prot,
> +		unsigned long flags);
> +#else
> +static inline int can_do_mseal(unsigned long flags)
> +{
> +	return -EPERM;
> +}
> +
> +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
> +		unsigned long end)
> +{
> +	return true;
> +}
> +
> +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> +		unsigned long end, int behavior)
> +{
> +	return true;
> +}
> +
> +static inline unsigned long get_mmap_seals(unsigned long prot,
> +		unsigned long flags)
> +{
> +	return 0;
> +}
> +#endif
> +
>  #endif /* _LINUX_MM_H */

...

> diff --git a/mm/mmap.c b/mm/mmap.c
> index b78e83d351d2..32bc2179aed0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1213,6 +1213,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  {
>  	struct mm_struct *mm = current->mm;
>  	int pkey = 0;
> +	unsigned long vm_seals;
>
>  	*populate = 0;
>
> @@ -1233,6 +1234,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  	if (flags & MAP_FIXED_NOREPLACE)
>  		flags |= MAP_FIXED;
>
> +	vm_seals = get_mmap_seals(prot, flags);
> +
>  	if (!(flags & MAP_FIXED))
>  		addr = round_hint_to_min(addr);
>
> @@ -1261,6 +1264,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  			return -EEXIST;
>  	}
>
> +	/*
> +	 * Check if the address range is sealed for do_mmap().
> +	 * can_modify_mm assumes we have acquired the lock on MM.
> +	 */
> +	if (!can_modify_mm(mm, addr, addr + len))
> +		return -EPERM;
> +

This is called after get_unmapped_area(), so this area is either going
to be MAP_FIXED and return the "hint" addr, or it's going to be empty.

You can probably avoid walking the VMAs in the non-FIXED case.  This
would remove the overhead of your check in the most common case.
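Something like this is what I mean (a rough sketch, untested; note that
MAP_FIXED_NOREPLACE has already been folded into MAP_FIXED earlier in
do_mmap(), so the guard covers both):

	/*
	 * Only a MAP_FIXED request can land on top of existing (and
	 * possibly sealed) VMAs; otherwise get_unmapped_area() returned
	 * an empty range, so skip the VMA walk in the common case.
	 */
	if ((flags & MAP_FIXED) && !can_modify_mm(mm, addr, addr + len))
		return -EPERM;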
>  	if (prot == PROT_EXEC) {
>  		pkey = execute_only_pkey(mm);
>  		if (pkey < 0)
> @@ -1376,6 +1386,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  		vm_flags |= VM_NORESERVE;
>  	}
>
> +	vm_flags |= vm_seals;
>  	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
>  	if (!IS_ERR_VALUE(addr) &&
>  	    ((vm_flags & VM_LOCKED) ||
> @@ -2679,6 +2690,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  	if (end == start)
>  		return -EINVAL;
>
> +	/*
> +	 * Check if memory is sealed before arch_unmap.
> +	 * Prevent unmapping a sealed VMA.
> +	 * can_modify_mm assumes we have acquired the lock on MM.
> +	 */
> +	if (!can_modify_mm(mm, start, end))
> +		return -EPERM;
> +

This function is currently called from mmap_region(), so we are going
to run this check twice as you have it; once in do_mmap(), then again
in mmap_region() -> do_vmi_munmap().  This effectively doubles your
impact to MAP_FIXED calls.

>  	/* arch_unmap() might do unmaps itself.  */
>  	arch_unmap(mm, start, end);
>
> @@ -3102,6 +3121,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>
> +	/*
> +	 * Check if memory is sealed before arch_unmap.
> +	 * Prevent unmapping a sealed VMA.
> +	 * can_modify_mm assumes we have acquired the lock on MM.
> +	 */
> +	if (!can_modify_mm(mm, start, end))
> +		return -EPERM;
> +

I am sure you've looked at the callers, but from what I found there are
two:

The brk call uses this function, so it may check more than one VMA in
that path.  Will the brk VMAs potentially be msealed?  I guess someone
could do that?

The other place this is used is in ipc/shm.c, where the start/end is
just the vma start/end, so we only really need to check that one VMA.
Is there a way to avoid walking the tree for the single known VMA?

Does it make sense to deny mseal() on brk VMAs?

>  	arch_unmap(mm, start, end);
>  	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
>  }

...

Ah, I see them now.  Yes, this is what I expected to see.  Does this
not have any impact on mmap/munmap benchmarks?

> +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
> +{
> +	struct vm_area_struct *vma;
> +
> +	VMA_ITERATOR(vmi, mm, start);
> +
> +	/* going through each vma to check. */
> +	for_each_vma_range(vmi, vma, end) {
> +		if (!can_modify_vma(vma))
> +			return false;
> +	}
> +
> +	/* Allow by default. */
> +	return true;
> +}
> +
> +/*
> + * Check if the VMAs of a memory range are allowed to be modified by madvise.
> + * The memory range can have a gap (unallocated memory).
> + * Return true if it is allowed.
> + */
> +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
> +		int behavior)
> +{
> +	struct vm_area_struct *vma;
> +
> +	VMA_ITERATOR(vmi, mm, start);
> +
> +	if (!is_madv_discard(behavior))
> +		return true;
> +
> +	/* going through each vma to check. */
> +	for_each_vma_range(vmi, vma, end)
> +		if (is_ro_anon(vma) && !can_modify_vma(vma))
> +			return false;
> +
> +	/* Allow by default. */
> +	return true;
> +}
> +

...

> +static int check_mm_seal(unsigned long start, unsigned long end)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned long nstart = start;
> +
> +	VMA_ITERATOR(vmi, current->mm, start);
> +
> +	/* going through each vma to check. */
> +	for_each_vma_range(vmi, vma, end) {
> +		if (vma->vm_start > nstart)
> +			/* unallocated memory found. */
> +			return -ENOMEM;

Ah, another potential user for a contiguous iterator of VMAs.

> +
> +		if (!can_add_vma_seal(vma))
> +			return -EACCES;
> +
> +		if (vma->vm_end >= end)
> +			return 0;
> +
> +		nstart = vma->vm_end;
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +/*
> + * Apply sealing.
> + */
> +static int apply_mm_seal(unsigned long start, unsigned long end)
> +{
> +	unsigned long nstart;
> +	struct vm_area_struct *vma, *prev;
> +
> +	VMA_ITERATOR(vmi, current->mm, start);
> +
> +	vma = vma_iter_load(&vmi);
> +	/*
> +	 * Note: check_mm_seal() should have already checked the ENOMEM case,
> +	 * so vma should not be NULL; same for the other ENOMEM cases.

The start to end is contiguous, right?
> +	 */
> +	prev = vma_prev(&vmi);
> +	if (start > vma->vm_start)
> +		prev = vma;
> +
> +	nstart = start;
> +	for_each_vma_range(vmi, vma, end) {
> +		int error;
> +		unsigned long tmp;
> +		vm_flags_t newflags;
> +
> +		newflags = vma->vm_flags | VM_SEALED;
> +		tmp = vma->vm_end;
> +		if (tmp > end)
> +			tmp = end;
> +		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
> +		if (error)
> +			return error;
> +		tmp = vma_iter_end(&vmi);
> +		nstart = tmp;

tmp is unconditionally reassigned to vma->vm_end at the top of the next
iteration before it is used again, so the round-trip through tmp here
isn't needed; you can just set nstart = vma_iter_end(&vmi) directly.

But, also, we know the VMAs are contiguous from your check_mm_seal()
call, so nstart == vma->vm_start on the next loop.

...
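To be concrete about the nstart point above, something like this (a
rough sketch, untested) is roughly what I'd expect the loop to reduce
to:

	for_each_vma_range(vmi, vma, end) {
		vm_flags_t newflags = vma->vm_flags | VM_SEALED;
		int error;

		/*
		 * check_mm_seal() already ensured the range is contiguous,
		 * so each VMA's own boundaries, clamped to [start, end),
		 * give the fixup range without tracking nstart or tmp.
		 */
		error = mseal_fixup(&vmi, vma, &prev,
				    max(vma->vm_start, start),
				    min(vma->vm_end, end), newflags);
		if (error)
			return error;
	}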