Re: [PATCH v2 2/5] mm: add PTE_MARKER_GUARD PTE marker

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10/20/24 18:20, Lorenzo Stoakes wrote:
> Add a new PTE marker that results in any access causing the accessing
> process to segfault.
> 
> This is preferable to PTE_MARKER_POISONED, which results in the same
> handling as hardware poisoned memory, and is thus undesirable for cases
> where we simply wish to 'soft' poison a range.
> 
> This is in preparation for implementing the ability to specify guard pages
> at the page table level, i.e. ranges that, when accessed, should cause
> process termination.
> 
> Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
> function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
> purpose was simply incorrect.
> 
> We then reuse the same logic to determine whether a zap should clear a
> guard entry - this should only be performed on teardown and never on
> MADV_DONTNEED or the like.

Since I would have personally put MADV_FREE among "or the like" here, it's
surprising to me that it in fact it's tearing down the guard entries now. Is
that intentional? It should be at least mentioned very explicitly. But I'd
really argue against it, as MADV_FREE is to me a weaker form of
MADV_DONTNEED - the existing pages are not zapped immediately but
prioritized for reclaim. If MADV_DONTNEED leaves guard PTEs in place, why
shouldn't MADV_FREE too?

Seems to me rather currently an artifact of MADV_FREE implementation - if it
encounters hwpoison entries it will tear them down because why not, we have
detected a hw memory error and are lucky the program wants to discard the
pages and not access them, so best use the opportunity and get rid of the
PTE entries immediately (if MADV_DONTNEED doesn't do that too, it certainly
could).

But to extend this to guard PTEs which are result of an explicit userspace
action feels wrong, unless the semantics is the same for MADV_DONTEED. The
semantics chosen for MADV_DONTNEED makes sense, so MADV_FREE should behave
the same?

> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> ---
>  include/linux/mm_inline.h |  2 +-
>  include/linux/swapops.h   | 26 ++++++++++++++++++++++++--
>  mm/hugetlb.c              |  3 +++
>  mm/memory.c               | 18 +++++++++++++++---
>  4 files changed, 43 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 355cf46a01a6..1b6a917fffa4 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -544,7 +544,7 @@ static inline pte_marker copy_pte_marker(
>  {
>  	pte_marker srcm = pte_marker_get(entry);
>  	/* Always copy error entries. */
> -	pte_marker dstm = srcm & PTE_MARKER_POISONED;
> +	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
>  
>  	/* Only copy PTE markers if UFFD register matches. */
>  	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index cb468e418ea1..4d0606df0791 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -426,9 +426,15 @@ typedef unsigned long pte_marker;
>   * "Poisoned" here is meant in the very general sense of "future accesses are
>   * invalid", instead of referring very specifically to hardware memory errors.
>   * This marker is meant to represent any of various different causes of this.
> + *
> + * Note that, when encountered by the faulting logic, PTEs with this marker will
> + * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
> + * logic.
>   */
>  #define  PTE_MARKER_POISONED			BIT(1)
> -#define  PTE_MARKER_MASK			(BIT(2) - 1)
> +/* Indicates that, on fault, this PTE will case a SIGSEGV signal to be sent. */
> +#define  PTE_MARKER_GUARD			BIT(2)
> +#define  PTE_MARKER_MASK			(BIT(3) - 1)
>  
>  static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
>  {
> @@ -461,9 +467,25 @@ static inline swp_entry_t make_poisoned_swp_entry(void)
>  }
>  
>  static inline int is_poisoned_swp_entry(swp_entry_t entry)
> +{
> +	/*
> +	 * We treat guard pages as poisoned too as these have the same semantics
> +	 * as poisoned ranges, only with different fault handling.
> +	 */
> +	return is_pte_marker_entry(entry) &&
> +		(pte_marker_get(entry) &
> +		 (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
> +}
> +
> +static inline swp_entry_t make_guard_swp_entry(void)
> +{
> +	return make_pte_marker_entry(PTE_MARKER_GUARD);
> +}
> +
> +static inline int is_guard_swp_entry(swp_entry_t entry)
>  {
>  	return is_pte_marker_entry(entry) &&
> -	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
> +		(pte_marker_get(entry) & PTE_MARKER_GUARD);
>  }
>  
>  /*
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 906294ac85dc..50e3f6ed73ac 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6353,6 +6353,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  				ret = VM_FAULT_HWPOISON_LARGE |
>  				      VM_FAULT_SET_HINDEX(hstate_index(h));
>  				goto out_mutex;
> +			} else if (marker & PTE_MARKER_GUARD) {
> +				ret = VM_FAULT_SIGSEGV;
> +				goto out_mutex;
>  			}
>  		}
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 0f614523b9f4..551455cd453f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
>  	return !folio_test_anon(folio);
>  }
>  
> -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> +static inline bool zap_drop_markers(struct zap_details *details)
>  {
>  	if (!details)
>  		return false;
> @@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>  	if (vma_is_anonymous(vma))
>  		return;
>  
> -	if (zap_drop_file_uffd_wp(details))
> +	if (zap_drop_markers(details))
>  		return;
>  
>  	for (;;) {
> @@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			 * drop the marker if explicitly requested.
>  			 */
>  			if (!vma_is_anonymous(vma) &&
> -			    !zap_drop_file_uffd_wp(details))
> +			    !zap_drop_markers(details))
> +				continue;
> +		} else if (is_guard_swp_entry(entry)) {
> +			/*
> +			 * Ordinary zapping should not remove guard PTE
> +			 * markers. Only do so if we should remove PTE markers
> +			 * in general.
> +			 */
> +			if (!zap_drop_markers(details))
>  				continue;
>  		} else if (is_hwpoison_entry(entry) ||
>  			   is_poisoned_swp_entry(entry)) {
> @@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  	if (marker & PTE_MARKER_POISONED)
>  		return VM_FAULT_HWPOISON;
>  
> +	/* Hitting a guard page is always a fatal condition. */
> +	if (marker & PTE_MARKER_GUARD)
> +		return VM_FAULT_SIGSEGV;
> +
>  	if (pte_marker_entry_uffd_wp(entry))
>  		return pte_marker_handle_uffd_wp(vmf);
>  





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux