Re: [PATCH v3 1/2] mm/uffd: UFFD_FEATURE_WP_UNPOPULATED

On 06.03.23 22:39, Peter Xu wrote:

Note that I wondered for a second if we'd call it "UFFD_FEATURE_WP_MISSING" instead (similar to the definition of MISSING uffd, which triggers when we have nothing mapped).

Just a thought.

[...]

With WP_UNPOPULATED, applications like QEMU can avoid pre-read faulting all the
memory before wr-protecting it when taking a live snapshot.  Quoting from
Muhammad's test result here [3] based on a simple program [4]:

   (1) With huge page disabled
   echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
   ./uffd_wp_perf
   Test DEFAULT: 4
   Test PRE-READ: 1111453 (pre-fault 1101011)
   Test MADVISE: 278276 (pre-fault 266378)
   Test WP-UNPOPULATE: 11712

   (2) With Huge page enabled
   echo always > /sys/kernel/mm/transparent_hugepage/enabled
   ./uffd_wp_perf
   Test DEFAULT: 4
   Test PRE-READ: 22521 (pre-fault 22348)
   Test MADVISE: 4909 (pre-fault 4743)
   Test WP-UNPOPULATE: 14448

There'll be a great perf boost for no-thp case, while for thp enabled with
extreme case of all-thp-zero WP_UNPOPULATED can be slower than MADVISE, but
that's low possibility in reality, also the overhead was not reduced but
postponed until a follow up write on any huge zero thp, so potentitially it

s/potentitially/potentially/

is faster by making the follow up writes slower.
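
For context, roughly how an application would opt into this (just a sketch from my side, untested, using the uapi names from this series; error handling trimmed -- the point being that no pre-read / MADVISE pass is needed before wr-protecting):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MiB, never touched before wr-protect */
	void *area = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	struct uffdio_api api = {
		.api = UFFD_API,
		/* New feature bit from this series; older kernels reject it */
		.features = UFFD_FEATURE_WP_UNPOPULATED,
	};
	if (ioctl(uffd, UFFDIO_API, &api) ||
	    !(api.features & UFFD_FEATURE_WP_UNPOPULATED)) {
		fprintf(stderr, "WP_UNPOPULATED not supported\n");
		return 1;
	}

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* No pre-read or MADV_POPULATE pass: none ptes get wr-protect markers */
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
	return 0;
}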

What I realized is that, interestingly, it's not only the writes but also the reads. In the case of background snapshots we'll be reading all VM memory, I think ... but we could optimize in QEMU by consulting the pagemap to check whether there is anything mapped at all, and not read zeros in that case [an optimization brought up several times already].
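
Something like the following is what I have in mind for that pagemap check (only a sketch on my side, assuming the documented /proc/<pid>/pagemap layout: bit 63 = present, bit 62 = swapped):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/*
 * Return 1 if the page backing @vaddr is populated (present or swapped),
 * 0 if completely unpopulated, -1 on error.  @pagemap_fd is an fd for
 * /proc/self/pagemap (or the pagemap of the VM process).
 */
static int page_is_populated(int pagemap_fd, unsigned long vaddr)
{
	uint64_t entry;
	size_t page_size = (size_t)sysconf(_SC_PAGESIZE);
	off_t offset = (off_t)(vaddr / page_size) * sizeof(entry);

	if (pread(pagemap_fd, &entry, sizeof(entry), offset) != (ssize_t)sizeof(entry))
		return -1;

	/* Present (bit 63) or swapped (bit 62) means there is real content */
	return !!(entry & ((UINT64_C(1) << 63) | (UINT64_C(1) << 62)));
}

If that returns 0, the reader could skip the memory read and simply record a zero page on the destination side instead.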

I am not sure yet if we want to change the QEMU implementation. But anyhow, that's a different discussion.


[1] https://lore.kernel.org/all/20210401092226.102804-4-andrey.gruzdev@xxxxxxxxxxxxx/
[2] https://lore.kernel.org/all/Y+v2HJ8+3i%2FKzDBu@x1n/
[3] https://lore.kernel.org/all/d0eb0a13-16dc-1ac1-653a-78b7273781e3@xxxxxxxxxxxxx/
[4] https://github.com/xzpeter/clibs/blob/master/uffd-test/uffd-wp-perf.c

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
  fs/userfaultfd.c                 | 14 ++++++++
  include/linux/mm_inline.h        |  6 ++++
  include/linux/userfaultfd_k.h    |  6 ++++
  include/uapi/linux/userfaultfd.h | 10 +++++-
  mm/memory.c                      | 56 ++++++++++++++++++++++--------
  mm/mprotect.c                    | 59 ++++++++++++++++++++++++++------
  6 files changed, 126 insertions(+), 25 deletions(-)

[...]

+static vm_fault_t handle_pte_missing(struct vm_fault *vmf)
+{
+	if (vma_is_anonymous(vmf->vma))
+		return do_anonymous_page(vmf);
+	else
+		return do_fault(vmf);
+}
+
  /*
   * This is actually a page-missing access, but with uffd-wp special pte
   * installed.  It means this pte was wr-protected before being unmapped.
@@ -3634,11 +3664,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
  	 * Just in case there're leftover special ptes even after the region
  	 * got unregistered - we can simply clear them.
  	 */
-	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+	if (unlikely(!userfaultfd_wp(vmf->vma)))
  		return pte_marker_clear(vmf);
- /* do_fault() can handle pte markers too like none pte */
-	return do_fault(vmf);
+	return handle_pte_missing(vmf);
  }
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -4008,6 +4037,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
   */
  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
  {
+	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
  	struct vm_area_struct *vma = vmf->vma;
  	struct folio *folio;
  	vm_fault_t ret = 0;
@@ -4041,7 +4071,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
  						vma->vm_page_prot));
  		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  				vmf->address, &vmf->ptl);
-		if (!pte_none(*vmf->pte)) {
+		if (vmf_pte_changed(vmf)) {
  			update_mmu_tlb(vma, vmf->address, vmf->pte);
  			goto unlock;
  		}
@@ -4081,7 +4111,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  			&vmf->ptl);
-	if (!pte_none(*vmf->pte)) {
+	if (vmf_pte_changed(vmf)) {
  		update_mmu_tlb(vma, vmf->address, vmf->pte);
  		goto release;
  	}
@@ -4101,6 +4131,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
  	folio_add_new_anon_rmap(folio, vma, vmf->address);
  	folio_add_lru_vma(folio, vma);
  setpte:
+	if (uffd_wp)
+		entry = pte_mkuffd_wp(entry);
  	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
@@ -4268,7 +4300,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
  {
  	struct vm_area_struct *vma = vmf->vma;
-	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
  	bool write = vmf->flags & FAULT_FLAG_WRITE;
  	bool prefault = vmf->address != addr;
  	pte_t entry;
@@ -4915,12 +4947,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
  		}
  	}
- if (!vmf->pte) {
-		if (vma_is_anonymous(vmf->vma))
-			return do_anonymous_page(vmf);
-		else
-			return do_fault(vmf);
-	}
+	if (!vmf->pte)
+		return handle_pte_missing(vmf);

It would blend in better if it were called "do_pte_missing()".

if (!pte_present(vmf->orig_pte))
  		return do_swap_page(vmf);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 231929f119d9..6a2df93158ee 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -276,7 +276,16 @@ static long change_pte_range(struct mmu_gather *tlb,
  		} else {
  			/* It must be an none page, or what else?.. */
  			WARN_ON_ONCE(!pte_none(oldpte));
-			if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
+
+			/*
+			 * Nobody plays with any none ptes besides
+			 * userfaultfd when applying the protections.
+			 */
+			if (likely(!uffd_wp))
+				continue;
+
+			if (!vma_is_anonymous(vma) ||
+			    userfaultfd_wp_unpopulated(vma)) {

I think it would make sense to replace all 3 instances of this check with a new function (userfaultfd_wp_use_markers()?) and to move some of the documentation from pgtable_populate_needed() in there. A rough sketch follows after the quoted diff below.

  				/*
  				 * For file-backed mem, we need to be able to
  				 * wr-protect a none pte, because even if the
@@ -320,23 +329,53 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
  	return 0;
  }
-/* Return true if we're uffd wr-protecting file-backed memory, or false */
+/*
+ * Return true if we want to split huge thps in change protection
+ * procedure, false otherwise.
+ */
  static inline bool
-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
  {
+	/*
+	 * pte markers only resides in pte level, if we need pte markers,
+	 * we need to split.  We cannot wr-protect shmem thp because file
+	 * thp is handled differently when split by erasing the pmd so far.
+	 */
  	return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
  }
/*
- * If wr-protecting the range for file-backed, populate pgtable for the case
- * when pgtable is empty but page cache exists.  When {pte|pmd|...}_alloc()
- * failed we treat it the same way as pgtable allocation failures during
- * page faults by kicking OOM and returning error.
+ * Return true if we want to populate pgtables in change protection
+ * procedure, false otherwise
+ */
+static inline bool
+pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+	/* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
+	if (!(cp_flags & MM_CP_UFFD_WP))
+		return false;
+
+	/* Either if this is file-based, we need it for pte markers */
+	if (!vma_is_anonymous(vma))
+		return true;
+
+	/*
+	 * Or anonymous, we only need this if WP_ZEROPAGE enabled (to
+	 * install zero pages).

s/WP_ZEROPAGE/WP_UNPOPULATED/

+	 */
+	return userfaultfd_wp_unpopulated(vma);
+}
+
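Roughly what I have in mind for the helper mentioned above (just a sketch, name and exact placement up for discussion; it only folds the open-coded check from above into one place):

/* Whether uffd-wp has to use pte markers to wr-protect currently-none ptes */
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
{
	/* File-backed memory can only wr-protect none ptes with markers */
	if (!vma_is_anonymous(vma))
		return true;

	/* Anonymous memory only needs them with WP_UNPOPULATED enabled */
	return userfaultfd_wp_unpopulated(vma);
}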



--
Thanks,

David / dhildenb




