[RFC v2 PATCH 10/17] mm: Reuse large folios for anonymous memory

When taking a write fault on an anonymous page, attempt to reuse as much
of the folio as possible if it is exclusive to the process.

This avoids a problem where an exclusive, PTE-mapped THP would
previously have all of its pages except the last one CoWed; only the
last page would be reused, leaving the whole original folio hanging
around in addition to all the CoWed pages. This problem is exacerbated
now that we are allocating variable-order folios for anonymous memory.
The reason for this behaviour is that a PTE-mapped THP holds a
reference for each PTE, and the old code took that to mean it was not
exclusively mapped and therefore could not be reused.
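
To illustrate the accounting, here is a sketch (not part of the patch;
the helper names are made up, and it assumes one folio reference per
mapped PTE plus, as in the hunks below, folio_nr_pages() extra
references while the folio is in the swapcache) comparing the old
per-page reuse test with the per-range test this patch moves to:

  static bool old_can_reuse(struct folio *folio)
  {
  	/* An exclusive, fully PTE-mapped large folio always fails this. */
  	return folio_ref_count(folio) <= 1 + folio_test_swapcache(folio);
  }

  static bool new_can_reuse(struct folio *folio, int range_nr)
  {
  	int swaprefs = folio_test_swapcache(folio) ?
  			folio_nr_pages(folio) : 0;

  	/* Expect one folio reference per pte in the candidate range. */
  	return folio_ref_count(folio) <= range_nr + swaprefs;
  }

With a fully PTE-mapped order-2 folio, folio_ref_count() is at least 4
even when the folio is exclusive, so the old test can never succeed,
which is the behaviour described above.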

We now take care to find the region that intersects the underlying
folio, the VMA and the PMD entry, and treat the presence of exactly
that number of references as indicating exclusivity. Note that this
region is not guaranteed to cover the whole folio due to munmap and
mremap.
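
For illustration, the candidate range is essentially the following
three-way intersection (sketch only, not part of the patch: the helper
name is made up, and it omits the address wrap-around clamping that
page_addr() below takes care of):

  static void anon_range_bounds(struct vm_fault *vmf, struct folio *folio,
  				unsigned long *start, unsigned long *end)
  {
  	/* Virtual address at which the first page of the folio is mapped. */
  	unsigned long folio_addr = vmf->address -
  			(folio_page_idx(folio, vmf->page) << PAGE_SHIFT);

  	/* Clamp to the folio, the VMA and the PMD containing the fault. */
  	*start = max3(folio_addr, vmf->vma->vm_start,
  		      ALIGN_DOWN(vmf->address, PMD_SIZE));
  	*end = min3(folio_addr + (folio_nr_pages(folio) << PAGE_SHIFT),
  		    vmf->vma->vm_end,
  		    ALIGN_DOWN(vmf->address, PMD_SIZE) + PMD_SIZE);
  }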

The aim is to reuse as much as possible in one go in order to:
- reduce memory consumption
- reduce number of CoWs
- reduce time spent in fault handler

Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
---
 mm/memory.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 160 insertions(+), 9 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 83835ff5a818..7e2af54fe2e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3038,6 +3038,26 @@ struct anon_folio_range {
 	bool exclusive;
 };

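+/*
+ * Return the virtual address at which @page is mapped, given that @anchor
+ * (another page of the same folio) is mapped at @anchor_addr. On address
+ * space wrap the result saturates to 0 or ULONG_MAX so that callers can
+ * clamp it with max()/min().
+ */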
+static inline unsigned long page_addr(struct page *page,
+				struct page *anchor, unsigned long anchor_addr)
+{
+	unsigned long offset;
+	unsigned long addr;
+
+	offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
+	addr = anchor_addr + offset;
+
+	if (anchor > page) {
+		if (addr > anchor_addr)
+			return 0;
+	} else {
+		if (addr < anchor_addr)
+			return ULONG_MAX;
+	}
+
+	return addr;
+}
+
 /*
  * Returns index of first pte that is not none, or nr if all are none.
  */
@@ -3122,6 +3142,122 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 	return order;
 }

+static void calc_anon_folio_range_reuse(struct vm_fault *vmf,
+					struct folio *folio,
+					struct anon_folio_range *range_out)
+{
+	/*
+	 * The aim here is to determine the biggest range of pages that can be
+	 * reused for this CoW fault if the identified range is responsible for
+	 * all the references on the folio (i.e. it is exclusive) such that:
+	 * - All pages are contained within folio
+	 * - All pages are within VMA
+	 * - All pages are within the same pmd entry as vmf->address
+	 * - vmf->page is contained within the range
+	 * - All covered ptes must be present, physically contiguous and RO
+	 *
+	 * Note that the folio itself may not be naturally aligned in VA space
+	 * due to mremap. We take the largest range we can in order to increase
+	 * our chances of being the exclusive user of the folio, and therefore
+	 * of being able to reuse it. It's possible that the folio crosses a
+	 * pmd boundary, in which case we don't follow it into the next page
+	 * table because this complicates the locking.
+	 *
+	 * Note that the caller may or may not choose to lock the pte. If
+	 * unlocked, the calculation should be considered an estimate that will
+	 * need to be validated under the lock.
+	 */
+
+	struct vm_area_struct *vma = vmf->vma;
+	struct page *page;
+	pte_t *ptep;
+	pte_t pte;
+	bool excl = true;
+	unsigned long start, end;
+	int bloops, floops;
+	int i;
+	unsigned long pfn;
+
+	/*
+	 * Iterate backwards, starting with the page immediately before the
+	 * anchor page. On exit from the loop, start is the inclusive start
+	 * virtual address of the range.
+	 */
+
+	start = page_addr(&folio->page, vmf->page, vmf->address);
+	start = max(start, vma->vm_start);
+	start = max(start, ALIGN_DOWN(vmf->address, PMD_SIZE));
+	bloops = (vmf->address - start) >> PAGE_SHIFT;
+
+	page = vmf->page - 1;
+	ptep = vmf->pte - 1;
+	pfn = page_to_pfn(vmf->page) - 1;
+
+	for (i = 0; i < bloops; i++) {
+		pte = *ptep;
+
+		if (!pte_present(pte) ||
+		    pte_write(pte) ||
+		    pte_protnone(pte) ||
+		    pte_pfn(pte) != pfn) {
+			start = vmf->address - (i << PAGE_SHIFT);
+			break;
+		}
+
+		if (excl && !PageAnonExclusive(page))
+			excl = false;
+
+		pfn--;
+		ptep--;
+		page--;
+	}
+
+	/*
+	 * Iterate forward, starting with the anchor page. On exit from the
+	 * loop, end is the exclusive end virtual address of the range.
+	 */
+
+	end = page_addr(&folio->page + folio_nr_pages(folio),
+			vmf->page, vmf->address);
+	end = min(end, vma->vm_end);
+	end = min(end, ALIGN_DOWN(vmf->address, PMD_SIZE) + PMD_SIZE);
+	floops = (end - vmf->address) >> PAGE_SHIFT;
+
+	page = vmf->page;
+	ptep = vmf->pte;
+	pfn = page_to_pfn(vmf->page);
+
+	for (i = 0; i < floops; i++) {
+		pte = *ptep;
+
+		if (!pte_present(pte) ||
+		    pte_write(pte) ||
+		    pte_protnone(pte) ||
+		    pte_pfn(pte) != pfn) {
+			end = vmf->address + (i << PAGE_SHIFT);
+			break;
+		}
+
+		if (excl && !PageAnonExclusive(page))
+			excl = false;
+
+		pfn++;
+		ptep++;
+		page++;
+	}
+
+	/*
+	 * Fill in the output range: the inclusive start address, the first
+	 * page and pte of the range, the number of pages, and whether every
+	 * page in the range was mapped exclusively.
+	 */
+
+	range_out->va_start = start;
+	range_out->pg_start = vmf->page - ((vmf->address - start) >> PAGE_SHIFT);
+	range_out->pte_start = vmf->pte - ((vmf->address - start) >> PAGE_SHIFT);
+	range_out->nr = (end - start) >> PAGE_SHIFT;
+	range_out->exclusive = excl;
+}
+
 /*
  * Handle write page faults for pages that can be reused in the current vma
  *
@@ -3528,13 +3664,23 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	/*
 	 * Private mapping: create an exclusive anonymous page copy if reuse
 	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
+	 * For anonymous memory, we attempt to copy/reuse in folios rather than
+	 * page-by-page. We always prefer reuse above copy, even if we can only
+	 * reuse a subset of the folio. Note that when reusing pages in a folio,
+	 * due to munmap, mremap and friends, the folio isn't guaranteed to be
+	 * naturally aligned in virtual memory space.
 	 */
 	if (folio && folio_test_anon(folio)) {
+		struct anon_folio_range range;
+		int swaprefs;
+
+		calc_anon_folio_range_reuse(vmf, folio, &range);
+
 		/*
-		 * If the page is exclusive to this process we must reuse the
-		 * page without further checks.
+		 * If the pages have already been proven to be exclusive to this
+		 * process we must reuse the pages without further checks.
 		 */
-		if (PageAnonExclusive(vmf->page))
+		if (range.exclusive)
 			goto reuse;

 		/*
@@ -3544,7 +3690,10 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		 *
 		 * KSM doesn't necessarily raise the folio refcount.
 		 */
-		if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
+		swaprefs = folio_test_swapcache(folio) ?
+				folio_nr_pages(folio) : 0;
+		if (folio_test_ksm(folio) ||
+		    folio_ref_count(folio) > range.nr + swaprefs + 1)
 			goto copy;
 		if (!folio_test_lru(folio))
 			/*
@@ -3552,29 +3701,31 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 			 * remote LRU pagevecs or references to LRU folios.
 			 */
 			lru_add_drain();
-		if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
+		if (folio_ref_count(folio) > range.nr + swaprefs)
 			goto copy;
 		if (!folio_trylock(folio))
 			goto copy;
 		if (folio_test_swapcache(folio))
 			folio_free_swap(folio);
-		if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+		if (folio_test_ksm(folio) ||
+		    folio_ref_count(folio) != range.nr) {
 			folio_unlock(folio);
 			goto copy;
 		}
 		/*
-		 * Ok, we've got the only folio reference from our mapping
+		 * Ok, we've got the only folio references from our mapping
 		 * and the folio is locked, it's dark out, and we're wearing
 		 * sunglasses. Hit it.
 		 */
-		page_move_anon_rmap(vmf->page, vma);
+		folio_move_anon_rmap_range(folio, range.pg_start,
+							range.nr, vma);
 		folio_unlock(folio);
 reuse:
 		if (unlikely(unshare)) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
 			return 0;
 		}
-		wp_page_reuse(vmf, NULL);
+		wp_page_reuse(vmf, &range);
 		return 0;
 	}
 copy:
--
2.25.1




