Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 06/01/25 3:47 pm, Usama Arif wrote:

On 16/12/2024 16:51, Dev Jain wrote:
In contrast to PMD-collapse, we do not need to operate on two levels of pagetable
simultaneously. Therefore, downgrade the mmap lock from write to read mode. Still
take the anon_vma lock in exclusive mode so as to not waste time in the rmap path,
which is anyways going to fail since the PTEs are going to be changed. Under the PTL,
copy page contents, clear the PTEs, remove folio pins, and (try to) unmap the
old folios. Set the PTEs to the new folio using the set_ptes() API.

Signed-off-by: Dev Jain <dev.jain@xxxxxxx>
---
Note: I have been trying hard to get rid of the locks in here: we still are
taking the PTL around the page copying; dropping the PTL and taking it after
the copying should lead to a deadlock, for example:
khugepaged						madvise(MADV_COLD)
folio_lock()						lock(ptl)
lock(ptl)						folio_lock()

We can create a locked folio list, altogether drop both the locks, take the PTL,
do everything which __collapse_huge_page_isolate() does *except* the isolation and
again try locking folios, but then it will reduce efficiency of khugepaged
and almost looks like a forced solution :)
Please note the following discussion if anyone is interested:
https://lore.kernel.org/all/66bb7496-a445-4ad7-8e56-4f2863465c54@xxxxxxx/
(Apologies for not CCing the mailing list from the start)

  mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++++++++++----------
  1 file changed, 87 insertions(+), 21 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88beebef773e..8040b130e677 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -714,24 +714,28 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
  						struct vm_area_struct *vma,
  						unsigned long address,
  						spinlock_t *ptl,
-						struct list_head *compound_pagelist)
+						struct list_head *compound_pagelist, int order)
  {
  	struct folio *src, *tmp;
  	pte_t *_pte;
  	pte_t pteval;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+	for (_pte = pte; _pte < pte + (1UL << order);
  	     _pte++, address += PAGE_SIZE) {
  		pteval = ptep_get(_pte);
  		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
  			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
  			if (is_zero_pfn(pte_pfn(pteval))) {
-				/*
-				 * ptl mostly unnecessary.
-				 */
-				spin_lock(ptl);
-				ptep_clear(vma->vm_mm, address, _pte);
-				spin_unlock(ptl);
+				if (order == HPAGE_PMD_ORDER) {
+					/*
+					* ptl mostly unnecessary.
+					*/
+					spin_lock(ptl);
+					ptep_clear(vma->vm_mm, address, _pte);
+					spin_unlock(ptl);
+				} else {
+					ptep_clear(vma->vm_mm, address, _pte);
+				}
  				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
  			}
  		} else {
@@ -740,15 +744,20 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
  			src = page_folio(src_page);
  			if (!folio_test_large(src))
  				release_pte_folio(src);
-			/*
-			 * ptl mostly unnecessary, but preempt has to
-			 * be disabled to update the per-cpu stats
-			 * inside folio_remove_rmap_pte().
-			 */
-			spin_lock(ptl);
-			ptep_clear(vma->vm_mm, address, _pte);
-			folio_remove_rmap_pte(src, src_page, vma);
-			spin_unlock(ptl);
+			if (order == HPAGE_PMD_ORDER) {
+				/*
+				* ptl mostly unnecessary, but preempt has to
+				* be disabled to update the per-cpu stats
+				* inside folio_remove_rmap_pte().
+				*/
+				spin_lock(ptl);
+				ptep_clear(vma->vm_mm, address, _pte);
+				folio_remove_rmap_pte(src, src_page, vma);
+				spin_unlock(ptl);
+			} else {
+				ptep_clear(vma->vm_mm, address, _pte);
+				folio_remove_rmap_pte(src, src_page, vma);
+			}
  			free_page_and_swap_cache(src_page);
  		}
  	}
@@ -807,7 +816,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
  static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
  		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
  		unsigned long address, spinlock_t *ptl,
-		struct list_head *compound_pagelist)
+		struct list_head *compound_pagelist, int order)
  {
  	unsigned int i;
  	int result = SCAN_SUCCEED;
@@ -815,7 +824,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
  	/*
  	 * Copying pages' contents is subject to memory poison at any iteration.
  	 */
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
+	for (i = 0; i < (1 << order); i++) {
  		pte_t pteval = ptep_get(pte + i);
  		struct page *page = folio_page(folio, i);
  		unsigned long src_addr = address + i * PAGE_SIZE;
@@ -834,7 +843,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
if (likely(result == SCAN_SUCCEED))
  		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
-						    compound_pagelist);
+						    compound_pagelist, order);
  	else
  		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
  						 compound_pagelist, order);
@@ -1196,7 +1205,7 @@ static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long addre
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
  					   vma, address, pte_ptl,
-					   &compound_pagelist);
+					   &compound_pagelist, HPAGE_PMD_ORDER);
  	pte_unmap(pte);
  	if (unlikely(result != SCAN_SUCCEED))
  		goto out_up_write;
@@ -1228,6 +1237,61 @@ static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long addre
  	return result;
  }
+/* Enter with mmap read lock */
+static int vma_collapse_anon_folio(struct mm_struct *mm, unsigned long address,
+		struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
+		struct folio *folio, int order)
+{
+	int result;
+	struct mmu_notifier_range range;
+	spinlock_t *pte_ptl;
+	LIST_HEAD(compound_pagelist);
+	pte_t *pte;
+	pte_t entry;
+	int nr_pages = folio_nr_pages(folio);
+
+	anon_vma_lock_write(vma->anon_vma);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
+				address + (PAGE_SIZE << order));
+	mmu_notifier_invalidate_range_start(&range);
+
+	pte = pte_offset_map_lock(mm, pmd, address, &pte_ptl);
+	if (pte)
+		result = __collapse_huge_page_isolate(vma, address, pte, cc,
+						      &compound_pagelist, order);
+	else
+		result = SCAN_PMD_NULL;
+
+	if (unlikely(result != SCAN_SUCCEED))
+		goto out_up_read;
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	__folio_mark_uptodate(folio);
+	entry = mk_pte(&folio->page, vma->vm_page_prot);
+	entry = maybe_mkwrite(entry, vma);
+
+	result = __collapse_huge_page_copy(pte, folio, pmd, *pmd,
+					   vma, address, pte_ptl,
+					   &compound_pagelist, order);
+	if (unlikely(result != SCAN_SUCCEED))
+		goto out_up_read;
+
+	folio_ref_add(folio, nr_pages - 1);
+	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
+	folio_add_lru_vma(folio, vma);
+	deferred_split_folio(folio, false);
Hi Dev,

You are adding the lower order folios to the deferred split queue,
but you havent changed the THP shrinker to take this into account.

Thanks for the observation!


At memory pressure you will be doing a lot of work checking the contents of
all mTHP pages which will be wasted unless you change the shrinker, something
like below (unbuilt, untested) might work:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c89aed1510f1..f9586df40f67 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3788,7 +3788,7 @@ static bool thp_underused(struct folio *folio)
                 kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
                 if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
                         num_zero_pages++;
-                       if (num_zero_pages > khugepaged_max_ptes_none) {
+                       if (num_zero_pages > khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - folio_order(folio))) {
                                 kunmap_local(kaddr);
                                 return true;
                         }


The question is, do we want the shrinker to be run for lower order mTHPs? It can consume
a lot of CPU cycles and not be as useful as PMD order THPs. So instead of above, we could
disable THP shrinker for lower orders?

Your suggestion makes sense to me. Freeing two (PMD - 1) mTHPs gives us the same amount of memory
as freeing one PMD THP, with the downside being, one extra iteration, one extra freeing, long deferred list
etc, basically more overhead.

+	set_ptes(mm, address, pte, entry, nr_pages);
+	update_mmu_cache_range(NULL, vma, address, pte, nr_pages);
+	pte_unmap_unlock(pte, pte_ptl);
+	mmu_notifier_invalidate_range_end(&range);
+	result = SCAN_SUCCEED;
+
+out_up_read:
+	mmap_read_unlock(mm);
+	return result;
+}
+
  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
  			      int referenced, int unmapped, int order,
  			      struct collapse_control *cc)
@@ -1276,6 +1340,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (order == HPAGE_PMD_ORDER)
  		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
+	else
+		result = vma_collapse_anon_folio(mm, address, vma, cc, pmd, folio, order);
if (result == SCAN_SUCCEED)
  		folio = NULL;




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux