The idea of hugepage collapse in process context was previously
introduced by David Rientjes on linux-mm[1].

Introduce a new madvise mode, MADV_COLLAPSE, that allows users to
request a synchronous collapse of memory.

The benefits of this approach are:

* CPU is charged to the process that wants to spend the cycles for
  the THP
* avoids the unpredictable timing of khugepaged collapse
* allows a flexible separation of sync userspace and async khugepaged
  THP collapse policies

Immediate users of this new functionality include:

* malloc implementations that manage memory in hugepage-sized chunks,
  but sometimes subrelease memory back to the system in native-sized
  chunks via MADV_DONTNEED, zapping the pmd.  Later, when the memory
  is hot, the implementation can madvise(MADV_COLLAPSE) to re-back
  the memory with THPs to regain TLB performance.
* immediately backing executable text with hugepages.  The current
  support provided by CONFIG_READ_ONLY_THP_FOR_FS may take too long
  on a large system.

To keep patches digestible, introduce MADV_COLLAPSE in a few stages.

Add plumbing to the existing madvise infrastructure and populate the
uapi header files, leaving the actual madvise(MADV_COLLAPSE) handler
stubbed out.  Only privately-mapped anon memory is supported for now.

[1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@xxxxxxxxxx/

Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>
---
 include/linux/huge_mm.h                | 12 +++++++
 include/uapi/asm-generic/mman-common.h |  2 ++
 mm/khugepaged.c                        | 46 ++++++++++++++++++++++++++
 mm/madvise.c                           |  5 +++
 4 files changed, 65 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fd905b0b2c71..407b63ab4185 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 
 int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
 		     int advice);
+int madvise_collapse(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end);
 void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end, long adjust_next);
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -383,6 +386,15 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
 	BUG();
 	return 0;
 }
+
+static inline int madvise_collapse(struct vm_area_struct *vma,
+				   struct vm_area_struct **prev,
+				   unsigned long start, unsigned long end)
+{
+	BUG();
+	return 0;
+}
+
 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 unsigned long start,
 					 unsigned long end,
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6c1aa92a92e4..6ce1f1ceb432 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -77,6 +77,8 @@
 
 #define MADV_DONTNEED_LOCKED	24	/* like DONTNEED, but drop locked pages too */
 
+#define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 12ae765c5c32..ca1e523086ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2519,3 +2519,49 @@ void khugepaged_min_free_kbytes_update(void)
 	set_recommended_min_free_kbytes();
 	mutex_unlock(&khugepaged_mutex);
 }
+
+/*
+ * Returns 0 if successfully able to collapse range into THPs (or range already
+ * backed by THPs). Due to implementation detail, THPs collapsed here may be
+ * split again before this function returns.
+ */
+static int _madvise_collapse(struct mm_struct *mm,
+			     struct vm_area_struct *vma,
+			     struct vm_area_struct **prev,
+			     unsigned long start,
+			     unsigned long end, gfp_t gfp,
+			     struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return -ENOSYS;
+}
+
+int madvise_collapse(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev, unsigned long start,
+		     unsigned long end)
+{
+	struct collapse_control cc;
+	gfp_t gfp;
+	int error;
+	struct mm_struct *mm = vma->vm_mm;
+
+	/* Requested to hold mmap_lock in read */
+	mmap_assert_locked(mm);
+
+	mmgrab(mm);
+	collapse_control_init(&cc, /* enforce_pte_scan_limits= */ false);
+	gfp = vma_thp_gfp_mask(vma);
+	lru_add_drain(); /* lru_add_drain_all() too heavy here */
+	error = _madvise_collapse(mm, vma, prev, start, end, gfp, &cc);
+	mmap_assert_locked(mm);
+	mmdrop(mm);
+
+	/*
+	 * madvise() returns EAGAIN if kernel resources are temporarily
+	 * unavailable.
+	 */
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+
+	return error;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 5b6d796e55de..292aa017c150 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -58,6 +58,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_FREE:
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
+	case MADV_COLLAPSE:
 		return 0;
 	default:
 		/* be safe, default to 1. list exceptions explicitly */
@@ -1046,6 +1047,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		if (error)
 			goto out;
 		break;
+	case MADV_COLLAPSE:
+		return madvise_collapse(vma, prev, start, end);
 	}
 
 	anon_name = anon_vma_name(vma);
@@ -1139,6 +1142,7 @@ madvise_behavior_valid(int behavior)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
+	case MADV_COLLAPSE:
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
@@ -1328,6 +1332,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
+*  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
--
2.35.1.616.g0bdcbb4464-goog
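
For anyone wanting to experiment, a minimal, hypothetical userspace
sketch of the malloc-style usage described in the commit message. It is
not part of this patch; it assumes the MADV_COLLAPSE value proposed
here (25), 2MiB PMD-sized THPs, and a libc <sys/mman.h> that does not
yet define the flag:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* value proposed by this patch */
#endif

#define HPAGE_SIZE (2UL << 20)	/* assumes 2MiB PMD-sized THPs */

int main(void)
{
	/* Over-allocate so we can carve out a PMD-aligned hugepage region. */
	size_t len = 2 * HPAGE_SIZE;
	char *raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	char *buf = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) &
			     ~(HPAGE_SIZE - 1));

	memset(buf, 1, HPAGE_SIZE);	/* fault in; may be THP-backed */

	/* Subrelease one native-sized chunk, zapping the pmd. */
	madvise(buf, 4096, MADV_DONTNEED);

	memset(buf, 2, HPAGE_SIZE);	/* memory is hot again */

	/* Request a synchronous re-collapse of the range. */
	if (madvise(buf, HPAGE_SIZE, MADV_COLLAPSE)) {
		if (errno == EINVAL || errno == ENOSYS)
			fprintf(stderr, "MADV_COLLAPSE not supported\n");
		else if (errno == EAGAIN)
			fprintf(stderr, "resources temporarily unavailable\n");
	}

	munmap(raw, len);
	return 0;
}

On a kernel with only this patch applied, the madvise(MADV_COLLAPSE)
call fails with ENOSYS from the stubbed-out handler; on kernels without
the series it fails with EINVAL, and EAGAIN corresponds to the
-ENOMEM to -EAGAIN translation in madvise_collapse() above.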