[PATCH 1/1] mm: introduce MADV_DEMOTE/MADV_PROMOTE

BiscuitOS Broiler <zhang.renze@xxxxxxx> · Thu, 1 Aug 2024 14:50:35 +0800

In a tiered memory architecture, when a process does not access memory
in the fast nodes for a long time, the kernel will demote the memory
to slower memory through a reclamation mechanism. This frees up the
fast memory for other processes. When the process accesses the demoted
memory again, the tiered memory system will, following certain
policies, promote it back to fast memory. Because demotion and
promotion in a tiered memory system happen gradually rather than
instantly, the resulting latency can severely impact the performance of
programs in high-performance computing scenarios.

This patch introduces new MADV_DEMOTE and MADV_PROMOTE hints to the
madvise syscall. MADV_DEMOTE can mark a range of memory pages as cold
pages and immediately demote them to slow memory. MADV_PROMOTE can mark
a range of memory pages as hot pages and immediately promote them to
fast memory, allowing applications to better balance large memory
capacity with latency.

Signed-off-by: BiscuitOS Broiler <zhang.renze@xxxxxxx>
---
 arch/alpha/include/uapi/asm/mman.h           |   3 +
 arch/mips/include/uapi/asm/mman.h            |   3 +
 arch/parisc/include/uapi/asm/mman.h          |   3 +
 arch/xtensa/include/uapi/asm/mman.h          |   3 +
 include/uapi/asm-generic/mman-common.h       |   3 +
 mm/internal.h                                |   1 +
 mm/madvise.c                                 | 251 +++++++++++++++++++
 mm/vmscan.c                                  |  57 +++++
 tools/include/uapi/asm-generic/mman-common.h |   3 +
 9 files changed, 327 insertions(+)

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 763929e814e9..98e7609d51ab 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -78,6 +78,9 @@

 #define MADV_COLLAPSE  25              /* Synchronous hugepage collapse */

+#define MADV_DEMOTE    26              /* Demote page into slow node */
+#define MADV_PROMOTE   27              /* Promote page into fast node */
+
 /* compatibility flags */
 #define MAP_FILE       0

diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 9c48d9a21aa0..aae4cd01c20d 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -105,6 +105,9 @@

 #define MADV_COLLAPSE  25              /* Synchronous hugepage collapse */

+#define MADV_DEMOTE    26              /* Demote page into slow node */
+#define MADV_PROMOTE   27              /* Promote page into fast node */
+
 /* compatibility flags */
 #define MAP_FILE       0

diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 68c44f99bc93..8b50ac91d0c9 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -72,6 +72,9 @@

 #define MADV_COLLAPSE  25              /* Synchronous hugepage collapse */

+#define MADV_DEMOTE    26              /* Demote page into slow node */
+#define MADV_PROMOTE   27              /* Promote page into fast node */
+
 #define MADV_HWPOISON     100          /* poison a page for testing */
 #define MADV_SOFT_OFFLINE 101          /* soft offline page for testing */

diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 1ff0c858544f..8f820d4f5901 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -113,6 +113,9 @@

 #define MADV_COLLAPSE  25              /* Synchronous hugepage collapse */

+#define MADV_DEMOTE    26              /* Demote page into slow node */
+#define MADV_PROMOTE   27              /* Promote page into fast node */
+
 /* compatibility flags */
 #define MAP_FILE       0

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..52222c2245a8 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,9 @@

 #define MADV_COLLAPSE  25              /* Synchronous hugepage collapse */

+#define MADV_DEMOTE    26              /* Demote page into slow node */
+#define MADV_PROMOTE   27              /* Promote page into fast node */
+
 /* compatibility flags */
 #define MAP_FILE       0

diff --git a/mm/internal.h b/mm/internal.h
index 7a3bcc6d95e7..105c2621e335 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1096,6 +1096,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
 extern void set_pageblock_order(void);
 struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
 unsigned long reclaim_pages(struct list_head *folio_list);
+unsigned long demotion_pages(struct list_head *folio_list);
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *folio_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
diff --git a/mm/madvise.c b/mm/madvise.c
index 89089d84f8df..9e41936a2dc5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -31,6 +31,9 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
+#include <linux/memory-tiers.h>
+#include <linux/migrate.h>
+#include <linux/sched/numa_balancing.h>

 #include <asm/tlb.h>

@@ -56,6 +59,8 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_DONTNEED_LOCKED:
        case MADV_COLD:
        case MADV_PAGEOUT:
+       case MADV_DEMOTE:
+       case MADV_PROMOTE:
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
@@ -639,6 +644,242 @@ static long madvise_pageout(struct vm_area_struct *vma,
        return 0;
 }

+/*
+ * pmd_entry callback for MADV_DEMOTE. Isolates small, exclusively mapped
+ * anonymous LRU folios that sit on a top-tier node with a demotion target,
+ * then passes them to demotion_pages() once the PTE lock is dropped.
+ */
+static int madvise_demotion_pte_range(pmd_t *pmd,
+                               unsigned long addr, unsigned long end,
+                               struct mm_walk *walk)
+{
+       struct mmu_gather *tlb = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       struct mm_struct *mm = tlb->mm;
+       pte_t *start_pte, *pte, ptent;
+       struct folio *folio = NULL;
+       LIST_HEAD(folio_list);
+       spinlock_t *ptl;
+       int nid;
+
+       if (fatal_signal_pending(current))
+               return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (pmd_trans_huge(*pmd))
+               return 0;
+#endif
+       tlb_change_page_size(tlb, PAGE_SIZE);
+       start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+       if (!start_pte)
+               return 0;
+       flush_tlb_batched_pending(mm);
+       arch_enter_lazy_mmu_mode();
+       for (; addr < end; pte++, addr += PAGE_SIZE) {
+               ptent = ptep_get(pte);
+               if (pte_none(ptent))
+                       continue;
+               if (!pte_present(ptent))
+                       continue;
+
+               folio = vm_normal_folio(vma, addr, ptent);
+               if (!folio || folio_is_zone_device(folio))
+                       continue;
+               if (folio_test_large(folio))
+                       continue;
+               if (!folio_test_anon(folio))
+                       continue;
+
+               nid = folio_nid(folio);
+               if (!node_is_toptier(nid))
+                       continue;
+               /* no demotion target for this node */
+               if (next_demotion_node(nid) == NUMA_NO_NODE)
+                       continue;
+
+               /*
+                * Do not interfere with other mappings of this folio or with
+                * non-LRU folios. Large folios were skipped above, so
+                * folio_nr_pages() is 1 here and the check below only lets
+                * through a folio that is mapped exactly once.
+                */
+               if (!folio_test_lru(folio) ||
+                   folio_mapcount(folio) != folio_nr_pages(folio))
+                       continue;
+
+               folio_clear_referenced(folio);
+               folio_test_clear_young(folio);
+               if (folio_test_active(folio))
+                       folio_set_workingset(folio);
+               if (folio_isolate_lru(folio)) {
+                       if (folio_test_unevictable(folio))
+                               folio_putback_lru(folio);
+                       else
+                               list_add(&folio->lru, &folio_list);
+               }
+       }
+
+       if (start_pte) {
+               arch_leave_lazy_mmu_mode();
+               pte_unmap_unlock(start_pte, ptl);
+       }
+
+       demotion_pages(&folio_list);
+       cond_resched();
+
+       return 0;
+}
+
+static const struct mm_walk_ops demotion_walk_ops = {
+       .pmd_entry = madvise_demotion_pte_range,
+       .walk_lock = PGWALK_RDLOCK,
+};
+
+static void madvise_demotion_page_range(struct mmu_gather *tlb,
+                            struct vm_area_struct *vma,
+                            unsigned long addr, unsigned long end)
+{
+       tlb_start_vma(tlb, vma);
+       walk_page_range(vma->vm_mm, addr, end, &demotion_walk_ops, tlb);
+       tlb_end_vma(tlb, vma);
+}
+
+/* MADV_DEMOTE handler: demote eligible pages in [start_addr, end_addr). */
+static long madvise_demotion(struct vm_area_struct *vma,
+                       struct vm_area_struct **prev,
+                       unsigned long start_addr, unsigned long end_addr)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_gather tlb;
+
+       *prev = vma;
+       if (!can_madv_lru_vma(vma))
+               return -EINVAL;
+       /* Demotion disabled, or a shared file mapping: nothing to do. */
+       if (!numa_demotion_enabled ||
+           (!vma_is_anonymous(vma) && (vma->vm_flags & VM_MAYSHARE)))
+               return 0;
+
+       lru_add_drain();
+       tlb_gather_mmu(&tlb, mm);
+       madvise_demotion_page_range(&tlb, vma, start_addr, end_addr);
+       tlb_finish_mmu(&tlb);
+       return 0;
+}
+
+/*
+ * Promote small, exclusively mapped anonymous folios in [addr, end) from a
+ * non-top-tier node to the current CPU's node. Folios are isolated under the
+ * PTE lock and migrated only after the lock is dropped.
+ */
+static int madvise_promotion_pte_range(pmd_t *pmd,
+                               unsigned long addr, unsigned long end,
+                               struct mm_walk *walk)
+{
+       struct mmu_gather *tlb = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       struct mm_struct *mm = tlb->mm;
+       pte_t *start_pte, *pte, ptent;
+       struct folio *folio, *next;
+       LIST_HEAD(folio_list);
+       int nid, target_nid;
+       spinlock_t *ptl;
+
+       if (fatal_signal_pending(current))
+               return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (pmd_trans_huge(*pmd))
+               return 0;
+#endif
+       tlb_change_page_size(tlb, PAGE_SIZE);
+       start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+       if (!start_pte)
+               return 0;
+       flush_tlb_batched_pending(mm);
+       arch_enter_lazy_mmu_mode();
+       target_nid = numa_node_id();
+       for (; addr < end; pte++, addr += PAGE_SIZE) {
+               ptent = ptep_get(pte);
+               if (pte_none(ptent) || !pte_present(ptent))
+                       continue;
+
+               folio = vm_normal_folio(vma, addr, ptent);
+               if (!folio || folio_is_zone_device(folio))
+                       continue;
+               if (folio_test_large(folio) || !folio_test_anon(folio))
+                       continue;
+
+               /* skip page on fast node */
+               nid = folio_nid(folio);
+               if (node_is_toptier(nid))
+                       continue;
+
+               /* leave non-LRU folios and folios mapped elsewhere alone */
+               if (!folio_test_lru(folio) ||
+                   folio_mapcount(folio) != folio_nr_pages(folio))
+                       continue;
+
+               /* force update folio last access time */
+               folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+               if (!should_numa_migrate_memory(current, folio, nid, target_nid))
+                       continue;
+
+               /* prepare promote: only queue here, migrate after unlock */
+               if (folio_isolate_lru(folio))
+                       list_add_tail(&folio->lru, &folio_list);
+       }
+       arch_leave_lazy_mmu_mode();
+       pte_unmap_unlock(start_pte, ptl);
+
+       /* promote outside the PTE lock, with the page table unmapped */
+       list_for_each_entry_safe(folio, next, &folio_list, lru) {
+               list_del(&folio->lru);
+               migrate_misplaced_folio(folio, vma, target_nid);
+       }
+       cond_resched();
+
+       return 0;
+}
+
+static const struct mm_walk_ops promotion_walk_ops = {
+       .pmd_entry = madvise_promotion_pte_range,
+       .walk_lock = PGWALK_RDLOCK,
+};
+
+static void madvise_promotion_page_range(struct mmu_gather *tlb,
+                            struct vm_area_struct *vma,
+                            unsigned long addr, unsigned long end)
+{
+       tlb_start_vma(tlb, vma);
+       walk_page_range(vma->vm_mm, addr, end, &promotion_walk_ops, tlb);
+       tlb_end_vma(tlb, vma);
+}
+
+/* MADV_PROMOTE handler: walk [start_addr, end_addr) of @vma and promote. */
+static long madvise_promotion(struct vm_area_struct *vma,
+                       struct vm_area_struct **prev,
+                       unsigned long start_addr, unsigned long end_addr)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_gather tlb;
+
+       *prev = vma;
+       if (!can_madv_lru_vma(vma))
+               return -EINVAL;
+       /* NOTE(review): promotion keyed on numa_demotion_enabled - confirm. */
+       if (!numa_demotion_enabled && !vma_is_anonymous(vma) &&
+                               (vma->vm_flags & VM_MAYSHARE))
+               return 0;
+
+       lru_add_drain();
+       tlb_gather_mmu(&tlb, mm);
+       madvise_promotion_page_range(&tlb, vma, start_addr, end_addr);
+       tlb_finish_mmu(&tlb);
+       return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)

@@ -1040,6 +1281,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
                return madvise_cold(vma, prev, start, end);
        case MADV_PAGEOUT:
                return madvise_pageout(vma, prev, start, end);
+       case MADV_DEMOTE:
+               return madvise_demotion(vma, prev, start, end);
+       case MADV_PROMOTE:
+               return madvise_promotion(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
@@ -1179,6 +1424,8 @@ madvise_behavior_valid(int behavior)
        case MADV_FREE:
        case MADV_COLD:
        case MADV_PAGEOUT:
+       case MADV_DEMOTE:
+       case MADV_PROMOTE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
 #ifdef CONFIG_KSM
@@ -1210,6 +1457,8 @@ static bool process_madvise_behavior_valid(int behavior)
        switch (behavior) {
        case MADV_COLD:
        case MADV_PAGEOUT:
+       case MADV_DEMOTE:
+       case MADV_PROMOTE:
        case MADV_WILLNEED:
        case MADV_COLLAPSE:
                return true;
@@ -1391,6 +1640,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  *             triggering read faults if required
  *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
  *             triggering write faults if required
+ *  MADV_DEMOTE  - the application forces pages into slow node.
+ *  MADV_PROMOTE - the application forces pages into fast node.
  *
  * return values:
  *  zero    - success
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c89d0551655e..88d7a1dd05a0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2185,6 +2185,63 @@ unsigned long reclaim_pages(struct list_head *folio_list)
        return nr_reclaimed;
 }

+/* Shrink one node's isolated folios, then put any leftovers back on the LRU. */
+static unsigned int demotion_folio_list(struct list_head *folio_list,
+                                     struct pglist_data *pgdat)
+{
+       struct reclaim_stat dummy_stat;
+       unsigned int nr_demoted;
+       struct folio *folio;
+       struct scan_control sc = {
+               .gfp_mask = GFP_KERNEL,
+               .may_writepage = 1,
+               .may_unmap = 1,
+               .may_swap = 1,
+               .no_demotion = 0,
+       };
+
+       nr_demoted = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
+       while (!list_empty(folio_list)) {
+               folio = lru_to_folio(folio_list);
+               list_del(&folio->lru);
+               folio_putback_lru(folio);
+       }
+       return nr_demoted;
+}
+
+/* Pass isolated folios to demotion_folio_list() in runs sharing one node. */
+unsigned long demotion_pages(struct list_head *folio_list)
+{
+       unsigned int nr_demoted = 0;
+       LIST_HEAD(node_folio_list);
+       unsigned int noreclaim_flag;
+       int nid;
+
+       if (list_empty(folio_list))
+               return nr_demoted;
+
+       noreclaim_flag = memalloc_noreclaim_save();
+
+       nid = folio_nid(lru_to_folio(folio_list));
+       do {
+               struct folio *folio = lru_to_folio(folio_list);
+
+               if (nid == folio_nid(folio)) {
+                       folio_clear_active(folio);
+                       list_move(&folio->lru, &node_folio_list);
+                       continue;
+               }
+
+               /* node changed: process the current run, then start the next */
+               nr_demoted += demotion_folio_list(&node_folio_list, NODE_DATA(nid));
+               nid = folio_nid(lru_to_folio(folio_list));
+       } while (!list_empty(folio_list));
+
+       nr_demoted += demotion_folio_list(&node_folio_list, NODE_DATA(nid));
+       memalloc_noreclaim_restore(noreclaim_flag);
+       return nr_demoted;
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
 {
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..52222c2245a8 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,9 @@

 #define MADV_COLLAPSE  25              /* Synchronous hugepage collapse */

+#define MADV_DEMOTE    26              /* Demote page into slow node */
+#define MADV_PROMOTE   27              /* Promote page into fast node */
+
 /* compatibility flags */
 #define MAP_FILE       0

--
2.34.1

-------------------------------------------------------------------------------------------------------------------------------------
本邮件及其附件含有新华三集团的保密信息，仅限于发送给上面地址中列出
的个人或群组。禁止任何其他人以任何形式使用（包括但不限于全部或部分地泄露、复制、
或散发）本邮件中的信息。如果您错收了本邮件，请您立即电话或邮件通知发件人并删除本
邮件！
This e-mail and its attachments contain confidential information from New H3C, which is
intended only for the person or entity whose address is listed above. Any use of the
information contained herein in any way (including, but not limited to, total or partial
disclosure, reproduction, or dissemination) by persons other than the intended
recipient(s) is prohibited. If you receive this e-mail in error, please notify the sender
by phone or email immediately and delete it!