Re: [PATCH 06/10] mm: vmscan: demote anon DRAM pages to PMEM node

On 3/22/19 11:03 PM, Zi Yan wrote:
On 22 Mar 2019, at 21:44, Yang Shi wrote:

Since PMEM provides larger capacity than DRAM and much lower access
latency than disk, it is a good choice to use as a middle tier
between DRAM and disk in the page reclaim path.

With PMEM nodes, the demotion path of anonymous pages could be:

DRAM -> PMEM -> swap device

This patch demotes anonymous pages only for the time being, and demotes
THP to PMEM as a whole.  However, this may cause expensive page reclaim
and/or compaction on the PMEM node if there is memory pressure on it.  But,
considering the capacity of PMEM and that allocation only happens on PMEM
when PMEM is specified explicitly, such cases should not be that frequent.
So, it sounds worth keeping THP as a whole instead of splitting it.

Demote pages to the closest non-DRAM node even when the system is
swapless.  The current page reclaim logic scans the anon LRU only when
swap is on and swappiness is set properly.  Demoting to PMEM doesn't
need to care whether swap is available or not.  But, reclaiming from PMEM
still skips the anon LRU if swap is not available.

The demotion just happens between a DRAM node and its closest PMEM node.
Demoting to a remote PMEM node is not allowed for now.

And, define a new migration reason for demotion, called MR_DEMOTE.
Demote pages via async migration to avoid blocking.
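
For a condensed view of the reclaim-side flow this patch adds (a
sketch stitched together from the hunks below, not new code;
def_alloc_nodemask is assumed to hold the DRAM-only nodes, as set up
earlier in this series):

	/* In shrink_page_list(): divert eligible anon pages off the
	 * reclaim path onto a private demote_pages list. */
	if (PageAnon(page) && PageSwapBacked(page) &&
	    !PageSwapCache(page) &&
	    node_isset(page_to_nid(page), def_alloc_nodemask) &&
	    has_nonram_online()) {
		list_add(&page->lru, &demote_pages);
		unlock_page(page);
		continue;
	}

	/* After the scan, in one batch: pick the closest non-DRAM
	 * node and demote asynchronously so reclaim does not block. */
	target_nid = find_next_best_node(pgdat->node_id, &used_mask, true);
	err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
			    target_nid, MIGRATE_ASYNC, MR_DEMOTE);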

Signed-off-by: Yang Shi <yang.shi@xxxxxxxxxxxxxxxxx>
---
  include/linux/migrate.h        |  1 +
  include/trace/events/migrate.h |  3 +-
  mm/debug.c                     |  1 +
  mm/internal.h                  | 22 ++++++++++
  mm/vmscan.c                    | 99 ++++++++++++++++++++++++++++++++++--------
  5 files changed, 107 insertions(+), 19 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e13d9bf..78c8dda 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -25,6 +25,7 @@ enum migrate_reason {
  	MR_MEMPOLICY_MBIND,
  	MR_NUMA_MISPLACED,
  	MR_CONTIG_RANGE,
+	MR_DEMOTE,
  	MR_TYPES
  };

diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 705b33d..c1d5b36 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@
  	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
  	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
  	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
-	EMe(MR_CONTIG_RANGE,	"contig_range")
+	EM( MR_CONTIG_RANGE,	"contig_range")			\
+	EMe(MR_DEMOTE,		"demote")

  /*
   * First define the enums in the above macros to be exported to userspace
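
(Side note on the EM()/EMe() flip above: EMe() marks the final entry
of the list, and when these macros are re-expanded to build the
symbolic name table only the terminating entry may omit the trailing
comma, roughly:

	#define EM(a, b)	{a, b},	/* intermediate entries */
	#define EMe(a, b)	{a, b}	/* last entry, no trailing comma */

so appending MR_DEMOTE requires turning the former last entry,
MR_CONTIG_RANGE, from EMe() into EM().)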
diff --git a/mm/debug.c b/mm/debug.c
index c0b31b6..cc0d7df 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -25,6 +25,7 @@
  	"mempolicy_mbind",
  	"numa_misplaced",
  	"cma",
+	"demote",
  };

  const struct trace_print_flags pageflag_names[] = {
diff --git a/mm/internal.h b/mm/internal.h
index 46ad0d8..0152300 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -303,6 +303,19 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask,
  }
  #endif

+static inline bool has_nonram_online(void)
+{
+	int i = 0;
+
+	for_each_online_node(i) {
+		/* Have PMEM node online? */
+		if (!node_isset(i, def_alloc_nodemask))
+			return true;
+	}
+
+	return false;
+}
+
  /* mm/util.c */
  void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
  		struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -565,5 +578,14 @@ static inline bool is_migrate_highatomic_page(struct page *page)
  }

  void setup_zone_pageset(struct zone *zone);
+
+#ifdef CONFIG_NUMA
  extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+#else
+static inline struct page *alloc_new_node_page(struct page *page,
+					       unsigned long node)
+{
+	return NULL;
+}
+#endif
  #endif	/* __MM_INTERNAL_H */
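
To illustrate the two new helpers: has_nonram_online() above and the
is_demote_ok() check added to mm/vmscan.c below both key off
def_alloc_nodemask, which earlier patches in this series populate with
the nodes that have DRAM.  Under a hypothetical topology where nodes
0-1 are DRAM and node 2 is PMEM:

	/* node_online_map    = { 0, 1, 2 }
	 * def_alloc_nodemask = { 0, 1 }	(DRAM only)
	 */
	has_nonram_online();		/* true: node 2 is online but
					 * not in def_alloc_nodemask */
	is_demote_ok(NODE_DATA(0));	/* true: DRAM node + PMEM online */
	is_demote_ok(NODE_DATA(2));	/* false: node 2 is not DRAM */
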
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a5ad0b3..bdcab6b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,6 +1094,19 @@ static void page_check_dirty_writeback(struct page *page,
  		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
  }

+static inline bool is_demote_ok(struct pglist_data *pgdat)
+{
+	/* Current node is not DRAM node */
+	if (!node_isset(pgdat->node_id, def_alloc_nodemask))
+		return false;
+
+	/* No online PMEM node */
+	if (!has_nonram_online())
+		return false;
+
+	return true;
+}
+
  /*
   * shrink_page_list() returns the number of reclaimed pages
   */
@@ -1106,6 +1119,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  {
  	LIST_HEAD(ret_pages);
  	LIST_HEAD(free_pages);
+	LIST_HEAD(demote_pages);
  	unsigned nr_reclaimed = 0;

  	memset(stat, 0, sizeof(*stat));
@@ -1262,6 +1276,22 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  		}

  		/*
+		 * Demote DRAM pages regardless the mempolicy.
+		 * Demot anonymous pages only for now and skip MADV_FREE
s/Demot/Demote

Thanks for catching this. Will fix.


+		 * pages.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page) &&
+		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
+		    PageSwapBacked(page)) {
+
+			if (has_nonram_online()) {
+				list_add(&page->lru, &demote_pages);
+				unlock_page(page);
+				continue;
+			}
+		}
+
+		/*
  		 * Anonymous process memory has backing store?
  		 * Try to allocate it some swap space here.
  		 * Lazyfree page could be freed directly
@@ -1477,6 +1507,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
  	}

+	/* Demote pages to PMEM */
+	if (!list_empty(&demote_pages)) {
+		int err, target_nid;
+		nodemask_t used_mask;
+
+		nodes_clear(used_mask);
+		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
+						 true);
+
+		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
+				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
+
+		if (err) {
+			putback_movable_pages(&demote_pages);
+
+			list_splice(&ret_pages, &demote_pages);
+		}
+	}
+
I like your approach here. It reuses the existing migrate_pages() interface without
adding extra code. I also would like to be CC’d in your future versions.

Yes, sure.

Thanks,
Yang


Thank you.

--
Best Regards,
Yan Zi



