[RFC PATCH v1 5/7] powerpc/mm: Add page access count support


The Hot Cold Affinity (HCA) engine is a POWER10 facility that counts each
access to a page and decays the access count if the page is not accessed
within a time window. There is a 32-bit counter for each page.

This patch uses the HCA engine to provide page access counts on POWER10 and
feeds them into the multi-gen LRU to classify each page into the correct LRU
generation. A simple classification mechanism is used: pages are sampled from
the youngest and the oldest generations to find the maximum and minimum page
hotness in the lruvec. This range is later used to sort every page into the
right generation.

The max and min hotness range is established during aging when new generations
are created.
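
For reference, a simplified sketch of the hotness-to-generation mapping that
hca_map_lru_seq() below implements (error handling and the running max/min
adjustment are omitted; the standalone helper name is only illustrative):

	/* Map a folio's hotness into the lruvec's [min_seq, max_seq] window. */
	static unsigned long map_hotness_to_seq(struct lru_gen_struct *lrugen,
						unsigned long hotness,
						int type, int nr_gens)
	{
		/* split the inclusive [min_hotness, max_hotness] range into nr_gens buckets */
		unsigned long seq_range = lrugen->max_hotness - lrugen->min_hotness + 1;

		seq_range = (seq_range + nr_gens - 1) / nr_gens;

		/* the higher the hotness, the younger (larger) the returned sequence */
		return lrugen->min_seq[type] + (hotness - lrugen->min_hotness) / seq_range;
	}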

Not-yet-Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
---
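Notes:

The HCA based aging is gated by the "hca_age" early parameter added below
(parse_hca_age()) and can be toggled at run time through the "hca_lru_age"
debugfs file. Assuming arch_debugfs_dir is the usual /sys/kernel/debug/powerpc
directory, the knobs added by this patch are:

	/sys/kernel/debug/powerpc/hca/hca_lru_age	(boolean toggle)
	/sys/kernel/debug/powerpc/hca/scan-pfn-ratio	(default 20)
	/sys/kernel/debug/powerpc/hca/scan-skip-msec	(default 60)
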
 arch/powerpc/Kconfig                  |  10 +
 arch/powerpc/include/asm/hca.h        |  49 +++++
 arch/powerpc/include/asm/page_aging.h |  35 ++++
 arch/powerpc/mm/Makefile              |   1 +
 arch/powerpc/mm/hca.c                 | 275 ++++++++++++++++++++++++++
 include/linux/mmzone.h                |   5 +
 include/linux/page_aging.h            |   5 +
 mm/Kconfig                            |   4 +
 mm/vmscan.c                           |   5 +-
 9 files changed, 387 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/asm/hca.h
 create mode 100644 arch/powerpc/include/asm/page_aging.h
 create mode 100644 arch/powerpc/mm/hca.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7a5f8dbfbdd0..71e8f23d9a96 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1045,6 +1045,16 @@ config PPC_SECVAR_SYSFS
 	  read/write operations on these variables. Say Y if you have
 	  secure boot enabled and want to expose variables to userspace.
 
+config PPC_HCA_HOTNESS
+	prompt "PowerPC HCA engine based page hotness"
+	def_bool y
+	select ARCH_HAS_PAGE_AGING
+	depends on PPC_BOOK3S_64
+	help
+	  Use the HCA engine to find page hotness.
+
+	  If unsure, say N.
+
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/powerpc/include/asm/hca.h b/arch/powerpc/include/asm/hca.h
new file mode 100644
index 000000000000..c0ed380594ca
--- /dev/null
+++ b/arch/powerpc/include/asm/hca.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/*
+ * Configuration helpers for the Hot-Cold Affinity (HCA) engine
+ */
+
+#ifndef _ASM_POWERPC_HCA_H
+#define _ASM_POWERPC_HCA_H
+
+#include <linux/types.h>
+
+struct hca_entry {
+	unsigned long count;
+	unsigned long prev_count;
+	uint8_t age;
+};
+
+static inline unsigned long hotness_score(struct hca_entry *entry)
+{
+	unsigned long hotness;
+
+#if 0
+	/*
+	 * Give more weight to prev_count because it carries the
+	 * historical values. Take a smaller share of count as the page
+	 * ages, because prev_count becomes a better approximation.
+	 * We still need to consider count to accommodate spikes in access.
+	 * The + 1 with age handles age == 0.
+	 */
+	hotness = entry->prev_count + (entry->count / (entry->age + 1));
+#else
+	/* Since real workloads rarely show pages with very high hotness,
+	 * a decay essentially moves the count value to prev_count.
+	 * At that point decay can be seen as periodic zeroing of the counter.
+	 * The hotness score below gives better results with real workloads.
+	 */
+	hotness = entry->prev_count + entry->count;
+#endif
+
+	return hotness;
+}
+
+extern void (*hca_backend_node_debugfs_init)(int numa_node, struct dentry *node_dentry);
+extern void (*hca_backend_debugfs_init)(struct dentry *root_dentry);
+extern int  (*hca_pfn_entry)(unsigned long pfn, struct hca_entry *entry);
+extern bool (*hca_node_enabled)(int numa_node);
+extern int  (*hca_clear_entry)(unsigned long pfn);
+
+#endif /* _ASM_POWERPC_HCA_H */
diff --git a/arch/powerpc/include/asm/page_aging.h b/arch/powerpc/include/asm/page_aging.h
new file mode 100644
index 000000000000..0d98cd877308
--- /dev/null
+++ b/arch/powerpc/include/asm/page_aging.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _ASM_POWERPC_PAGE_AGING_H_
+#define _ASM_POWERPC_PAGE_AGING_H_
+
+#ifdef CONFIG_LRU_GEN
+extern bool hca_lru_age;
+unsigned long hca_map_lru_seq(struct lruvec *lruvec, struct folio *folio);
+bool hca_try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+			    int scan_priority, bool can_swap, bool force_scan);
+
+#define arch_supports_page_access_count arch_supports_page_access_count
+static inline bool arch_supports_page_access_count(void)
+{
+	return hca_lru_age;
+}
+
+#define arch_try_to_inc_max_seq	arch_try_to_inc_max_seq
+static inline bool arch_try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+					   int scan_priority, bool can_swap,
+					   bool force_scan)
+{
+	return hca_try_to_inc_max_seq(lruvec, max_seq, scan_priority,
+				      can_swap, force_scan);
+
+}
+
+#define arch_get_lru_gen_seq	arch_get_lru_gen_seq
+static inline unsigned long arch_get_lru_gen_seq(struct lruvec *lruvec, struct folio *folio)
+{
+	return hca_map_lru_seq(lruvec, folio);
+}
+
+#endif /* CONFIG_LRU_GEN */
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 503a6e249940..30bd4ad4aff0 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
 obj-$(CONFIG_PTDUMP_CORE)	+= ptdump/
 obj-$(CONFIG_KASAN)		+= kasan/
+obj-$(CONFIG_PPC_HCA_HOTNESS)	+= hca.o
diff --git a/arch/powerpc/mm/hca.c b/arch/powerpc/mm/hca.c
new file mode 100644
index 000000000000..af6de4492ead
--- /dev/null
+++ b/arch/powerpc/mm/hca.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/debugfs.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/page_aging.h>
+
+#include <asm/hca.h>
+
+bool hca_lru_age;
+static struct dentry *hca_debugfs_root;
+/*
+ * Percentage of pfns to scan from each lruvec list to determine max/min hotness
+ */
+static ulong scan_pfn_ratio __read_mostly = 20;
+/*
+ * Milliseconds to wait/skip before starting another scan
+ */
+static ulong scan_skip_msec __read_mostly = 60;
+
+/* backend callbacks  */
+void (*hca_backend_node_debugfs_init)(int numa_node, struct dentry *node_dentry);
+void (*hca_backend_debugfs_init)(struct dentry *root_dentry);
+int  (*hca_pfn_entry)(unsigned long pfn, struct hca_entry *entry);
+bool (*hca_node_enabled)(int numa_node);
+int  (*hca_clear_entry)(unsigned long pfn);
+
+static int parse_hca_age(char *arg)
+{
+	return strtobool(arg, &hca_lru_age);
+}
+early_param("hca_age", parse_hca_age);
+
+static inline int folio_hca_entry(struct folio *folio, struct hca_entry *entry)
+{
+	return hca_pfn_entry(folio_pfn(folio), entry);
+}
+
+#ifdef CONFIG_LRU_GEN
+static inline int get_nr_gens(struct lruvec *lruvec, int type)
+{
+	return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
+}
+
+/* FIXME!! */
+static inline bool folio_evictable(struct folio *folio)
+{
+	bool ret;
+
+	/* Prevent address_space of inode and swap cache from being freed */
+	rcu_read_lock();
+	ret = !mapping_unevictable(folio_mapping(folio)) &&
+		!folio_test_mlocked(folio);
+	rcu_read_unlock();
+	return ret;
+}
+
+static void restablish_hotness_range(struct lruvec *lruvec)
+{
+	bool youngest = true;
+	int gen, nr_pages;
+	unsigned long seq;
+	int new_scan_pfn_count;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	unsigned long current_hotness, max_hotness = 0, min_hotness = 0;
+
+	if (time_is_after_jiffies64(lrugen->next_span_scan))
+		return;
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+retry:
+	for (int type = 0; type < ANON_AND_FILE; type++) {
+		for (int zone = 0; zone < MAX_NR_ZONES; zone++) {
+			int index = 0;
+			struct list_head *head;
+			struct folio *folio;
+			struct hca_entry entry;
+
+			if (youngest)
+				seq = lrugen->max_seq;
+			else
+				seq = lrugen->min_seq[type];
+			gen = lru_gen_from_seq(seq);
+			nr_pages = lrugen->nr_pages[gen][type][zone];
+
+			new_scan_pfn_count = nr_pages * scan_pfn_ratio/100;
+			if (!new_scan_pfn_count)
+				new_scan_pfn_count = nr_pages;
+
+			head = &lrugen->lists[gen][type][zone];
+			list_for_each_entry(folio, head, lru) {
+
+				if (unlikely(!folio_evictable(folio)))
+					continue;
+
+				if (folio_hca_entry(folio, &entry))
+					continue;
+
+				if (index++ > new_scan_pfn_count)
+					break;
+
+				current_hotness = hotness_score(&entry);
+				/* If the page didn't see any access, skip it */
+				if (!current_hotness)
+					continue;
+				/*
+				 * Make sure we wait for at least one decay
+				 * update before using this pfn for
+				 * max/min computation.
+				 */
+				if (entry.age < 1)
+					continue;
+
+				if (current_hotness > max_hotness)
+					max_hotness = (current_hotness + max_hotness) / 2;
+				else if ((current_hotness < min_hotness) || !min_hotness)
+					min_hotness = (current_hotness + min_hotness) / 2;
+				else if ((current_hotness - min_hotness) < (max_hotness - min_hotness) / 2)
+					min_hotness = (current_hotness + min_hotness) / 2;
+				else
+					max_hotness = (current_hotness + max_hotness) / 2;
+
+			}
+
+		}
+	}
+	if (youngest) {
+		/* compute with oldest generation */
+		youngest = false;
+		goto retry;
+	}
+	lrugen->next_span_scan = get_jiffies_64() + msecs_to_jiffies(scan_skip_msec);
+	if (min_hotness) {
+		lrugen->max_hotness	=  max_hotness;
+		lrugen->min_hotness	=  min_hotness;
+	}
+
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+/* Return Multigen LRU generation based on folio hotness */
+unsigned long hca_map_lru_seq(struct lruvec *lruvec, struct folio *folio)
+{
+	unsigned long seq;
+	int  type, nr_gens;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+	struct hca_entry folio_entry;
+	unsigned long hotness, seq_range;
+
+	type = folio_is_file_lru(folio);
+	if (!hca_lru_age || folio_hca_entry(folio, &folio_entry))
+		/* no hotness data, return the oldest generation (or should it be the youngest?) */
+		return lrugen->min_seq[type];
+
+	hotness = hotness_score(&folio_entry);
+	/* The page didn't see any access, return oldest generation */
+	if (!hotness)
+		return lrugen->min_seq[type];
+
+	/* Also adjust based on current value. */
+	if (hotness > lrugen->max_hotness) {
+		lrugen->max_hotness =  (hotness + lrugen->max_hotness) / 2;
+		return lrugen->max_seq;
+	} else if (hotness < lrugen->min_hotness) {
+		lrugen->min_hotness =  (hotness + lrugen->min_hotness) / 2;
+		return lrugen->min_seq[type];
+	}
+
+	/*
+	 * Split the max/min hotness range into one bucket per generation.
+	 * Then place the current hotness into one of these buckets and
+	 * use the bucket number as an increment over the oldest generation.
+	 */
+	/* inclusive range min and max */
+	seq_range =  lrugen->max_hotness  - lrugen->min_hotness + 1;
+	nr_gens = get_nr_gens(lruvec, type);
+	seq_range =  (seq_range + nr_gens  - 1)/nr_gens;
+
+	/* the higher the hotness, the younger the generation */
+	seq = lrugen->min_seq[type] + ((hotness - lrugen->min_hotness)/seq_range);
+
+	return seq;
+}
+
+bool hca_try_to_inc_max_seq(struct lruvec *lruvec,
+				   unsigned long max_seq, int scan_priority,
+				   bool can_swap, bool force_scan)
+
+{
+	bool success = false;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+	/* see the comment in iterate_mm_list() */
+	if (lruvec->seq_update_progress)
+		success = false;
+	else {
+		spin_lock_irq(&lruvec->lru_lock);
+
+		if (max_seq != lrugen->max_seq)
+			goto done;
+
+		if (lruvec->seq_update_progress)
+			goto done;
+
+		success = true;
+		lruvec->seq_update_progress = true;
+done:
+		spin_unlock_irq(&lruvec->lru_lock);
+	}
+	if (!success) {
+		if (scan_priority <= DEF_PRIORITY - 2)
+			wait_event_killable(lruvec->seq_update_wait,
+					    max_seq < READ_ONCE(lrugen->max_seq));
+
+		return max_seq < READ_ONCE(lrugen->max_seq);
+	}
+
+	/*
+	 * With hardware aging, use the counters to update the
+	 * lruvec max and min hotness.
+	 */
+	restablish_hotness_range(lruvec);
+
+	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+	inc_max_seq(lruvec, can_swap, force_scan);
+	/* either this sees any waiters or they will see updated max_seq */
+	if (wq_has_sleeper(&lruvec->seq_update_wait))
+		wake_up_all(&lruvec->seq_update_wait);
+
+	return success;
+}
+#endif /* CONFIG_LRU_GEN */
+
+static void hca_debugfs_init(void)
+{
+	int node;
+	char name[32];
+	struct dentry *node_dentry;
+
+	hca_debugfs_root = debugfs_create_dir("hca", arch_debugfs_dir);
+
+	for_each_online_node(node) {
+		snprintf(name, sizeof(name), "node%u", node);
+		node_dentry = debugfs_create_dir(name, hca_debugfs_root);
+
+		hca_backend_node_debugfs_init(node, node_dentry);
+	}
+
+	debugfs_create_ulong("scan-pfn-ratio", 0600, hca_debugfs_root,
+			     &scan_pfn_ratio);
+	debugfs_create_ulong("scan-skip-msec", 0600, hca_debugfs_root,
+			     &scan_skip_msec);
+	debugfs_create_bool("hca_lru_age", 0600, hca_debugfs_root,
+			    &hca_lru_age);
+
+	/* Now create the backend debugfs entries */
+	hca_backend_debugfs_init(hca_debugfs_root);
+}
+
+static int __init hca_init(void)
+{
+	if (!hca_backend_debugfs_init) {
+		pr_info("No HCA device registered. Disabling hca lru gen\n");
+		hca_lru_age = false;
+	}
+
+	hca_debugfs_init();
+	return 0;
+}
+
+late_initcall(hca_init);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0bcc5d88239a..934ad587a558 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -425,6 +425,11 @@ struct lru_gen_struct {
 	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	/* whether the multi-gen LRU is enabled */
 	bool enabled;
+#ifndef CONFIG_LRU_TASK_PAGE_AGING
+	unsigned long max_hotness;
+	unsigned long min_hotness;
+	u64 next_span_scan;
+#endif
 };
 
diff --git a/include/linux/page_aging.h b/include/linux/page_aging.h
index d7c63ce0d824..074c876f17e1 100644
--- a/include/linux/page_aging.h
+++ b/include/linux/page_aging.h
@@ -3,6 +3,10 @@
 #ifndef _LINUX_PAGE_AGING_H
 #define _LINUX_PAGE_AGING_H
 
+#ifdef CONFIG_ARCH_HAS_PAGE_AGING
+#include <asm/page_aging.h>
+#endif
+
 #ifndef arch_supports_page_access_count
 static inline bool arch_supports_page_access_count(void)
 {
@@ -14,6 +18,7 @@ static inline bool arch_supports_page_access_count(void)
 bool __try_to_inc_max_seq(struct lruvec *lruvec,
 			  unsigned long max_seq, int scan_priority,
 			  bool can_swap, bool force_scan);
+void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan);
 
 #ifndef arch_get_lru_gen_seq
 static inline unsigned long arch_get_lru_gen_seq(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..493709ac758e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1181,6 +1181,10 @@ config LRU_GEN_STATS
 	  from evicted generations for debugging purpose.
 
 	  This option has a per-memcg and per-node memory overhead.
+
+config ARCH_HAS_PAGE_AGING
+	bool
+
 # }
 
 source "mm/damon/Kconfig"
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c8b98201f0b0..a5f6238b3926 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4362,7 +4362,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
 	return success;
 }
 
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
 {
 	int prev, next;
 	int type, zone;
@@ -4420,6 +4420,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
 #endif
 	spin_unlock_irq(&lruvec->lru_lock);
 }
+
 #ifdef CONFIG_LRU_TASK_PAGE_AGING
 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 			       int scan_priority, bool can_swap, bool force_scan)
@@ -5861,7 +5862,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
 		seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
 	}
 
-	seq_printf(m, " node %5d\n", nid);
+	seq_printf(m, " node %5d max_hotness %lu min_hotness %lu\n", nid, lrugen->max_hotness, lrugen->min_hotness);
 
 	if (!full)
 		seq = min_seq[LRU_GEN_ANON];
-- 
2.39.2




