Feed the page access data obtained from IBS to NUMA balancing as
hint fault equivalents. The existing per-task and per-group fault
stats are now built from IBS-provided page access information. With
this, it is no longer necessary to scan the address space to
introduce NUMA hinting faults.

Use the task_work framework to process the IBS-sampled data. The
actual programming of IBS to generate page access information isn't
done yet.

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
 arch/x86/mm/ibs.c             | 38 ++++++++++++++++++++++-
 include/linux/migrate.h       |  1 +
 include/linux/sched.h         |  1 +
 include/linux/vm_event_item.h |  1 +
 kernel/sched/fair.c           | 10 ++++
 mm/memory.c                   | 92 +++++++++++++++++++++++++++++++++++
 mm/vmstat.c                   |  1 +
 7 files changed, 143 insertions(+), 1 deletion(-)
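The hunks below are the authoritative change; here is only a condensed
sketch of the flow they add, with the MSR decoding, the address-validity
checks and the error paths elided, so it illustrates the NMI -> task_work
split rather than code to be applied:

	/* NMI context: just capture the sampled addresses, defer the real work. */
	static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
	{
		struct ibs_access_work *iwork;
		u64 laddr, paddr;	/* read from the IBS data MSRs, elided here */

		iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);	/* see the TODO below */
		if (!iwork)
			return NMI_HANDLED;

		iwork->laddr = laddr;
		iwork->paddr = paddr;
		init_task_work(&iwork->work, task_ibs_access_work);
		task_work_add(current, &iwork->work, TWA_RESUME);
		return NMI_HANDLED;
	}

	/* Task context, on return to user space: safe to take mmap_lock. */
	void task_ibs_access_work(struct callback_head *work)
	{
		struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
		u64 laddr = iwork->laddr;
		u64 paddr = iwork->paddr;

		kfree(iwork);
		do_numa_access(current, laddr, paddr);
	}

The NMI handler only records the linear/physical address pair of the
sampled access and defers everything that may take mmap_lock to task_work,
which runs in task context when the task returns to user space.
do_numa_access() then resolves the physical address to a struct page and
ends in task_numa_fault(), so IBS samples feed the same per-task and
per-group NUMA fault stats that the hinting-fault scanner populates today.
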
diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index 411dba2a88d1..adbc587b1767 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/init.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <asm/nmi.h>
 #include <asm/perf_event.h>	/* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
@@ -8,12 +10,30 @@
 
 static u64 ibs_config __read_mostly;
 
+struct ibs_access_work {
+	struct callback_head work;
+	u64 laddr, paddr;
+};
+
+void task_ibs_access_work(struct callback_head *work)
+{
+	struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
+	struct task_struct *p = current;
+
+	u64 laddr = iwork->laddr;
+	u64 paddr = iwork->paddr;
+
+	kfree(iwork);
+	do_numa_access(p, laddr, paddr);
+}
+
 static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
 {
 	u64 ops_ctl, ops_data3, ops_data2;
 	u64 remote_access;
 	u64 laddr = -1, paddr = -1;
 	struct mm_struct *mm = current->mm;
+	struct ibs_access_work *iwork;
 
 	rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);
 
@@ -86,8 +106,24 @@ static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
 	/* Is phys addr valid? */
 	if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
 		rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
-	else
+	else {
 		count_vm_event(IBS_PADDR_INVALID);
+		goto handled;
+	}
+
+	/*
+	 * TODO: GFP_ATOMIC allocation from NMI context isn't safe!
+	 */
+	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+	if (!iwork)
+		goto handled;
+
+	count_vm_event(IBS_USEFUL_SAMPLES);
+
+	iwork->laddr = laddr;
+	iwork->paddr = paddr;
+	init_task_work(&iwork->work, task_ibs_access_work);
+	task_work_add(current, &iwork->work, TWA_RESUME);
 
 handled:
 	return NMI_HANDLED;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3ef77f52a4f0..4dcce7885b0c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -216,6 +216,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
 void migrate_device_finalize(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr);
 
 #endif /* CONFIG_MIGRATION */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 853d08f7562b..19dd4ee07436 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2420,4 +2420,5 @@ static inline void sched_core_fork(struct task_struct *p) { }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+DECLARE_STATIC_KEY_FALSE(hw_access_hints);
 #endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 1d55e347d16c..2ccc7dee3c13 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -159,6 +159,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		IBS_LADDR_INVALID,
 		IBS_KERNEL_ADDR,
 		IBS_PADDR_INVALID,
+		IBS_USEFUL_SAMPLES,
 #endif
 #endif
 		NR_VM_EVENT_ITEMS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..c9b9e62da779 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
 #include <linux/psi.h>
 #include <linux/ratelimit.h>
 #include <linux/task_work.h>
+#include <linux/migrate.h>
 
 #include <asm/switch_to.h>
 
@@ -3125,6 +3126,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	}
 }
 
+DEFINE_STATIC_KEY_FALSE(hw_access_hints);
+
 /*
  * Drive the periodic memory faults..
  */
@@ -3133,6 +3136,13 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	struct callback_head *work = &curr->numa_work;
 	u64 period, now;
 
+	/*
+	 * If we are using access hints from the hardware (e.g. IBS),
+	 * don't scan the address space.
+	 */
+	if (static_branch_unlikely(&hw_access_hints))
+		return;
+
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
 	 */
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..79096aba197c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4668,6 +4668,98 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 	return mpol_misplaced(page, vma, addr);
 }
 
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by IBS) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	int flags = 0;
+
+	if (!mm)
+		return;
+
+	if (!mmap_read_trylock(mm))
+		return;
+
+	vma = find_vma(mm, laddr);
+	if (!vma)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	if (!vma_is_accessible(vma))
+		goto out_unlock;
+
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP */
+	if (PageCompound(page))
+		goto out_unlock;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	last_cpupid = page_cpupid_last(page);
+	page_nid = page_to_nid(page);
+
+	/*
+	 * For memory tiering mode, cpupid of slow memory page is used
+	 * to record page access time. So use default value.
+	 */
+	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+	    !node_is_toptier(page_nid))
+		last_cpupid = (-1 & LAST_CPUPID_MASK);
+	else
+		last_cpupid = page_cpupid_last(page);
+
+	target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7a9d0d9ade8..33738426ae48 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1408,6 +1408,7 @@ const char * const vmstat_text[] = {
 	"ibs_invalid_laddr",
 	"ibs_kernel_addr",
 	"ibs_invalid_paddr",
+	"ibs_useful_samples",
 #endif
 #endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
-- 
2.25.1