Feed the page access data obtained from IBS to NUMA balancing as
hint fault equivalents. The existing per-task and per-group fault
stats are now built from IBS-provided page access information. With
this, it is no longer necessary to scan the address space to
introduce NUMA hinting faults.

Use the task_work framework to process the IBS-sampled data. The
actual programming of IBS to generate page access information isn't
done yet.

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
 arch/x86/mm/ibs.c             | 38 ++++++++++++++++++++++-
 include/linux/migrate.h       |  1 +
 include/linux/sched.h         |  1 +
 include/linux/vm_event_item.h |  1 +
 kernel/sched/fair.c           | 10 ++++
 mm/memory.c                   | 92 +++++++++++++++++++++++++++++++++++
 mm/vmstat.c                   |  1 +
 7 files changed, 143 insertions(+), 1 deletion(-)
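The hunks below are the authoritative change; here is only a condensed
sketch of the flow they add, with the MSR decoding, the address-validity
checks and the error paths elided, so it illustrates the NMI -> task_work
split rather than code to be applied:

	/* NMI context: just capture the sampled addresses, defer the real work. */
	static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
	{
		struct ibs_access_work *iwork;
		u64 laddr, paddr;	/* read from the IBS data MSRs, elided here */

		iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);	/* see the TODO below */
		if (!iwork)
			return NMI_HANDLED;

		iwork->laddr = laddr;
		iwork->paddr = paddr;
		init_task_work(&iwork->work, task_ibs_access_work);
		task_work_add(current, &iwork->work, TWA_RESUME);
		return NMI_HANDLED;
	}

	/* Task context, on return to user space: safe to take mmap_lock. */
	void task_ibs_access_work(struct callback_head *work)
	{
		struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
		u64 laddr = iwork->laddr;
		u64 paddr = iwork->paddr;

		kfree(iwork);
		do_numa_access(current, laddr, paddr);
	}

The NMI handler only records the linear/physical address pair of the
sampled access and defers everything that may take mmap_lock to task_work,
which runs in task context when the task returns to user space.
do_numa_access() then resolves the physical address to a struct page and
ends in task_numa_fault(), so IBS samples feed the same per-task and
per-group NUMA fault stats that the hinting-fault scanner populates today.
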
diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index 411dba2a88d1..adbc587b1767 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/init.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <asm/nmi.h>
 #include <asm/perf_event.h>	/* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
@@ -8,12 +10,30 @@
 
 static u64 ibs_config __read_mostly;
 
+struct ibs_access_work {
+	struct callback_head work;
+	u64 laddr, paddr;
+};
+
+void task_ibs_access_work(struct callback_head *work)
+{
+	struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
+	struct task_struct *p = current;
+
+	u64 laddr = iwork->laddr;
+	u64 paddr = iwork->paddr;
+
+	kfree(iwork);
+	do_numa_access(p, laddr, paddr);
+}
+
 static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
 {
 	u64 ops_ctl, ops_data3, ops_data2;
 	u64 remote_access;
 	u64 laddr = -1, paddr = -1;
 	struct mm_struct *mm = current->mm;
+	struct ibs_access_work *iwork;
 
 	rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);
 
@@ -86,8 +106,24 @@ static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
 	/* Is phys addr valid? */
 	if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
 		rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
-	else
+	else {
 		count_vm_event(IBS_PADDR_INVALID);
+		goto handled;
+	}
+
+	/*
+	 * TODO: GFP_ATOMIC allocation from NMI context isn't safe!
+	 */
+	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+	if (!iwork)
+		goto handled;
+
+	count_vm_event(IBS_USEFUL_SAMPLES);
+
+	iwork->laddr = laddr;
+	iwork->paddr = paddr;
+	init_task_work(&iwork->work, task_ibs_access_work);
+	task_work_add(current, &iwork->work, TWA_RESUME);
 
 handled:
 	return NMI_HANDLED;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3ef77f52a4f0..4dcce7885b0c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -216,6 +216,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
 void migrate_device_finalize(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr);
 
 #endif /* CONFIG_MIGRATION */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 853d08f7562b..19dd4ee07436 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2420,4 +2420,5 @@ static inline void sched_core_fork(struct task_struct *p) { }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+DECLARE_STATIC_KEY_FALSE(hw_access_hints);
 #endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 1d55e347d16c..2ccc7dee3c13 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -159,6 +159,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		IBS_LADDR_INVALID,
 		IBS_KERNEL_ADDR,
 		IBS_PADDR_INVALID,
+		IBS_USEFUL_SAMPLES,
 #endif
 #endif
 		NR_VM_EVENT_ITEMS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..c9b9e62da779 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
 #include <linux/psi.h>
 #include <linux/ratelimit.h>
 #include <linux/task_work.h>
+#include <linux/migrate.h>
 
 #include <asm/switch_to.h>
 
@@ -3125,6 +3126,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	}
 }
 
+DEFINE_STATIC_KEY_FALSE(hw_access_hints);
+
 /*
  * Drive the periodic memory faults..
  */
@@ -3133,6 +3136,13 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	struct callback_head *work = &curr->numa_work;
 	u64 period, now;
 
+	/*
+	 * If we are using access hints from the hardware (e.g. IBS),
+	 * don't scan the address space.
+	 */
+	if (static_branch_unlikely(&hw_access_hints))
+		return;
+
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
 	 */
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..79096aba197c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4668,6 +4668,98 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 	return mpol_misplaced(page, vma, addr);
 }
 
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by IBS) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	int flags = 0;
+
+	if (!mm)
+		return;
+
+	if (!mmap_read_trylock(mm))
+		return;
+
+	vma = find_vma(mm, laddr);
+	if (!vma)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	if (!vma_is_accessible(vma))
+		goto out_unlock;
+
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP */
+	if (PageCompound(page))
+		goto out_unlock;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	last_cpupid = page_cpupid_last(page);
+	page_nid = page_to_nid(page);
+
+	/*
+	 * For memory tiering mode, cpupid of slow memory page is used
+	 * to record page access time. So use default value.
+	 */
+	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+	    !node_is_toptier(page_nid))
+		last_cpupid = (-1 & LAST_CPUPID_MASK);
+	else
+		last_cpupid = page_cpupid_last(page);
+
+	target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7a9d0d9ade8..33738426ae48 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1408,6 +1408,7 @@ const char * const vmstat_text[] = {
 	"ibs_invalid_laddr",
 	"ibs_kernel_addr",
 	"ibs_invalid_paddr",
+	"ibs_useful_samples",
 #endif
 #endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
-- 
2.25.1