Use the IBS (Instruction Based Sampling) feature present in AMD
processors for memory access tracking. The access information
obtained from IBS will be used in subsequent patches to drive
NUMA balancing.

An NMI handler is registered to obtain the IBS data. The handler
does not do much yet: it just filters out the non-useful samples
and collects some stats. This patch only builds the framework;
IBS execution sampling is enabled in a subsequent patch.

TODOs
-----
1. Perf also uses IBS. For the purpose of this prototype just
   disable the use of IBS in perf. This needs to be done cleanly.
2. Only the required MSR bits are defined here.

About IBS
---------
IBS can be programmed to provide data about instruction
execution periodically. This is done by programming a desired
sample count (number of ops) in a control register. When the
programmed number of ops has been dispatched, a micro-op gets
tagged, various information about the tagged micro-op's
execution is populated in the IBS execution MSRs and an
interrupt is raised.

While IBS provides a lot of data for each sample, for the
purpose of memory access profiling we are interested in the
linear and physical addresses of the memory accesses that
reached DRAM. Recent AMD processors provide further filtering
where it is possible to limit the sampling to those ops that
had an L3 miss, which greatly reduces the number of non-useful
samples.

While IBS provides the capability to sample both instruction
fetch and execution, only IBS execution sampling is used here
to collect data about memory accesses that occur during
instruction execution.

More information about IBS is available in Sec 13.3 of
AMD64 Architecture Programmer's Manual, Volume 2: System
Programming which is present at:
https://bugzilla.kernel.org/attachment.cgi?id=288923

Information about MSRs used for programming IBS can be
found in Sec 2.1.14.4 of PPR Vol 1 for AMD Family 19h
Model 11h B1 which is currently present at:
https://www.amd.com/system/files/TechDocs/55901_0.25.zip

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
 arch/x86/events/amd/ibs.c        |   6 +
 arch/x86/include/asm/msr-index.h |  12 ++
 arch/x86/mm/Makefile             |   1 +
 arch/x86/mm/ibs.c                | 193 +++++++++++++++++++++++++++++++
 include/linux/vm_event_item.h    |  11 ++
 mm/vmstat.c                      |  11 ++
 6 files changed, 234 insertions(+)
 create mode 100644 arch/x86/mm/ibs.c

diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index da3f5ebac4e1..290e6d221844 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -1512,6 +1512,12 @@ static __init int amd_ibs_init(void)
 {
 	u32 caps;
 
+	/*
+	 * TODO: Find a clean way to disable perf IBS so that IBS
+	 * can be used for NUMA balancing.
+	 */
+	return 0;
+
 	caps = __get_ibs_caps();
 	if (!caps)
 		return -ENODEV;	/* ibs not supported by the cpu */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 37ff47552bcb..443d4cf73366 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -593,6 +593,18 @@
 /* AMD Last Branch Record MSRs */
 #define MSR_AMD64_LBR_SELECT			0xc000010e
 
+/* AMD IBS MSR bits */
+#define MSR_AMD64_IBSOPDATA2_DATASRC			0x7
+#define MSR_AMD64_IBSOPDATA2_DATASRC_DRAM		0x3
+#define MSR_AMD64_IBSOPDATA2_DATASRC_FAR_CCX_CACHE	0x5
+
+#define MSR_AMD64_IBSOPDATA3_LDOP		BIT_ULL(0)
+#define MSR_AMD64_IBSOPDATA3_STOP		BIT_ULL(1)
+#define MSR_AMD64_IBSOPDATA3_DCMISS		BIT_ULL(7)
+#define MSR_AMD64_IBSOPDATA3_LADDR_VALID	BIT_ULL(17)
+#define MSR_AMD64_IBSOPDATA3_PADDR_VALID	BIT_ULL(18)
+#define MSR_AMD64_IBSOPDATA3_L2MISS		BIT_ULL(20)
+
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
 
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c80febc44cd2..e74b95a57d86 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -27,6 +27,7 @@ endif
 obj-y				:=  init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
 				    pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o
 
+obj-$(CONFIG_NUMA_BALANCING)	+= ibs.o
 obj-y				+= pat/
 
 # Make sure __phys_addr has no stackprotector
diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
new file mode 100644
index 000000000000..411dba2a88d1
--- /dev/null
+++ b/arch/x86/mm/ibs.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+
+#include <asm/nmi.h>
+#include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
+#include <asm/apic.h>
+
+static u64 ibs_config __read_mostly;
+
+/*
+ * NMI handler for IBS op samples. For now it only filters out the
+ * non-useful samples and accounts each one in a vmstat counter; the
+ * addresses read at the end are not consumed yet.
+ */
+static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
+{
+	u64 ops_ctl, ops_data3, ops_data2;
+	u64 remote_access;
+	u64 laddr = -1, paddr = -1;
+	struct mm_struct *mm = current->mm;
+
+	rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);
+
+	/*
+	 * When IBS sampling period is reprogrammed via read-modify-update
+	 * of MSR_AMD64_IBSOPCTL, overflow NMIs could be generated with
+	 * IBS_OP_ENABLE not set. For such cases, return as HANDLED.
+	 *
+	 * With this, the handler will say "handled" for all NMIs that
+	 * aren't related to this NMI. This stems from the limitation of
+	 * having both status and control bits in one MSR.
+	 */
+	if (!(ops_ctl & IBS_OP_VAL))
+		goto handled;
+
+	wrmsrl(MSR_AMD64_IBSOPCTL, ops_ctl & ~IBS_OP_VAL);
+
+	count_vm_event(IBS_NR_EVENTS);
+
+	if (!mm) {
+		count_vm_event(IBS_KTHREAD);
+		goto handled;
+	}
+
+	rdmsrl(MSR_AMD64_IBSOPDATA3, ops_data3);
+
+	/* Load/Store ops only */
+	if (!(ops_data3 & (MSR_AMD64_IBSOPDATA3_LDOP |
+			   MSR_AMD64_IBSOPDATA3_STOP))) {
+		count_vm_event(IBS_NON_LOAD_STORES);
+		goto handled;
+	}
+
+	/* Discard the sample if neither a DC miss nor an L2 miss is flagged */
+	if (!(ops_data3 & (MSR_AMD64_IBSOPDATA3_DCMISS |
+			   MSR_AMD64_IBSOPDATA3_L2MISS))) {
+		count_vm_event(IBS_DC_L2_HITS);
+		goto handled;
+	}
+
+	rdmsrl(MSR_AMD64_IBSOPDATA2, ops_data2);
+	remote_access = ops_data2 & MSR_AMD64_IBSOPDATA2_DATASRC;
+
+	/* Consider only DRAM accesses, exclude cache accesses from near ccx */
+	if (remote_access < MSR_AMD64_IBSOPDATA2_DATASRC_DRAM) {
+		count_vm_event(IBS_NEAR_CACHE_HITS);
+		goto handled;
+	}
+
+	/* Exclude hits from peer cache in far ccx */
+	if (remote_access == MSR_AMD64_IBSOPDATA2_DATASRC_FAR_CCX_CACHE) {
+		count_vm_event(IBS_FAR_CACHE_HITS);
+		goto handled;
+	}
+
+	/* Is linear addr valid? */
+	if (ops_data3 & MSR_AMD64_IBSOPDATA3_LADDR_VALID)
+		rdmsrl(MSR_AMD64_IBSDCLINAD, laddr);
+	else {
+		count_vm_event(IBS_LADDR_INVALID);
+		goto handled;
+	}
+
+	/* Discard kernel address accesses */
+	if (laddr & (1UL << 63)) {
+		count_vm_event(IBS_KERNEL_ADDR);
+		goto handled;
+	}
+
+	/* Is phys addr valid? */
+	if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
+		rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
+	else
+		count_vm_event(IBS_PADDR_INVALID);
+
+handled:
+	return NMI_HANDLED;
+}
+
+/* Read the IBS LVT offset from IBSCTL; -EINVAL if it is not marked valid */
+static inline int get_ibs_lvt_offset(void)
+{
+	u64 val;
+
+	rdmsrl(MSR_AMD64_IBSCTL, val);
+	if (!(val & IBSCTL_LVT_OFFSET_VALID))
+		return -EINVAL;
+
+	return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+/* Program this CPU's IBS LVT entry for NMI delivery, warning on failure */
+static void setup_APIC_ibs(void)
+{
+	int offset;
+
+	offset = get_ibs_lvt_offset();
+	if (offset < 0)
+		goto failed;
+
+	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+		return;
+failed:
+	pr_warn("IBS APIC setup failed on cpu #%d\n",
+		smp_processor_id());
+}
+
+/* Reprogram this CPU's IBS LVT entry as fixed delivery with mask set */
+static void clear_APIC_ibs(void)
+{
+	int offset;
+
+	offset = get_ibs_lvt_offset();
+	if (offset >= 0)
+		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int x86_amd_ibs_access_profile_startup(unsigned int cpu)
+{
+	setup_APIC_ibs();
+	return 0;
+}
+
+static int x86_amd_ibs_access_profile_teardown(unsigned int cpu)
+{
+	clear_APIC_ibs();
+	return 0;
+}
+
+/*
+ * Register the IBS NMI handler and the CPU hotplug callbacks that program
+ * the per-CPU IBS LVT entry. Returns 0 on success or when IBS is absent,
+ * a negative errno if either registration fails.
+ */
+int __init ibs_access_profiling_init(void)
+{
+	u32 caps;
+	int ret;
+
+	ibs_config = IBS_OP_CNT_CTL | IBS_OP_ENABLE;
+
+	if (!boot_cpu_has(X86_FEATURE_IBS)) {
+		pr_info("IBS capability is unavailable for access profiling\n");
+		return 0;
+	}
+
+	caps = cpuid_eax(IBS_CPUID_FEATURES);
+	if (caps & IBS_CAPS_ZEN4)
+		ibs_config |= IBS_OP_L3MISSONLY;
+
+	ret = register_nmi_handler(NMI_LOCAL, ibs_overflow_handler, 0, "ibs");
+	if (ret) {
+		pr_warn("IBS NMI handler registration failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
+				"x86/amd/ibs_access_profile:starting",
+				x86_amd_ibs_access_profile_startup,
+				x86_amd_ibs_access_profile_teardown);
+	if (ret < 0) {
+		/* Don't leave a handler that claims every local NMI behind */
+		unregister_nmi_handler(NMI_LOCAL, "ibs");
+		pr_warn("IBS CPU hotplug setup failed: %d\n", ret);
+		return ret;
+	}
+
+	pr_info("IBS access profiling setup for NUMA Balancing\n");
+	return 0;
+}
+
+arch_initcall(ibs_access_profiling_init);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..1d55e347d16c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -149,6 +149,17 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
 		DIRECT_MAP_LEVEL3_SPLIT,
+#ifdef CONFIG_NUMA_BALANCING
+		IBS_NR_EVENTS,
+		IBS_KTHREAD,
+		IBS_NON_LOAD_STORES,
+		IBS_DC_L2_HITS,
+		IBS_NEAR_CACHE_HITS,
+		IBS_FAR_CACHE_HITS,
+		IBS_LADDR_INVALID,
+		IBS_KERNEL_ADDR,
+		IBS_PADDR_INVALID,
+#endif
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..c7a9d0d9ade8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1398,6 +1398,17 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_X86
 	"direct_map_level2_splits",
 	"direct_map_level3_splits",
+#ifdef CONFIG_NUMA_BALANCING
+	"ibs_nr_events",
+	"ibs_kthread",
+	"ibs_non_load_stores",
+	"ibs_dc_l2_hits",
+	"ibs_near_cache_hits",
+	"ibs_far_cache_hits",
+	"ibs_invalid_laddr",
+	"ibs_kernel_addr",
+	"ibs_invalid_paddr",
+#endif
 #endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
-- 
2.25.1