Program IBS for access profiling of threads from the task sched switch
path. IBS is programmed with a sample period that corresponds to the
incoming thread; kernel threads are excluded. The sample period is
currently kept at a fixed value of 10000.

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
 arch/x86/mm/ibs.c     | 27 +++++++++++++++++++++++++++
 include/linux/sched.h |  1 +
 kernel/sched/core.c   |  1 +
 kernel/sched/fair.c   |  1 +
 kernel/sched/sched.h  |  5 +++++
 5 files changed, 35 insertions(+)

diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index adbc587b1767..a479029e9262 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -8,6 +8,7 @@
 #include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
 #include <asm/apic.h>
 
+#define IBS_SAMPLE_PERIOD	10000
 static u64 ibs_config __read_mostly;
 
 struct ibs_access_work {
@@ -15,6 +16,31 @@ struct ibs_access_work {
 	u64 laddr, paddr;
 };
 
+void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+	u64 config = 0;
+	unsigned int period;
+
+	if (!static_branch_unlikely(&hw_access_hints))
+		return;
+
+	/* Disable IBS for kernel thread */
+	if (!curr->mm)
+		goto out;
+
+	if (curr->numa_sample_period)
+		period = curr->numa_sample_period;
+	else
+		period = IBS_SAMPLE_PERIOD;
+
+
+	config = (period >> 4) & IBS_OP_MAX_CNT;
+	config |= (period & IBS_OP_MAX_CNT_EXT_MASK);
+	config |= ibs_config;
+out:
+	wrmsrl(MSR_AMD64_IBSOPCTL, config);
+}
+
 void task_ibs_access_work(struct callback_head *work)
 {
 	struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
@@ -198,6 +224,7 @@ int __init ibs_access_profiling_init(void)
 				  x86_amd_ibs_access_profile_startup,
 				  x86_amd_ibs_access_profile_teardown);
 
+	static_branch_enable(&hw_access_hints);
 	pr_info("IBS access profiling setup for NUMA Balancing\n");
 	return 0;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 19dd4ee07436..66c532418d38 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1254,6 +1254,7 @@ struct task_struct {
 	int				numa_scan_seq;
 	unsigned int			numa_scan_period;
 	unsigned int			numa_scan_period_max;
+	unsigned int			numa_sample_period;
 	int				numa_preferred_nid;
 	unsigned long			numa_migrate_retry;
 	/* Migration stamp: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e838feb6adc5..1c13fed8bebc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5165,6 +5165,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = READ_ONCE(prev->__state);
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	hw_access_sched_in(prev, current);
 	finish_task(prev);
 	tick_nohz_task_switch();
 	finish_lock_switch(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c9b9e62da779..3f617c799821 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3094,6 +3094,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	p->node_stamp			= 0;
 	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
 	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
+	p->numa_sample_period		= 0;
 	p->numa_migrate_retry		= 0;
 	/* Protect against double add, see task_tick_numa and task_numa_work */
 	p->numa_work.next		= &p->numa_work;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..953d16c802d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1723,11 +1723,16 @@ extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *p, struct task_struct *t,
 			int cpu, int scpu);
 extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr);
 #else
 static inline void
 init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 {
 }
+static inline void hw_access_sched_in(struct task_struct *prev,
+				      struct task_struct *curr)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
-- 
2.25.1
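
As an aside, here is a minimal user-space sketch (illustration only, not
kernel code) of the sample-period encoding that hw_access_sched_in()
performs: bits [19:4] of the period land in the IbsOpMaxCnt field
(IBS_OP_MAX_CNT) and bits [26:20] are carried unshifted via
IBS_OP_MAX_CNT_EXT_MASK. The macro values mirror asm/perf_event.h;
ibs_encode_period() is a made-up helper used only for this example.

/* Illustration only: mirrors the 'config' computation in hw_access_sched_in(). */
#include <stdio.h>
#include <stdint.h>

#define IBS_OP_MAX_CNT		0x0000FFFFULL		/* IbsOpMaxCnt[15:0] = period[19:4] */
#define IBS_OP_MAX_CNT_EXT_MASK	(0x7FULL << 20)		/* IbsOpMaxCnt[26:20] = period[26:20] */

static uint64_t ibs_encode_period(unsigned int period)
{
	uint64_t config;

	config  = (period >> 4) & IBS_OP_MAX_CNT;	/* low 4 bits of the period are dropped */
	config |= period & IBS_OP_MAX_CNT_EXT_MASK;	/* extended count bits stay in place */
	return config;
}

int main(void)
{
	unsigned int period = 10000;	/* IBS_SAMPLE_PERIOD from the patch */

	/* 10000 >> 4 = 625 (0x271), so the hardware counts 625 * 16 = 10000 ops per sample */
	printf("period %u -> IBS_OP_CTL max-count bits 0x%llx\n",
	       period, (unsigned long long)ibs_encode_period(period));
	return 0;
}

On real hardware the kernel additionally ORs in ibs_config (enable bits
etc.) before writing MSR_AMD64_IBSOPCTL, as the patch does.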