Allow an initial delay before enabling the collection of IBS-provided
access info. The delay is set through a new debugfs knob,
access_faults_delay (in milliseconds, default 1000).

Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
 arch/x86/mm/ibs.c        | 18 ++++++++++++++++++
 include/linux/mm.h       |  2 ++
 include/linux/mm_types.h |  3 +++
 kernel/sched/debug.c     |  2 ++
 kernel/sched/fair.c      |  3 +++
 5 files changed, 28 insertions(+)

diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index a479029e9262..dfe5246954c0 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -16,6 +16,21 @@ struct ibs_access_work {
 	u64 laddr, paddr;
 };
 
+static bool delay_hw_access_profiling(struct mm_struct *mm)
+{
+	unsigned long delay, now = jiffies;
+
+	if (!mm->numa_hw_access_delay)
+		mm->numa_hw_access_delay = now +
+			msecs_to_jiffies(sysctl_numa_balancing_access_faults_delay);
+
+	delay = mm->numa_hw_access_delay;
+	if (time_before(now, delay))
+		return true;
+
+	return false;
+}
+
 void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr)
 {
 	u64 config = 0;
@@ -28,6 +43,9 @@ void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr)
 	if (!curr->mm)
 		goto out;
 
+	if (delay_hw_access_profiling(curr->mm))
+		goto out;
+
 	if (curr->numa_sample_period)
 		period = curr->numa_sample_period;
 	else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f857163ac89..118705a296ef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1397,6 +1397,8 @@ static inline int folio_nid(const struct folio *folio)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
+extern unsigned int sysctl_numa_balancing_access_faults_delay;
+
 /* page access time bits needs to hold at least 4 seconds */
 #define PAGE_ACCESS_TIME_MIN_BITS 12
 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9757067c3053..8a2fb8bf2d62 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -750,6 +750,9 @@ struct mm_struct {
 
 		/* numa_scan_seq prevents two threads remapping PTEs. */
 		int numa_scan_seq;
+
+		/* HW-provided access info is collected after this initial delay */
+		unsigned long numa_hw_access_delay;
 #endif
 		/*
 		 * An operation with batched TLB flushing is going on. Anything
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1cf19778a232..5c76a7594358 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -342,6 +342,8 @@ static __init int sched_init_debug(void)
 			   &sysctl_numa_balancing_sample_period_max);
 	debugfs_create_u32("access_faults_threshold", 0644, numa,
 			   &sysctl_numa_balancing_access_faults_threshold);
+	debugfs_create_u32("access_faults_delay", 0644, numa,
+			   &sysctl_numa_balancing_access_faults_delay);
 #endif
 
 	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b0665b034d0..2e2b1e706a24 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1097,6 +1097,7 @@ unsigned int sysctl_numa_balancing_sample_period_def = 10000;
 unsigned int sysctl_numa_balancing_sample_period_min = 5000;
 unsigned int sysctl_numa_balancing_sample_period_max = 20000;
 unsigned int sysctl_numa_balancing_access_faults_threshold = 250;
+unsigned int sysctl_numa_balancing_access_faults_delay = 1000;
 
 /*
  * Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -3189,6 +3190,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 		if (mm_users == 1) {
 			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 			mm->numa_scan_seq = 0;
+			mm->numa_hw_access_delay = jiffies +
+				msecs_to_jiffies(sysctl_numa_balancing_access_faults_delay);
 		}
 	}
 	p->node_stamp = 0;
-- 
2.25.1