Optimizations are based on the history of PIDs accessing a VMA.

- Increase tasks' access-history window (PeterZ) from 2 to 4.
  (This patch is from Peter Zijlstra <peterz@xxxxxxxxxxxxx>.)

Idea: a task is allowed to scan a VMA if:
- the VMA was very recently accessed, as indicated by the latest
  access-PID information (hot VMA), or
- the VMA is shared by more than 2 tasks. Here the whole history of
  the VMA's access PIDs is considered, using bitmap_weight().

Signed-off-by: Raghavendra K T <raghavendra.kt@xxxxxxx>
---
I will split the patchset and post it if we find this patchset useful
going further. The first patch is from PeterZ.

 include/linux/mm.h       | 12 ++++++---
 include/linux/mm_types.h | 11 +++++---
 kernel/sched/fair.c      | 58 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..1bf1df064b60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1744,10 +1744,14 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 	unsigned int pid_bit;
-
-	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
-		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
+	unsigned long *pids, pid_idx;
+
+	if (vma->numab_state) {
+		pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+		pid_idx = READ_ONCE(vma->numab_state->pids_active_idx);
+		pids = vma->numab_state->pids_active + pid_idx;
+		if (!test_bit(pid_bit, pids))
+			__set_bit(pid_bit, pids);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..050ceef1e9d5 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -574,6 +574,7 @@ struct vma_lock {
 	struct rw_semaphore lock;
 };
 
+#define NR_ACCESS_PID_HIST	4
 struct vma_numab_state {
 	/*
 	 * Initialised as time in 'jiffies' after which VMA
@@ -588,17 +589,21 @@ struct vma_numab_state {
 	 */
 	unsigned long pids_active_reset;
 
+	/* Points to current active PID tracking index. */
+	unsigned long pids_active_idx;
+
 	/*
 	 * Approximate tracking of PIDs that trapped a NUMA hinting
 	 * fault. May produce false positives due to hash collisions.
 	 *
-	 *	[0] Previous PID tracking
-	 *	[1] Current PID tracking
+	 *	[pids_active_idx - 1] Previous PID tracking
+	 *	[pids_active_idx] Current PID tracking
 	 *
+	 * Whole array is used in a rotating manner to track latest PIDs.
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long pids_active[2];
+	unsigned long pids_active[NR_ACCESS_PID_HIST];
 
 	/* MM scan sequence ID when scan first started after VMA creation */
 	int start_scan_seq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..ed329b2f4d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,9 +3157,44 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
+static inline bool vma_test_access_pid_history(struct vm_area_struct *vma)
+{
+	unsigned int i, pid_bit;
+	unsigned long pids = 0;
+
+	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids |= vma->numab_state->pids_active[i];
+
+	return test_bit(pid_bit, &pids);
+}
+
+static inline bool vma_accessed_recent(struct vm_area_struct *vma)
+{
+	unsigned long *pids, pid_idx;
+
+	pid_idx = vma->numab_state->pids_active_idx;
+	pids = vma->numab_state->pids_active + pid_idx;
+
+	return (bitmap_weight(pids, BITS_PER_LONG) >= 1);
+}
+
+#define SHARED_VMA_THRESH	3
+
+static inline bool vma_shared_access(struct vm_area_struct *vma)
+{
+	int i;
+	unsigned long pids = 0;
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids |= vma->numab_state->pids_active[i];
+
+	return (bitmap_weight(&pids, BITS_PER_LONG) >= SHARED_VMA_THRESH);
+}
+
 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	unsigned long pids;
 	/*
 	 * Allow unconditional access first two times, so that all the (pages)
 	 * of VMAs get prot_none fault introduced irrespective of accesses.
@@ -3169,8 +3204,16 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+	/* Check if the current task had historically accessed VMA. */
+	if (vma_test_access_pid_history(vma))
+		return true;
+
+	/* Check at least one task had accessed VMA recently. */
+	if (vma_accessed_recent(vma))
+		return true;
+
+	/* Check if VMA is shared by many tasks. */
+	if (vma_shared_access(vma))
 		return true;
 
 	/*
@@ -3202,6 +3245,7 @@ static void task_numa_work(struct callback_head *work)
 	unsigned long nr_pte_updates = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
+	unsigned long pid_idx;
 	bool vma_pids_skipped;
 	bool vma_pids_forced = false;
 
@@ -3341,8 +3385,12 @@ static void task_numa_work(struct callback_head *work)
 		    time_after(jiffies, vma->numab_state->pids_active_reset)) {
 			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
-			vma->numab_state->pids_active[1] = 0;
+
+			pid_idx = vma->numab_state->pids_active_idx;
+			pid_idx = (pid_idx + 1) % NR_ACCESS_PID_HIST;
+
+			vma->numab_state->pids_active_idx = pid_idx;
+			vma->numab_state->pids_active[pid_idx] = 0;
 		}
 
 		/* Do not rescan VMAs twice within the same sequence. */
-- 
2.34.1
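For reference, below is a minimal userspace model of the rotating
access-PID window the patch implements. It is only a sketch, not kernel
code: it assumes 64-bit unsigned long, approximates
hash_32(pid, ilog2(BITS_PER_LONG)) with the same golden-ratio
multiplicative hash, and substitutes __builtin_popcountl() for
bitmap_weight() since each window is a single unsigned long. All names
here (struct pid_history, history_record(), should_scan(), ...) are
illustrative and are not kernel APIs.

/*
 * Userspace sketch of the rotating PID-access history.
 * Illustrative only; assumes BITS_PER_LONG == 64.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_ACCESS_PID_HIST  4   /* number of history windows */
#define SHARED_VMA_THRESH   3   /* "shared by more than 2 tasks" */

struct pid_history {
        unsigned long windows[NR_ACCESS_PID_HIST];
        unsigned int active_idx;        /* mirrors pids_active_idx */
};

/* Like hash_32(pid, ilog2(BITS_PER_LONG)): fold a PID into bits 0..63. */
static unsigned int pid_to_bit(uint32_t pid)
{
        return (pid * 0x61C88647u) >> (32 - 6);  /* 6 == ilog2(64) */
}

/* Record a fault in the current window (cf. vma_set_access_pid_bit()). */
static void history_record(struct pid_history *h, uint32_t pid)
{
        h->windows[h->active_idx] |= 1UL << pid_to_bit(pid);
}

/*
 * Advance to the next window and clear only that one
 * (cf. the VMA_PID_RESET_PERIOD handling in task_numa_work()).
 */
static void history_rotate(struct pid_history *h)
{
        h->active_idx = (h->active_idx + 1) % NR_ACCESS_PID_HIST;
        h->windows[h->active_idx] = 0;
}

/* OR of all windows: the whole tracked history. */
static unsigned long history_union(const struct pid_history *h)
{
        unsigned long pids = 0;

        for (int i = 0; i < NR_ACCESS_PID_HIST; i++)
                pids |= h->windows[i];
        return pids;
}

/* The three checks vma_is_accessed() applies after its grace period. */
static bool should_scan(const struct pid_history *h, uint32_t pid)
{
        /* 1. This task appears somewhere in the history
         *    (cf. vma_test_access_pid_history()). */
        if (history_union(h) & (1UL << pid_to_bit(pid)))
                return true;
        /* 2. Hot VMA: someone faulted in the current window
         *    (cf. vma_accessed_recent()). */
        if (__builtin_popcountl(h->windows[h->active_idx]) >= 1)
                return true;
        /* 3. Shared VMA: at least SHARED_VMA_THRESH distinct hashed PIDs
         *    across the whole history (cf. vma_shared_access()). */
        return __builtin_popcountl(history_union(h)) >= SHARED_VMA_THRESH;
}

int main(void)
{
        struct pid_history h = { 0 };

        history_record(&h, 1001);       /* task A faults */
        history_rotate(&h);             /* one reset period expires */
        history_record(&h, 1002);       /* task B faults in the new window */

        printf("task A may scan: %d\n", should_scan(&h, 1001)); /* 1: in history */
        printf("task C may scan: %d\n", should_scan(&h, 1003)); /* 1: hot window */
        return 0;
}

Because history_rotate() clears only the slot it is about to reuse, the
other NR_ACCESS_PID_HIST - 1 windows survive each reset, so the history
and sharing checks see roughly four VMA_PID_RESET_PERIOD windows of
faults instead of the two kept by the old pids_active[0]/pids_active[1]
scheme.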