[RFC PATCH 1 1/1] sched/numa: Hot VMA and shared VMA optimization

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Optimizations are based on the history of PIDs accessing a VMA.

- Increase tasks' access history windows (PeterZ) from 2 to 4.
(This patch is from Peter Zijlstra <peterz@xxxxxxxxxxxxx>)

Idea: A task is also allowed to scan a VMA if:
- The VMA was very recently accessed, as indicated by the latest
  access PIDs information (hot VMA).
- The VMA is shared by more than 2 tasks. Here the whole history of the
  VMA's access PIDs is considered, using bitmap_weight().

Signed-off-by: Raghavendra K T <raghavendra.kt@xxxxxxx>
---
I will split the patchset and repost it if we find this patchset useful
going forward. The first patch is from PeterZ.

 include/linux/mm.h       | 12 ++++++---
 include/linux/mm_types.h | 11 +++++---
 kernel/sched/fair.c      | 58 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..1bf1df064b60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1744,10 +1744,14 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 	unsigned int pid_bit;
-
-	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
-		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
+	unsigned long *pids, pid_idx;
+
+	if (vma->numab_state) {
+		pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+		pid_idx = READ_ONCE(vma->numab_state->pids_active_idx);
+		pids = vma->numab_state->pids_active + pid_idx;
+		if (!test_bit(pid_bit, pids))
+			__set_bit(pid_bit, pids);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..050ceef1e9d5 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -574,6 +574,7 @@ struct vma_lock {
 	struct rw_semaphore lock;
 };
 
+#define NR_ACCESS_PID_HIST	4
 struct vma_numab_state {
 	/*
 	 * Initialised as time in 'jiffies' after which VMA
@@ -588,17 +589,21 @@ struct vma_numab_state {
 	 */
 	unsigned long pids_active_reset;
 
+	/* Points to current active PID tracking index. */
+	unsigned long pids_active_idx;
+
 	/*
 	 * Approximate tracking of PIDs that trapped a NUMA hinting
 	 * fault. May produce false positives due to hash collisions.
 	 *
-	 *   [0] Previous PID tracking
-	 *   [1] Current PID tracking
+	 *   [pids_active_idx - 1] Previous PID tracking
+	 *   [pids_active_idx] Current PID tracking
 	 *
+	 * Whole array is used in a rotating manner to track latest PIDs.
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long pids_active[2];
+	unsigned long pids_active[NR_ACCESS_PID_HIST];
 
 	/* MM scan sequence ID when scan first started after VMA creation */
 	int start_scan_seq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..ed329b2f4d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,9 +3157,44 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
+static inline bool vma_test_access_pid_history(struct vm_area_struct *vma)
+{
+	unsigned int i, pid_bit;
+	unsigned long pids = 0;
+
+	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids  |= vma->numab_state->pids_active[i];
+
+	return test_bit(pid_bit, &pids);
+}
+
+static inline bool vma_accessed_recent(struct vm_area_struct *vma)
+{
+	unsigned long *pids, pid_idx;
+
+	pid_idx = vma->numab_state->pids_active_idx;
+	pids = vma->numab_state->pids_active + pid_idx;
+
+	return (bitmap_weight(pids, BITS_PER_LONG) >= 1);
+}
+
+#define SHARED_VMA_THRESH	3
+
+static inline bool vma_shared_access(struct vm_area_struct *vma)
+{
+	int i;
+	unsigned long pids = 0;
+
+	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+		pids  |= vma->numab_state->pids_active[i];
+
+	return (bitmap_weight(&pids, BITS_PER_LONG) >= SHARED_VMA_THRESH);
+}
+
 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	unsigned long pids;
 	/*
 	 * Allow unconditional access first two times, so that all the (pages)
 	 * of VMAs get prot_none fault introduced irrespective of accesses.
@@ -3169,8 +3204,16 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+	/* Check if the current task had historically accessed VMA. */
+	if (vma_test_access_pid_history(vma))
+		return true;
+
+	/* Check at least one task had accessed VMA recently. */
+	if (vma_accessed_recent(vma))
+		return true;
+
+	/* Check if VMA is shared by many tasks. */
+	if (vma_shared_access(vma))
 		return true;
 
 	/*
@@ -3202,6 +3245,7 @@ static void task_numa_work(struct callback_head *work)
 	unsigned long nr_pte_updates = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
+	unsigned long pid_idx;
 	bool vma_pids_skipped;
 	bool vma_pids_forced = false;
 
@@ -3341,8 +3385,12 @@ static void task_numa_work(struct callback_head *work)
 				time_after(jiffies, vma->numab_state->pids_active_reset)) {
 			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
-			vma->numab_state->pids_active[1] = 0;
+
+			pid_idx = vma->numab_state->pids_active_idx;
+			pid_idx = (pid_idx + 1) % NR_ACCESS_PID_HIST;
+
+			vma->numab_state->pids_active_idx = pid_idx;
+			vma->numab_state->pids_active[pid_idx] = 0;
 		}
 
 		/* Do not rescan VMAs twice within the same sequence. */
-- 
2.34.1





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux