* Mel Gorman <mgorman@xxxxxxx> [2013-06-26 15:38:06]:

> Ideally it would be possible to distinguish between NUMA hinting faults
> that are private to a task and those that are shared. This would require
> that the last task that accessed a page for a hinting fault would be
> recorded which would increase the size of struct page. Instead this patch
> approximates private pages by assuming that faults that pass the two-stage
> filter are private pages and all others are shared. The preferred NUMA
> node is then selected based on where the maximum number of approximately
> private faults were measured.

Should we consider only private faults when choosing the preferred node?
I would think that if tasks have shared pages, then moving all the tasks
that share those pages to the node where the shared pages reside would be
preferable. No? If yes, how does the preferred node logic help to achieve
that? (A simplified sketch of how I read the placement logic is appended
at the end of this mail.)

>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
> ---
> include/linux/sched.h | 4 ++--
> kernel/sched/fair.c | 32 ++++++++++++++++++++++----------
> mm/huge_memory.c | 7 ++++---
> mm/memory.c | 9 ++++++---
> 4 files changed, 34 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 82a6136..a41edea 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1600,10 +1600,10 @@ struct task_struct {
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> #ifdef CONFIG_NUMA_BALANCING
> -extern void task_numa_fault(int node, int pages, bool migrated);
> +extern void task_numa_fault(int last_node, int node, int pages, bool migrated);
> extern void set_numabalancing_state(bool enabled);
> #else
> -static inline void task_numa_fault(int node, int pages, bool migrated)
> +static inline void task_numa_fault(int last_node, int node, int pages, bool migrated)
> {
> }
> static inline void set_numabalancing_state(bool enabled)
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 99951a8..490e601 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -833,6 +833,11 @@ find_idlest_cpu_node(int this_cpu, int nid)
> return idlest_cpu;
> }
>
> +static inline int task_faults_idx(int nid, int priv)
> +{
> + return 2 * nid + priv;
> +}
> +
> static void task_numa_placement(struct task_struct *p)
> {
> int seq, nid, max_nid = 0;
> @@ -849,13 +854,19 @@ static void task_numa_placement(struct task_struct *p)
> /* Find the node with the highest number of faults */
> for (nid = 0; nid < nr_node_ids; nid++) {
> unsigned long faults;
> + int priv, i;
>
> - /* Decay existing window and copy faults since last scan */
> - p->numa_faults[nid] >>= 1;
> - p->numa_faults[nid] += p->numa_faults_buffer[nid];
> - p->numa_faults_buffer[nid] = 0;
> + for (priv = 0; priv < 2; priv++) {
> + i = task_faults_idx(nid, priv);
> +
> + /* Decay existing window and copy faults since last scan */
> + p->numa_faults[i] >>= 1;
> + p->numa_faults[i] += p->numa_faults_buffer[i];
> + p->numa_faults_buffer[i] = 0;
> + }
>
> - faults = p->numa_faults[nid];
> + /* Find maximum private faults */
> + faults = p->numa_faults[task_faults_idx(nid, 1)];
> if (faults > max_faults) {
> max_faults = faults;
> max_nid = nid;
> @@ -887,24 +898,25 @@ static void task_numa_placement(struct task_struct *p)
> /*
> * Got a PROT_NONE fault for a page on @node.
> */
> -void task_numa_fault(int node, int pages, bool migrated)
> +void task_numa_fault(int last_nid, int node, int pages, bool migrated)
> {
> struct task_struct *p = current;
> + int priv = (cpu_to_node(task_cpu(p)) == last_nid);
>
> if (!sched_feat_numa(NUMA))
> return;
>
> /* Allocate buffer to track faults on a per-node basis */
> if (unlikely(!p->numa_faults)) {
> - int size = sizeof(*p->numa_faults) * nr_node_ids;
> + int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
>
> /* numa_faults and numa_faults_buffer share the allocation */
> - p->numa_faults = kzalloc(size * 2, GFP_KERNEL);
> + p->numa_faults = kzalloc(size * 4, GFP_KERNEL);
> if (!p->numa_faults)
> return;
>
> BUG_ON(p->numa_faults_buffer);
> - p->numa_faults_buffer = p->numa_faults + nr_node_ids;
> + p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
> }
>
> /*
> @@ -918,7 +930,7 @@ void task_numa_fault(int node, int pages, bool migrated)
> task_numa_placement(p);
>
> /* Record the fault, double the weight if pages were migrated */
> - p->numa_faults_buffer[node] += pages << migrated;
> + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages << migrated;
> }
>
> static void reset_ptenuma_scan(struct task_struct *p)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index e2f7f5aa..7cd7114 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1292,7 +1292,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> struct page *page;
> unsigned long haddr = addr & HPAGE_PMD_MASK;
> - int target_nid;
> + int target_nid, last_nid;
> int current_nid = -1;
> bool migrated;
>
> @@ -1307,6 +1307,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> if (current_nid == numa_node_id())
> count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
>
> + last_nid = page_nid_last(page);
> target_nid = mpol_misplaced(page, vma, haddr);
> if (target_nid == -1) {
> put_page(page);
> @@ -1332,7 +1333,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> if (!migrated)
> goto check_same;
>
> - task_numa_fault(target_nid, HPAGE_PMD_NR, true);
> + task_numa_fault(last_nid, target_nid, HPAGE_PMD_NR, true);
> return 0;
>
> check_same:
> @@ -1347,7 +1348,7 @@ clear_pmdnuma:
> out_unlock:
> spin_unlock(&mm->page_table_lock);
> if (current_nid != -1)
> - task_numa_fault(current_nid, HPAGE_PMD_NR, false);
> + task_numa_fault(last_nid, current_nid, HPAGE_PMD_NR, false);
> return 0;
> }
>
> diff --git a/mm/memory.c b/mm/memory.c
> index ba94dec..c28bf52 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> struct page *page = NULL;
> spinlock_t *ptl;
> - int current_nid = -1;
> + int current_nid = -1, last_nid;
> int target_nid;
> bool migrated = false;
>
> @@ -3566,6 +3566,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> return 0;
> }
>
> + last_nid = page_nid_last(page);
> current_nid = page_to_nid(page);
> target_nid = numa_migrate_prep(page, vma, addr, current_nid);
> pte_unmap_unlock(ptep, ptl);
> @@ -3586,7 +3587,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
>
> out:
> if (current_nid != -1)
> - task_numa_fault(current_nid, 1, migrated);
> + task_numa_fault(last_nid, current_nid, 1, migrated);
> return 0;
> }
>
> @@ -3602,6 +3603,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> spinlock_t *ptl;
> bool numa = false;
> int local_nid = numa_node_id();
> + int last_nid;
>
> spin_lock(&mm->page_table_lock);
> pmd = *pmdp;
> @@ -3654,6 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> * migrated to.
> */
> curr_nid = local_nid;
> + last_nid = page_nid_last(page);
> target_nid = numa_migrate_prep(page, vma, addr,
> page_to_nid(page));
> if (target_nid == -1) {
> @@ -3666,7 +3669,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> migrated = migrate_misplaced_page(page, target_nid);
> if (migrated)
> curr_nid = target_nid;
> - task_numa_fault(curr_nid, 1, migrated);
> + task_numa_fault(last_nid, curr_nid, 1, migrated);
>
> pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> }
> --
> 1.8.1.4

--
Thanks and Regards
Srikar Dronamraju
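P.S. To make the question above concrete, here is a minimal userspace
sketch of the bookkeeping as I read the patch: two counters per node,
slot 2*nid+0 for shared and 2*nid+1 for private faults, with the
preferred node chosen from the decayed private counts only. This is not
kernel code; NR_NODES, record_fault() and pick_preferred_node() and the
example in main() are made up for illustration.

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 4

/* faults[] is the decayed history, faults_buffer[] the current scan window */
static unsigned long faults[2 * NR_NODES];
static unsigned long faults_buffer[2 * NR_NODES];

/* Same layout as the patch: 2 * nid + 0 is "shared", 2 * nid + 1 is "private" */
static int task_faults_idx(int nid, int priv)
{
    return 2 * nid + priv;
}

/*
 * A fault is treated as private when the node the task is running on
 * matches the node that last touched the page (last_nid); shared otherwise.
 */
static void record_fault(int task_node, int last_nid, int page_nid,
                         int pages, bool migrated)
{
    int priv = (task_node == last_nid);

    /* Record the fault, double the weight if the pages were migrated */
    faults_buffer[task_faults_idx(page_nid, priv)] += pages << migrated;
}

/* Decay the history, fold in the new window, and pick the max-private node */
static int pick_preferred_node(void)
{
    unsigned long max_faults = 0;
    int nid, priv, i, max_nid = 0;

    for (nid = 0; nid < NR_NODES; nid++) {
        for (priv = 0; priv < 2; priv++) {
            i = task_faults_idx(nid, priv);
            faults[i] >>= 1;
            faults[i] += faults_buffer[i];
            faults_buffer[i] = 0;
        }

        /* Only the private slot drives the preferred node */
        if (faults[task_faults_idx(nid, 1)] > max_faults) {
            max_faults = faults[task_faults_idx(nid, 1)];
            max_nid = nid;
        }
    }

    return max_nid;
}

int main(void)
{
    /* Task runs on node 0 and keeps re-touching its own pages on node 2 */
    record_fault(0, 0, 2, 1, true);
    record_fault(0, 0, 2, 1, true);

    /* A page last touched from node 3 counts as shared and is ignored */
    record_fault(0, 3, 1, 1, false);

    printf("preferred node: %d\n", pick_preferred_node());
    return 0;
}

With this accounting, faults on pages whose last_nid differs from the
faulting task's node never influence the preferred node, which is what
prompted my question about workloads dominated by shared pages.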