On 08/05/2013 03:36 PM, Rik van Riel wrote: > On Fri, 2 Aug 2013 18:50:32 +0200 > Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote: > >> Subject: mm, numa: Do not group on RO pages > > Using the fraction of the faults that happen on each node to > determine both the group weight and the task weight of each > node, and attempting to move the task to the node with the > highest score, seems to work fairly well. > > Here are the specjbb scores with this patch, on top of your > task grouping patches: > > vanilla numasched7 > Warehouses > 1 40651 45657 > 2 82897 88827 > 3 116623 130644 > 4 144512 171051 > 5 176681 209915 > 6 190471 247480 > 7 204036 283966 > 8 214466 318464 > 9 223451 348657 > 10 227439 380886 > 11 226163 374822 > 12 220857 370519 > 13 215871 367582 > 14 210965 361110 > > I suspect there may be further room for improvement, but it > may be time for this patch to go into Mel's tree, so others > will test it as well, helping us all learn what is broken > and how it can be improved... I've been testing what I believe is the accumulation of Mel's original changes, plus what Peter added via LKML and this thread, and then this change. I don't think I missed any, but apologies if I did. Looking at it with Andrea's AutoNUMA tests (modified to automatically generate power-of-two runs based on the available nodes -- i.e. a 4-node system would run 2-node then 4-node, an 8-node system runs 2, 4, 8, a 16-node system (if I had one) should do 2, 4, 8, 16, etc.), it does look like the "highest score" is being used -- but that's not really a great thing for this type of private memory accessed by multiple processes. It looks to be all concentrating back into a single node in the unbound cases for the runs beyond 2 nodes, taking 1000+ seconds where the stock kernel takes 670 and the hard binding takes only 483. 
So it looks to me like the weighting here is a bit too strong -- we don't want all the tasks on the same node (more threads than available processors) when there's an idle node reasonably close that we can move some of the memory to. Granted, this would be easier in cases with really large DBs where the memory *and* CPU load are both larger than the node resources. I'm including a spreadsheet with the memory layout of the basic run and the hard-binding run as things run, and a run summary for comparison. Don > > Signed-off-by: Rik van Riel <riel@xxxxxxxxxx> > --- > include/linux/sched.h | 1 + > kernel/sched/fair.c | 109 +++++++++++++++++++++++++++++++++++++++++--------- > 2 files changed, 91 insertions(+), 19 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 9e7fcfe..5e175ae 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1355,6 +1355,7 @@ struct task_struct { > * The values remain static for the duration of a PTE scan > */ > unsigned long *numa_faults; > + unsigned long total_numa_faults; > > /* > * numa_faults_buffer records faults per node during the current > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 6a06bef..2c9c1dd 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -844,6 +844,18 @@ static unsigned int task_scan_max(struct task_struct *p) > */ > unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; > > +struct numa_group { > + atomic_t refcount; > + > + spinlock_t lock; /* nr_tasks, tasks */ > + int nr_tasks; > + struct list_head task_list; > + > + struct rcu_head rcu; > + atomic_long_t total_faults; > + atomic_long_t faults[0]; > +}; > + > static inline int task_faults_idx(int nid, int priv) > { > return 2 * nid + priv; > @@ -857,6 +869,51 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) > return p->numa_faults[2*nid] + p->numa_faults[2*nid+1]; > } > > +static inline unsigned long group_faults(struct task_struct *p, int nid) > +{ > + if 
(!p->numa_group) > + return 0; > + > + return atomic_long_read(&p->numa_group->faults[2*nid]) + > + atomic_long_read(&p->numa_group->faults[2*nid+1]); > +} > + > +/* > + * These return the fraction of accesses done by a particular task, or > + * task group, on a particular numa node. The group weight is given a > + * larger multiplier, in order to group tasks together that are almost > + * evenly spread out between numa nodes. > + */ > +static inline unsigned long task_weight(struct task_struct *p, int nid) > +{ > + unsigned long total_faults; > + > + if (!p->numa_faults) > + return 0; > + > + total_faults = p->total_numa_faults; > + > + if (!total_faults) > + return 0; > + > + return 1000 * task_faults(p, nid) / total_faults; > +} > + > +static inline unsigned long group_weight(struct task_struct *p, int nid) > +{ > + unsigned long total_faults; > + > + if (!p->numa_group) > + return 0; > + > + total_faults = atomic_long_read(&p->numa_group->total_faults); > + > + if (!total_faults) > + return 0; > + > + return 1200 * group_faults(p, nid) / total_faults; > +} > + > /* > * Create/Update p->mempolicy MPOL_INTERLEAVE to match p->numa_faults[]. 
> */ > @@ -979,8 +1036,10 @@ static void task_numa_compare(struct task_numa_env *env, long imp) > cur = NULL; > > if (cur) { > - imp += task_faults(cur, env->src_nid) - > - task_faults(cur, env->dst_nid); > + imp += task_weight(cur, env->src_nid) + > + group_weight(cur, env->src_nid) - > + task_weight(cur, env->dst_nid) - > + group_weight(cur, env->dst_nid); > } > > trace_printk("compare[%d] task:%s/%d improvement: %ld\n", > @@ -1051,7 +1110,7 @@ static int task_numa_migrate(struct task_struct *p) > .best_cpu = -1 > }; > struct sched_domain *sd; > - unsigned long faults; > + unsigned long weight; > int nid, cpu, ret; > > /* > @@ -1067,7 +1126,7 @@ static int task_numa_migrate(struct task_struct *p) > } > rcu_read_unlock(); > > - faults = task_faults(p, env.src_nid); > + weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid); > update_numa_stats(&env.src_stats, env.src_nid); > > for_each_online_node(nid) { > @@ -1076,7 +1135,7 @@ static int task_numa_migrate(struct task_struct *p) > if (nid == env.src_nid) > continue; > > - imp = task_faults(p, nid) - faults; > + imp = task_weight(p, nid) + group_weight(p, nid) - weight; > if (imp < 0) > continue; > > @@ -1122,21 +1181,10 @@ static void numa_migrate_preferred(struct task_struct *p) > p->numa_migrate_retry = jiffies + HZ/10; > } > > -struct numa_group { > - atomic_t refcount; > - > - spinlock_t lock; /* nr_tasks, tasks */ > - int nr_tasks; > - struct list_head task_list; > - > - struct rcu_head rcu; > - atomic_long_t faults[0]; > -}; > - > static void task_numa_placement(struct task_struct *p) > { > - int seq, nid, max_nid = -1; > - unsigned long max_faults = 0; > + int seq, nid, max_nid = -1, max_group_nid = -1; > + unsigned long max_faults = 0, max_group_faults = 0; > > seq = ACCESS_ONCE(p->mm->numa_scan_seq); > if (p->numa_scan_seq == seq) > @@ -1148,7 +1196,7 @@ static void task_numa_placement(struct task_struct *p) > > /* Find the node with the highest number of faults */ > for (nid = 0; nid < 
nr_node_ids; nid++) { > - unsigned long faults = 0; > + unsigned long faults = 0, group_faults = 0; > int priv, i; > > for (priv = 0; priv < 2; priv++) { > @@ -1161,6 +1209,7 @@ static void task_numa_placement(struct task_struct *p) > /* Decay existing window, copy faults since last scan */ > p->numa_faults[i] >>= 1; > p->numa_faults[i] += p->numa_faults_buffer[i]; > + p->total_numa_faults += p->numa_faults_buffer[i]; > p->numa_faults_buffer[i] = 0; > > diff += p->numa_faults[i]; > @@ -1169,6 +1218,8 @@ static void task_numa_placement(struct task_struct *p) > if (p->numa_group) { > /* safe because we can only change our own group */ > atomic_long_add(diff, &p->numa_group->faults[i]); > + atomic_long_add(diff, &p->numa_group->total_faults); > + group_faults += atomic_long_read(&p->numa_group->faults[i]); > } > } > > @@ -1176,11 +1227,29 @@ static void task_numa_placement(struct task_struct *p) > max_faults = faults; > max_nid = nid; > } > + > + if (group_faults > max_group_faults) { > + max_group_faults = group_faults; > + max_group_nid = nid; > + } > } > > if (sched_feat(NUMA_INTERLEAVE)) > task_numa_mempol(p, max_faults); > > + /* > + * Should we stay on our own, or move in with the group? > + * If the task's memory accesses are concentrated on one node, go > + * to (more likely, stay on) that node. If the group's accesses > + * are more concentrated than the task's accesses, join the group. 
> + * > + * max_group_faults max_faults > + * ------------------ > ------------ > + * total_group_faults total_faults > + */ > + if (group_weight(p, max_group_nid) > task_weight(p, max_nid)) > + max_nid = max_group_nid; > + > /* Preferred node as the node with the most faults */ > if (max_faults && max_nid != p->numa_preferred_nid) { > > @@ -1242,6 +1311,7 @@ void task_numa_group(struct task_struct *p, int cpu, int pid) > atomic_set(&grp->refcount, 1); > spin_lock_init(&grp->lock); > INIT_LIST_HEAD(&grp->task_list); > + atomic_long_set(&grp->total_faults, 0); > > spin_lock(&p->numa_lock); > list_add(&p->numa_entry, &grp->task_list); > @@ -1336,6 +1406,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated) > > BUG_ON(p->numa_faults_buffer); > p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); > + p->total_numa_faults = 0; > } > > /* > > . >
Attachment:
AutoNUMA Mel_PZ Summary.xlsx
Description: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
numa01 on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 179.04 4096 0 88236 3031 numa01_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 110.27 4096 0 49123 2976 numa01_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 282.85 4096 0 68945 6219 numa01_THREAD_ALLOC on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 288.28 4096 0 28452 4247 numa01_THREAD_ALLOC_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 217.53 4096 0 39038 4911 numa01_THREAD_ALLOC_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 558.59 4096 0 30321 9830 numa02 on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 22.41 4096 0 4233 400 numa02_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 29.26 4096 0 5724 640 numa02_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 64.31 4096 0 10779 1465 numa02_SMT on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 30.43 4096 0 3962 261 numa02_SMT_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 30.47 4096 0 4541 319 numa02_SMT_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 64.19 4096 0 5749 641 numa01 on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 518.92 4096 0 249899 13939 numa01_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 265.05 4096 0 156342 7872 numa01_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 322.30 4096 0 146232 11550 numa01_THREAD_ALLOC on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 147.36 4096 0 27545 3868 numa01_THREAD_ALLOC_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 280.08 4096 0 55980 8053 numa01_THREAD_ALLOC_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 323.98 4096 0 56929 10682 numa02 on 4 
Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 16.21 4096 0 4244 527 numa02_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 15.15 4096 0 5763 608 numa02_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 35.94 4096 0 7754 1472 numa02_SMT on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 29.67 4096 0 4638 427 numa02_SMT_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 15.85 4096 0 4897 302 numa02_SMT_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 36.45 4096 0 7783 721 numa01 on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 1067.32 4096 0 152427 77781 numa01_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 450.17 4096 0 240440 38428 numa01_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 529.96 4096 0 148766 39733 numa01_THREAD_ALLOC on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 108.96 4096 0 43067 14924 numa01_THREAD_ALLOC_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 228.77 4096 0 57781 19736 numa01_THREAD_ALLOC_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 279.75 4096 0 48321 20551 numa02 on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 11.03 4096 0 4394 1426 numa02_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 8.12 4096 0 5604 1073 numa02_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 20.16 4096 0 6010 2671 numa02_SMT on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 15.97 4096 0 3811 493 numa02_SMT_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 8.56 4096 0 4126 296 numa02_SMT_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 20.38 4096 0 6232 806
numa01 on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 120.66 4096 0 48314 2345 numa01_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 110.33 4096 0 49860 3014 numa01_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 283.75 4096 0 70962 5537 numa01_THREAD_ALLOC on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 217.40 4096 0 31633 3203 numa01_THREAD_ALLOC_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 217.52 4096 0 41759 4841 numa01_THREAD_ALLOC_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 560.21 4096 0 39048 9879 numa02 on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 22.04 4096 0 4140 343 numa02_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 29.27 4096 0 8121 671 numa02_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 65.09 4096 0 12140 1475 numa02_SMT on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 47.72 4096 0 4556 424 numa02_SMT_HARD_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 30.45 4096 0 4320 299 numa02_SMT_INVERSE_BIND on 2 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 64.93 4096 0 4384 641 numa01 on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 297.28 4096 0 207781 9202 numa01_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 281.40 4096 0 261270 8203 numa01_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 324.01 4096 0 208260 11654 numa01_THREAD_ALLOC on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 126.75 4096 0 52259 3970 numa01_THREAD_ALLOC_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 273.75 4096 0 64228 8514 numa01_THREAD_ALLOC_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 311.88 4096 0 63381 10960 numa02 on 4 Nodes: 
ElapsedTime PageSize MajFaults MinFaults ContextSwtch 16.89 4096 0 5091 592 numa02_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 15.16 4096 0 6512 617 numa02_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 35.67 4096 0 11728 1473 numa02_SMT on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 26.92 4096 0 4319 426 numa02_SMT_HARD_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 16.00 4096 0 6096 324 numa02_SMT_INVERSE_BIND on 4 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 36.39 4096 0 8588 724 numa01 on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 669.71 4096 0 212552 64899 numa01_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 483.45 4096 0 212077 37364 numa01_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 495.48 4096 0 223279 38543 numa01_THREAD_ALLOC on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 108.61 4096 0 50008 16326 numa01_THREAD_ALLOC_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 245.13 4096 0 72037 19834 numa01_THREAD_ALLOC_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 292.22 4096 0 69993 22849 numa02 on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 11.55 4096 0 5316 1516 numa02_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 8.10 4096 0 5591 1164 numa02_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 20.03 4096 0 11918 2662 numa02_SMT on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 16.88 4096 0 4916 549 numa02_SMT_HARD_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 8.68 4096 0 6176 324 numa02_SMT_INVERSE_BIND on 8 Nodes: ElapsedTime PageSize MajFaults MinFaults ContextSwtch 20.47 4096 0 8904 802