On Wed 17-05-17 10:11:38, Vlastimil Babka wrote:
> Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing
> cpuset's mems") has introduced a two-step protocol when rebinding task's
> mempolicy due to cpuset update, in order to avoid a parallel allocation seeing
> an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset:
> mm: reduce large amounts of memory barrier related damage v3") introduced
> a seqlock protection and removed the synchronization point between the two
> update steps. At that point (or perhaps later), the two-step rebinding became
> unnecessary. Currently it only makes sure that the update first adds new nodes
> in step 1 and then removes nodes in step 2. Without memory barriers the effects
> are questionable, and even then this cannot prevent a parallel zonelist
> iteration checking the nodemask at each step to observe all nodes as unusable
> for allocation. We now fully rely on the seqlock to prevent premature OOMs and
> allocation failures.
> 
> We can thus remove the two-step update parts and simplify the code.
> 
> Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>

\o/ This has just been piling more complicated code on top of code which
shouldn't have existed in the first place, so I am very happy to see it
go. I hope we can do without rebinding altogether in the long term.
Changing data under one's feet just asks for problems, and this is a nice
example of where that leads.

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
>  include/linux/mempolicy.h      |   6 +--
>  include/uapi/linux/mempolicy.h |   8 ----
>  kernel/cgroup/cpuset.c         |   4 +-
>  mm/mempolicy.c                 | 102 ++++++++--------------------------------
>  4 files changed, 21 insertions(+), 99 deletions(-)
> 
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index ecb6cbeede5a..3a58b4be1b0c 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma);
>  
>  extern void numa_default_policy(void);
>  extern void numa_policy_init(void);
> -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> -			enum mpol_rebind_step step);
> +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
>  extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
>  
>  extern int huge_node(struct vm_area_struct *vma,
> @@ -260,8 +259,7 @@ static inline void numa_default_policy(void)
>  }
>  
>  static inline void mpol_rebind_task(struct task_struct *tsk,
> -				const nodemask_t *new,
> -				enum mpol_rebind_step step)
> +				const nodemask_t *new)
>  {
>  }
>  
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 9cd8b21dddbe..2a4d89508fec 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -24,13 +24,6 @@ enum {
>  	MPOL_MAX,	/* always last member of enum */
>  };
>  
> -enum mpol_rebind_step {
> -	MPOL_REBIND_ONCE,	/* do rebind work at once(not by two step) */
> -	MPOL_REBIND_STEP1,	/* first step(set all the newly nodes) */
> -	MPOL_REBIND_STEP2,	/* second step(clean all the disallowed nodes)*/
> -	MPOL_REBIND_NSTEP,
> -};
> -
>  /* Flags for set_mempolicy */
>  #define MPOL_F_STATIC_NODES	(1 << 15)
>  #define MPOL_F_RELATIVE_NODES	(1 << 14)
> @@ -65,7 +58,6 @@ enum mpol_rebind_step {
>   */
>  #define MPOL_F_SHARED	(1 << 0)	/* identify shared policies */
>  #define MPOL_F_LOCAL	(1 << 1)	/* preferred local allocation */
> -#define MPOL_F_REBINDING	(1 << 2)	/* identify policies in rebinding */
>  #define MPOL_F_MOF	(1 << 3)	/* this policy wants migrate on fault */
>  #define MPOL_F_MORON	(1 << 4)	/* Migrate On protnone Reference On Node */
>  
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 0f41292be0fb..dfd5b420452d 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
>  	}
>  
>  	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
> -	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
> -
> -	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
> +	mpol_rebind_task(tsk, newmems);
>  	tsk->mems_allowed = *newmems;
>  
>  	if (need_loop) {
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index c60807625fd5..047181452040 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
>  
>  static const struct mempolicy_operations {
>  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
> -	/*
> -	 * If read-side task has no lock to protect task->mempolicy, write-side
> -	 * task will rebind the task->mempolicy by two step. The first step is
> -	 * setting all the newly nodes, and the second step is cleaning all the
> -	 * disallowed nodes. In this way, we can avoid finding no node to alloc
> -	 * page.
> -	 * If we have a lock to protect task->mempolicy in read-side, we do
> -	 * rebind directly.
> -	 *
> -	 * step:
> -	 *	MPOL_REBIND_ONCE - do rebind work at once
> -	 *	MPOL_REBIND_STEP1 - set all the newly nodes
> -	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
> -	 */
> -	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
> -			enum mpol_rebind_step step);
> +	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
>  } mpol_ops[MPOL_MAX];
>  
>  static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
> @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p)
>  	kmem_cache_free(policy_cache, p);
>  }
>  
> -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
> -				enum mpol_rebind_step step)
> +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
>  {
>  }
>  
> -/*
> - * step:
> - *	MPOL_REBIND_ONCE - do rebind work at once
> - *	MPOL_REBIND_STEP1 - set all the newly nodes
> - *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
> - */
> -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
> -				enum mpol_rebind_step step)
> +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
>  {
>  	nodemask_t tmp;
>  
> @@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
>  	else if (pol->flags & MPOL_F_RELATIVE_NODES)
>  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
>  	else {
> -		/*
> -		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
> -		 * result
> -		 */
> -		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
> -			nodes_remap(tmp, pol->v.nodes,
> -					pol->w.cpuset_mems_allowed, *nodes);
> -			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
> -		} else if (step == MPOL_REBIND_STEP2) {
> -			tmp = pol->w.cpuset_mems_allowed;
> -			pol->w.cpuset_mems_allowed = *nodes;
> -		} else
> -			BUG();
> +		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
> +								*nodes);
> +		pol->w.cpuset_mems_allowed = tmp;
>  	}
>  
>  	if (nodes_empty(tmp))
>  		tmp = *nodes;
>  
> -	if (step == MPOL_REBIND_STEP1)
> -		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
> -	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
> -		pol->v.nodes = tmp;
> -	else
> -		BUG();
> +	pol->v.nodes = tmp;
>  }
>  
>  static void mpol_rebind_preferred(struct mempolicy *pol,
> -				const nodemask_t *nodes,
> -				enum mpol_rebind_step step)
> +				const nodemask_t *nodes)
>  {
>  	nodemask_t tmp;
>  
> @@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
>  /*
>   * mpol_rebind_policy - Migrate a policy to a different set of nodes
>   *
> - * If read-side task has no lock to protect task->mempolicy, write-side
> - * task will rebind the task->mempolicy by two step. The first step is
> - * setting all the newly nodes, and the second step is cleaning all the
> - * disallowed nodes. In this way, we can avoid finding no node to alloc
> - * page.
> - * If we have a lock to protect task->mempolicy in read-side, we do
> - * rebind directly.
> - *
> - * step:
> - *	MPOL_REBIND_ONCE - do rebind work at once
> - *	MPOL_REBIND_STEP1 - set all the newly nodes
> - *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
> + * Per-vma policies are protected by mmap_sem. Allocations using per-task
> + * policies are protected by task->mems_allowed_seq to prevent a premature
> + * OOM/allocation failure due to parallel nodemask modification.
>   */
> -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
> -				enum mpol_rebind_step step)
> +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
>  {
>  	if (!pol)
>  		return;
> -	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
> +	if (!mpol_store_user_nodemask(pol) &&
>  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
>  		return;
>  
> -	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
> -		return;
> -
> -	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
> -		BUG();
> -
> -	if (step == MPOL_REBIND_STEP1)
> -		pol->flags |= MPOL_F_REBINDING;
> -	else if (step == MPOL_REBIND_STEP2)
> -		pol->flags &= ~MPOL_F_REBINDING;
> -	else if (step >= MPOL_REBIND_NSTEP)
> -		BUG();
> -
> -	mpol_ops[pol->mode].rebind(pol, newmask, step);
> +	mpol_ops[pol->mode].rebind(pol, newmask);
>  }
>  
>  /*
> @@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
>   * Called with task's alloc_lock held.
>   */
>  
> -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> -			enum mpol_rebind_step step)
> +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
>  {
> -	mpol_rebind_policy(tsk->mempolicy, new, step);
> +	mpol_rebind_policy(tsk->mempolicy, new);
>  }
>  
>  /*
> @@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
>  
>  	down_write(&mm->mmap_sem);
>  	for (vma = mm->mmap; vma; vma = vma->vm_next)
> -		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
> +		mpol_rebind_policy(vma->vm_policy, new);
>  	up_write(&mm->mmap_sem);
>  }
>  
> @@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
>  
>  	if (current_cpuset_is_being_rebound()) {
>  		nodemask_t mems = cpuset_mems_allowed(current);
> -		if (new->flags & MPOL_F_REBINDING)
> -			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
> -		else
> -			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
> +		mpol_rebind_policy(new, &mems);
>  	}
>  	atomic_set(&new->refcnt, 1);
>  	return new;
> -- 
> 2.12.2

-- 
Michal Hocko
SUSE Labs
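
For reference, the mems_allowed_seq read side that the changelog says the
allocator now relies on follows the usual seqcount retry pattern. The
sketch below is illustrative only, not the exact mm/page_alloc.c code:
read_mems_allowed_begin()/read_mems_allowed_retry() are the real cpuset
helpers, while alloc_with_cpuset_retry() and try_allocation() are made-up
names standing in for the allocator entry point and the actual allocation
attempt.

static struct page *alloc_with_cpuset_retry(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;
	unsigned int cookie;

	do {
		/* Snapshot current->mems_allowed_seq before the attempt. */
		cookie = read_mems_allowed_begin();
		page = try_allocation(gfp_mask, order);
		/*
		 * A failure observed while the cpuset's nodemask was being
		 * rebound may be spurious (the zonelist walk can transiently
		 * see all nodes as disallowed), so retry with the updated
		 * mems_allowed instead of failing or declaring OOM.
		 */
	} while (unlikely(!page && read_mems_allowed_retry(cookie)));

	return page;
}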