[Cc linux-api]

On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
> This mempolicy mode can be used with either the set_mempolicy(2)
> or mbind(2) interface. Like MPOL_PREFERRED, it allows an
> application to set a preference node from which the kernel will
> fulfill memory allocation requests. Unlike MPOL_PREFERRED, it
> takes a set of nodes. The nodes in the nodemask are used as
> fallback allocation nodes if memory is not available on the
> preferred node. Unlike MPOL_PREFERRED_MANY, it will not fall back
> to all nodes in the system for memory allocations. Like MPOL_BIND,
> it works over a set of nodes and will cause a SIGSEGV or invoke
> the OOM killer if memory is not available on those nodes.
>
> This patch lets applications hint a preferred memory allocation
> node and fall back to _only_ a specified set of nodes if memory is
> not available on the preferred node. Fallback allocation is
> attempted from the node nearest to the preferred node.
>
> This new memory policy gives applications explicit control over
> slow memory allocation and avoids the default fallback to
> slow-memory NUMA nodes. The difference from MPOL_BIND is the
> ability to specify a preferred node, which is the first node in
> the nodemask argument passed.
>
> Cc: Ben Widawsky <ben.widawsky@xxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Feng Tang <feng.tang@xxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxxxx>
> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Cc: Randy Dunlap <rdunlap@xxxxxxxxxxxxx>
> Cc: Vlastimil Babka <vbabka@xxxxxxx>
> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Cc: Huang Ying <ying.huang@xxxxxxxxx>
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
> ---
>  .../admin-guide/mm/numa_memory_policy.rst |  7 +++
>  include/uapi/linux/mempolicy.h            |  1 +
>  mm/mempolicy.c                            | 43 +++++++++++++++++--
>  3 files changed, 48 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
> index 64fd0ba0d057..4dfdcbd22d67 100644
> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
> @@ -252,6 +252,13 @@ MPOL_PREFERRED_MANY
>          can fall back to all existing numa nodes. This is effectively
>          MPOL_PREFERRED allowed for a mask rather than a single node.
>
> +MPOL_PREFERRED_STRICT
> +        This mode specifies that the allocation should be attempted
> +        from the first node specified in the nodemask of the policy.
> +        If that allocation fails, the kernel will search other nodes
> +        in the nodemask, in order of increasing distance from the
> +        preferred node based on information provided by the platform firmware.
> +
>  NUMA memory policy supports the following optional mode flags:
>
>  MPOL_F_STATIC_NODES
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 046d0ccba4cd..8aa1d1963235 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -23,6 +23,7 @@ enum {
>          MPOL_INTERLEAVE,
>          MPOL_LOCAL,
>          MPOL_PREFERRED_MANY,
> +        MPOL_PREFERRED_STRICT,
>          MPOL_MAX,       /* always last member of enum */
>  };
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 1592b081c58e..59080dd1ea69 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -407,6 +407,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
>                  .create = mpol_new_nodemask,
>                  .rebind = mpol_rebind_preferred,
>          },
> +        [MPOL_PREFERRED_STRICT] = {
> +                .create = mpol_new_nodemask,
> +                .rebind = mpol_rebind_preferred,
> +        },
>  };
>
>  static int migrate_page_add(struct page *page, struct list_head *pagelist,
> @@ -900,6 +904,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
>          case MPOL_INTERLEAVE:
>          case MPOL_PREFERRED:
>          case MPOL_PREFERRED_MANY:
> +        case MPOL_PREFERRED_STRICT:
>                  *nodes = p->nodes;
>                  break;
>          case MPOL_LOCAL:
> @@ -1781,7 +1786,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>              cpuset_nodemask_valid_mems_allowed(&policy->nodes))
>                  return &policy->nodes;
>
> -        if (mode == MPOL_PREFERRED_MANY)
> +        if (mode == MPOL_PREFERRED_MANY || mode == MPOL_PREFERRED_STRICT)
>                  return &policy->nodes;
>
>          return NULL;
> @@ -1796,7 +1801,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>   */
>  static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  {
> -        if (policy->mode == MPOL_PREFERRED) {
> +        if (policy->mode == MPOL_PREFERRED || policy->mode == MPOL_PREFERRED_STRICT) {
>                  nd = first_node(policy->nodes);
>          } else {
>                  /*
> @@ -1840,6 +1845,7 @@ unsigned int mempolicy_slab_node(void)
>
>          switch (policy->mode) {
>          case MPOL_PREFERRED:
> +        case MPOL_PREFERRED_STRICT:
>                  return first_node(policy->nodes);
>
>          case MPOL_INTERLEAVE:
> @@ -1952,7 +1958,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
>                                          huge_page_shift(hstate_vma(vma)));
>          } else {
>                  nid = policy_node(gfp_flags, *mpol, numa_node_id());
> -                if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
> +                if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY ||
> +                    mode == MPOL_PREFERRED_STRICT)
>                          *nodemask = &(*mpol)->nodes;
>          }
>          return nid;
> @@ -1986,6 +1993,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>          switch (mempolicy->mode) {
>          case MPOL_PREFERRED:
>          case MPOL_PREFERRED_MANY:
> +        case MPOL_PREFERRED_STRICT:
>          case MPOL_BIND:
>          case MPOL_INTERLEAVE:
>                  *mask = mempolicy->nodes;
> @@ -2072,6 +2080,23 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
>          return page;
>  }
>
> +static struct page *alloc_pages_preferred_strict(gfp_t gfp, unsigned int order,
> +                                                 struct mempolicy *pol)
> +{
> +        int nid;
> +        gfp_t preferred_gfp;
> +
> +        /*
> +         * With MPOL_PREFERRED_STRICT the first node in the policy nodemask
> +         * is picked as the preferred node id and the fallback allocation
> +         * is still restricted to the preferred nodes in the nodemask.
> +         */
> +        preferred_gfp = gfp | __GFP_NOWARN;
> +        preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
> +        nid = first_node(pol->nodes);
> +        return __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
> +}
> +
>  /**
>   * alloc_pages_vma - Allocate a page for a VMA.
>   * @gfp: GFP flags.
> @@ -2113,6 +2138,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
>                  goto out;
>          }
>
> +        if (pol->mode == MPOL_PREFERRED_STRICT) {
> +                page = alloc_pages_preferred_strict(gfp, order, pol);
> +                mpol_cond_put(pol);
> +                goto out;
> +        }
> +
>          if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
>                  int hpage_node = node;
>
> @@ -2193,6 +2224,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
>          else if (pol->mode == MPOL_PREFERRED_MANY)
>                  page = alloc_pages_preferred_many(gfp, order,
>                                  numa_node_id(), pol);
> +        else if (pol->mode == MPOL_PREFERRED_STRICT)
> +                page = alloc_pages_preferred_strict(gfp, order, pol);
>          else
>                  page = __alloc_pages(gfp, order,
>                                  policy_node(gfp, pol, numa_node_id()),
> @@ -2265,6 +2298,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
>          case MPOL_INTERLEAVE:
>          case MPOL_PREFERRED:
>          case MPOL_PREFERRED_MANY:
> +        case MPOL_PREFERRED_STRICT:
>                  return !!nodes_equal(a->nodes, b->nodes);
>          case MPOL_LOCAL:
>                  return true;
> @@ -2405,6 +2439,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
>                  break;
>
>          case MPOL_PREFERRED:
> +        case MPOL_PREFERRED_STRICT:
>                  if (node_isset(curnid, pol->nodes))
>                          goto out;
>                  polnid = first_node(pol->nodes);
> @@ -2866,6 +2901,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
>                  err = 0;
>                  goto out;
>          case MPOL_PREFERRED_MANY:
> +        case MPOL_PREFERRED_STRICT:
>          case MPOL_BIND:
>                  /*
>                   * Insist on a nodelist
> @@ -2953,6 +2989,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
>                  break;
>          case MPOL_PREFERRED:
>          case MPOL_PREFERRED_MANY:
> +        case MPOL_PREFERRED_STRICT:
>          case MPOL_BIND:
>          case MPOL_INTERLEAVE:
>                  nodes = pol->nodes;
> --
> 2.31.1

--
Michal Hocko
SUSE Labs
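[Editorial note: for context, the following is a minimal userspace sketch of how the proposed mode could be exercised via set_mempolicy(2), as described in the patch description above. It is not part of the patch. MPOL_PREFERRED_STRICT is not in any released uapi headers, so the value below is only what the patched enum implies (it follows MPOL_PREFERRED_MANY); the node numbers are purely illustrative, and the raw syscall is used because libnuma knows nothing about the new mode.]

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Hypothetical value implied by the patched uapi enum, where
 * MPOL_PREFERRED_STRICT comes right after MPOL_PREFERRED_MANY (== 5).
 * Not present in released <linux/mempolicy.h>.
 */
#ifndef MPOL_PREFERRED_STRICT
#define MPOL_PREFERRED_STRICT 6
#endif

int main(void)
{
        /*
         * Illustrative nodemask covering nodes 2 and 3.  Per the patch,
         * the first (lowest-numbered) node in the mask is the preferred
         * node; fallback is restricted to the other nodes in the mask.
         */
        unsigned long nodemask = (1UL << 2) | (1UL << 3);

        /* No glibc wrapper assumed; invoke set_mempolicy(2) directly. */
        if (syscall(SYS_set_mempolicy, MPOL_PREFERRED_STRICT,
                    &nodemask, sizeof(nodemask) * 8) != 0) {
                perror("set_mempolicy(MPOL_PREFERRED_STRICT)");
                return 1;
        }

        /*
         * Anonymous memory faulted in from now on follows the policy:
         * allocations try node 2 first, may fall back to node 3, and
         * otherwise fail (SIGSEGV or OOM kill, as with MPOL_BIND)
         * rather than spilling to nodes outside the mask.
         */
        size_t len = 64UL << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(buf, 0, len);    /* touch the pages under the policy */

        munmap(buf, len);
        return 0;
}

The same mode could equally be applied to a single mapping with mbind(2) instead of the per-task set_mempolicy(2) call shown here.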