Re: [RFC PATCH] mm/mempolicy: add MPOL_PREFERRED_STRICT memory policy

[Cc linux-api]

On Wed 13-10-21 15:15:39, Aneesh Kumar K.V wrote:
> This mempolicy mode can be used with either the set_mempolicy(2)
> or mbind(2) interface.  Like MPOL_PREFERRED, it allows an
> application to set a preference for a node from which the kernel
> will fulfill memory allocation requests.  Unlike MPOL_PREFERRED, it
> takes a set of nodes: the remaining nodes in the nodemask are used
> as fallback allocation nodes if memory is not available on the
> preferred node.  Unlike MPOL_PREFERRED_MANY, it will not fall back
> to all nodes in the system.  Like MPOL_BIND, it operates over a set
> of nodes and will cause a SIGSEGV or invoke the OOM killer if memory
> is not available on any of those nodes.
> 
> This patch lets applications hint a preferred memory allocation node
> and fall back to _only_ a specific set of nodes if memory is not
> available on the preferred node.  Fallback allocation is attempted
> from the node in the nodemask that is nearest to the preferred node.
> 
> This new memory policy gives applications explicit control over
> allocation from slow memory and avoids the default fallback to slow
> memory NUMA nodes.  The difference from MPOL_BIND is the ability to
> specify a preferred node, which is the first node in the nodemask
> argument passed.
> 
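For illustration, here is a minimal userspace sketch of setting this
policy task-wide via set_mempolicy(2).  MPOL_PREFERRED_STRICT only
exists with this patch applied, so the value 6 below is an assumption
based on the uapi enum ordering further down; numaif.h is libnuma's
syscall wrapper header (link with -lnuma), and nodes 0 and 2 are just
an example topology:

#include <numaif.h>	/* set_mempolicy(); link with -lnuma */
#include <stdio.h>
#include <stdlib.h>

#ifndef MPOL_PREFERRED_STRICT
#define MPOL_PREFERRED_STRICT 6	/* assumed: follows MPOL_PREFERRED_MANY */
#endif

int main(void)
{
	/* bits 0 and 2 set: node 0 is first_node() and thus preferred */
	unsigned long nodemask = (1UL << 0) | (1UL << 2);

	if (set_mempolicy(MPOL_PREFERRED_STRICT, &nodemask,
			  sizeof(nodemask) * 8) == -1) {
		perror("set_mempolicy");
		exit(EXIT_FAILURE);
	}
	/* further allocations prefer node 0 and fall back only to node 2 */
	return 0;
}
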
> Cc: Ben Widawsky <ben.widawsky@xxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: Feng Tang <feng.tang@xxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxxxx>
> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Cc: Randy Dunlap <rdunlap@xxxxxxxxxxxxx>
> Cc: Vlastimil Babka <vbabka@xxxxxxx>
> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Cc: Huang Ying <ying.huang@xxxxxxxxx>
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
> ---
>  .../admin-guide/mm/numa_memory_policy.rst     |  7 +++
>  include/uapi/linux/mempolicy.h                |  1 +
>  mm/mempolicy.c                                | 43 +++++++++++++++++--
>  3 files changed, 48 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
> index 64fd0ba0d057..4dfdcbd22d67 100644
> --- a/Documentation/admin-guide/mm/numa_memory_policy.rst
> +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
> @@ -252,6 +252,13 @@ MPOL_PREFERRED_MANY
>  	can fall back to all existing numa nodes. This is effectively
>  	MPOL_PREFERRED allowed for a mask rather than a single node.
>  
> +MPOL_PREFERRED_STRICT
> +	This mode specifies that the allocation should be attempted
> +	from the first node specified in the nodemask of the policy.
> +	If that allocation fails, the kernel will search other nodes
> +	in the nodemask, in order of increasing distance from the
> +	preferred node based on information provided by the platform firmware.
> +
>  NUMA memory policy supports the following optional mode flags:
>  
>  MPOL_F_STATIC_NODES
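The "order of increasing distance ... based on information provided by
the platform firmware" above refers to the NUMA distance table (e.g.
the ACPI SLIT), which the kernel exports through sysfs.  A quick
sketch to dump the distances that ordering would be based on, here for
node 0:

#include <stdio.h>

int main(void)
{
	char buf[256];
	/* standard sysfs node interface; one row of the distance table */
	FILE *f = fopen("/sys/devices/system/node/node0/distance", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("distances from node 0: %s", buf);
	fclose(f);
	return 0;
}
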
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 046d0ccba4cd..8aa1d1963235 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -23,6 +23,7 @@ enum {
>  	MPOL_INTERLEAVE,
>  	MPOL_LOCAL,
>  	MPOL_PREFERRED_MANY,
> +	MPOL_PREFERRED_STRICT,
>  	MPOL_MAX,	/* always last member of enum */
>  };
>  
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 1592b081c58e..59080dd1ea69 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -407,6 +407,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
>  		.create = mpol_new_nodemask,
>  		.rebind = mpol_rebind_preferred,
>  	},
> +	[MPOL_PREFERRED_STRICT] = {
> +		.create = mpol_new_nodemask,
> +		.rebind = mpol_rebind_preferred,
> +	},
>  };
>  
>  static int migrate_page_add(struct page *page, struct list_head *pagelist,
> @@ -900,6 +904,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
>  	case MPOL_INTERLEAVE:
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  		*nodes = p->nodes;
>  		break;
>  	case MPOL_LOCAL:
> @@ -1781,7 +1786,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>  		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
>  		return &policy->nodes;
>  
> -	if (mode == MPOL_PREFERRED_MANY)
> +	if (mode == MPOL_PREFERRED_MANY || mode == MPOL_PREFERRED_STRICT)
>  		return &policy->nodes;
>  
>  	return NULL;
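With this change policy_nodemask() hands the policy's nodemask to the
page allocator for the new mode as well; together with policy_node()
in the next hunk picking first_node(), the allocation path reduces to
roughly the following (a simplified sketch of the combined effect, not
the literal kernel code):

/* simplified sketch; mirrors the alloc_pages() path in this patch */
static struct page *alloc_with_policy(gfp_t gfp, unsigned int order,
				      struct mempolicy *pol)
{
	/* MPOL_PREFERRED_STRICT: first node of the mask is preferred */
	int nid = policy_node(gfp, pol, numa_node_id());
	/* a non-NULL mask restricts any fallback to the policy's nodes */
	nodemask_t *mask = policy_nodemask(gfp, pol);

	return __alloc_pages(gfp, order, nid, mask);
}
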
> @@ -1796,7 +1801,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>   */
>  static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  {
> -	if (policy->mode == MPOL_PREFERRED) {
> +	if (policy->mode == MPOL_PREFERRED || policy->mode == MPOL_PREFERRED_STRICT) {
>  		nd = first_node(policy->nodes);
>  	} else {
>  		/*
> @@ -1840,6 +1845,7 @@ unsigned int mempolicy_slab_node(void)
>  
>  	switch (policy->mode) {
>  	case MPOL_PREFERRED:
> +	case MPOL_PREFERRED_STRICT:
>  		return first_node(policy->nodes);
>  
>  	case MPOL_INTERLEAVE:
> @@ -1952,7 +1958,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
>  					huge_page_shift(hstate_vma(vma)));
>  	} else {
>  		nid = policy_node(gfp_flags, *mpol, numa_node_id());
> -		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
> +		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY ||
> +			mode == MPOL_PREFERRED_STRICT)
>  			*nodemask = &(*mpol)->nodes;
>  	}
>  	return nid;
> @@ -1986,6 +1993,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>  	switch (mempolicy->mode) {
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  	case MPOL_BIND:
>  	case MPOL_INTERLEAVE:
>  		*mask = mempolicy->nodes;
> @@ -2072,6 +2080,23 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
>  	return page;
>  }
>  
> +static struct page *alloc_pages_preferred_strict(gfp_t gfp, unsigned int order,
> +						 struct mempolicy *pol)
> +{
> +	int nid;
> +	gfp_t preferred_gfp;
> +
> +	/*
> +	 * With MPOL_PREFERRED_STRICT first node in the policy nodemask
> +	 * is picked as the preferred node id and the fallback allocation
> +	 * is still restricted to the preferred nodes in the nodemask.
> +	 */
> +	preferred_gfp = gfp | __GFP_NOWARN;
> +	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
> +	nid = first_node(pol->nodes);
> +	return __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
> +}
> +
>  /**
>   * alloc_pages_vma - Allocate a page for a VMA.
>   * @gfp: GFP flags.
> @@ -2113,6 +2138,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
>  		goto out;
>  	}
>  
> +	if (pol->mode == MPOL_PREFERRED_STRICT) {
> +		page = alloc_pages_preferred_strict(gfp, order, pol);
> +		mpol_cond_put(pol);
> +		goto out;
> +	}
> +
>  	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
>  		int hpage_node = node;
>  
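For the per-VMA side handled in the hunk above, a hedged sketch of the
userspace call that would exercise this path via mbind(2) (again, the
value 6 for the new mode is an assumption from the uapi enum in this
patch):

#include <numaif.h>	/* mbind(); link with -lnuma */
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef MPOL_PREFERRED_STRICT
#define MPOL_PREFERRED_STRICT 6	/* assumed value, see uapi hunk above */
#endif

int main(void)
{
	size_t len = 4UL << 20;	/* 4 MiB anonymous mapping */
	unsigned long nodemask = (1UL << 0) | (1UL << 2);
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		exit(EXIT_FAILURE);
	}
	if (mbind(p, len, MPOL_PREFERRED_STRICT, &nodemask,
		  sizeof(nodemask) * 8, 0) == -1) {
		perror("mbind");
		exit(EXIT_FAILURE);
	}
	/* faulting in pages now goes through alloc_pages_vma() above */
	((char *)p)[0] = 1;
	return 0;
}
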
> @@ -2193,6 +2224,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
>  	else if (pol->mode == MPOL_PREFERRED_MANY)
>  		page = alloc_pages_preferred_many(gfp, order,
>  				numa_node_id(), pol);
> +	else if (pol->mode == MPOL_PREFERRED_STRICT)
> +		page = alloc_pages_preferred_strict(gfp, order, pol);
>  	else
>  		page = __alloc_pages(gfp, order,
>  				policy_node(gfp, pol, numa_node_id()),
> @@ -2265,6 +2298,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  	case MPOL_INTERLEAVE:
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  		return !!nodes_equal(a->nodes, b->nodes);
>  	case MPOL_LOCAL:
>  		return true;
> @@ -2405,6 +2439,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
>  		break;
>  
>  	case MPOL_PREFERRED:
> +	case MPOL_PREFERRED_STRICT:
>  		if (node_isset(curnid, pol->nodes))
>  			goto out;
>  		polnid = first_node(pol->nodes);
> @@ -2866,6 +2901,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
>  			err = 0;
>  		goto out;
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  	case MPOL_BIND:
>  		/*
>  		 * Insist on a nodelist
> @@ -2953,6 +2989,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
>  		break;
>  	case MPOL_PREFERRED:
>  	case MPOL_PREFERRED_MANY:
> +	case MPOL_PREFERRED_STRICT:
>  	case MPOL_BIND:
>  	case MPOL_INTERLEAVE:
>  		nodes = pol->nodes;
> -- 
> 2.31.1
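
One way to observe where pages actually landed under any of these
policies is move_pages(2) with a NULL nodes array, which only queries
placement and migrates nothing; a small sketch:

#include <numaif.h>	/* move_pages(); link with -lnuma */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *page = aligned_alloc(page_size, page_size);
	int status = -1;

	if (!page) {
		perror("aligned_alloc");
		exit(EXIT_FAILURE);
	}
	memset(page, 0, page_size);	/* fault the page in */
	/* pid 0 == calling process; nodes == NULL == query only */
	if (move_pages(0, 1, &page, NULL, &status, 0) == -1) {
		perror("move_pages");
		exit(EXIT_FAILURE);
	}
	printf("page resides on node %d\n", status);
	return 0;
}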

-- 
Michal Hocko
SUSE Labs


