On Wed, 4 Jan 2017 11:10:49 +0000
Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> wrote:

> This patch adds a new page allocator interface via alloc_pages_bulk,
> __alloc_pages_bulk and __alloc_pages_bulk_nodemask. A caller requests
> a number of pages to be allocated and added to a list. They can be
> freed in bulk using free_hot_cold_page_list.
> 
> The API is not guaranteed to return the requested number of pages and
> may fail if the preferred allocation zone has limited free memory,
> the cpuset changes during the allocation or page debugging decides
> to fail an allocation. It's up to the caller to request more pages
> in batch if necessary.

I generally like it, thanks! :-)

> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> ---
>  include/linux/gfp.h | 23 ++++++++++++++
>  mm/page_alloc.c     | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 115 insertions(+)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 4175dca4ac39..1da3a9a48701 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -433,6 +433,29 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
>  	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
>  }
>  
> +unsigned long
> +__alloc_pages_bulk_nodemask(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, nodemask_t *nodemask,
> +			unsigned long nr_pages, struct list_head *alloc_list);
> +
> +static inline unsigned long
> +__alloc_pages_bulk(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, unsigned long nr_pages,
> +			struct list_head *list)
> +{
> +	return __alloc_pages_bulk_nodemask(gfp_mask, order, zonelist, NULL,
> +			nr_pages, list);
> +}
> +
> +static inline unsigned long
> +alloc_pages_bulk(gfp_t gfp_mask, unsigned int order,
> +			unsigned long nr_pages, struct list_head *list)
> +{
> +	int nid = numa_mem_id();
> +	return __alloc_pages_bulk(gfp_mask, order,
> +			node_zonelist(nid, gfp_mask), nr_pages, list);
> +}
> +
>  /*
>   * Allocate pages, preferring the node given as nid. The node must be valid and
>   * online. For more general interface, see alloc_pages_node().
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 01b09f9da288..307ad4299dec 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3887,6 +3887,98 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
>  EXPORT_SYMBOL(__alloc_pages_nodemask);
>  
>  /*
> + * This is a batched version of the page allocator that attempts to
> + * allocate nr_pages quickly from the preferred zone and add them to list.
> + * Note that there is no guarantee that nr_pages will be allocated although
> + * every effort will be made to allocate at least one. Unlike the core
> + * allocator, no special effort is made to recover from transient
> + * failures caused by changes in cpusets.
> + */
> +unsigned long
> +__alloc_pages_bulk_nodemask(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, nodemask_t *nodemask,
> +			unsigned long nr_pages, struct list_head *alloc_list)
> +{
> +	struct page *page;
> +	unsigned long alloced = 0;
> +	unsigned int alloc_flags = ALLOC_WMARK_LOW;
> +	struct zone *zone;
> +	struct per_cpu_pages *pcp;
> +	struct list_head *pcp_list;
> +	unsigned long flags;
> +	int migratetype;
> +	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
> +	struct alloc_context ac = { };
> +	bool cold = ((gfp_mask & __GFP_COLD) != 0);
> +
> +	/* If there are already pages on the list, don't bother */
> +	if (!list_empty(alloc_list))
> +		return 0;
> +
> +	/* Only handle bulk allocation of order-0 */
> +	if (order)
> +		goto failed;
> +
> +	gfp_mask &= gfp_allowed_mask;
> +	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
> +		return 0;
> +
> +	finalise_ac(gfp_mask, order, &ac);
> +	if (!ac.preferred_zoneref)
> +		return 0;
> +
> +	/*
> +	 * Only attempt a batch allocation if watermarks on the preferred zone
> +	 * are safe.
> +	 */
> +	zone = ac.preferred_zoneref->zone;
> +	if (!zone_watermark_fast(zone, order, zone->watermark[ALLOC_WMARK_HIGH] + nr_pages,
> +				 zonelist_zone_idx(ac.preferred_zoneref), alloc_flags))
> +		goto failed;
> +
> +	/* Attempt the batch allocation */
> +	migratetype = ac.migratetype;
> +
> +	local_irq_save(flags);

It would be a win if we could either use local_irq_{disable,enable} or
preempt_{disable,enable} here, by dictating it can only be used from
irq-safe context (like you did in patch 3).

> +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> +	pcp_list = &pcp->lists[migratetype];
> +
> +	while (nr_pages) {
> +		page = __rmqueue_pcplist(zone, order, gfp_mask, migratetype,
> +					 cold, pcp, pcp_list);
> +		if (!page)
> +			break;
> +
> +		nr_pages--;
> +		alloced++;
> +		list_add(&page->lru, alloc_list);
> +	}
> +
> +	if (!alloced) {
> +		local_irq_restore(flags);
> +		preempt_enable();

The preempt_enable() here looks wrong; there is no matching
preempt_disable() in this function (only local_irq_save() above).

> +		goto failed;
> +	}
> +
> +	__count_zid_vm_events(PGALLOC, zone_idx(zone), alloced);
> +	zone_statistics(zone, zone, gfp_mask);
> +
> +	local_irq_restore(flags);
> +
> +	return alloced;
> +
> +failed:
> +	page = __alloc_pages_nodemask(gfp_mask, order, zonelist, nodemask);
> +	if (page) {
> +		alloced++;
> +		list_add(&page->lru, alloc_list);
> +	}
> +
> +	return alloced;
> +}
> +EXPORT_SYMBOL(__alloc_pages_bulk_nodemask);
> +
> +/*
>   * Common helper functions.
>   */
>  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
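
For illustration, a minimal sketch (not part of the patch) of how a caller
might use the proposed interface: allocate a batch of order-0 pages with
alloc_pages_bulk() and hand them back via the existing
free_hot_cold_page_list(). The function name demo_bulk_alloc() and the batch
size of 16 are made up for this example.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/mm.h>

/* Hypothetical caller, only to show the intended calling convention. */
static void demo_bulk_alloc(void)
{
	LIST_HEAD(page_list);
	unsigned long allocated;
	struct page *page;

	/*
	 * May return fewer than 16 pages (or none at all); the caller is
	 * expected to retry or fall back to single-page allocations.
	 */
	allocated = alloc_pages_bulk(GFP_KERNEL, 0, 16, &page_list);
	if (!allocated)
		return;

	/* Pages are linked via page->lru, as done in the patch. */
	list_for_each_entry(page, &page_list, lru) {
		/* ... use page_address(page) ... */
	}

	/* Return the whole batch in one call (cold == false). */
	free_hot_cold_page_list(&page_list, false);
}

The point of the sketch is simply that the caller owns the list, must cope
with a short allocation, and can free the batch with one call instead of a
per-page loop.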