On Wed, 4 Jan 2017 11:10:49 +0000
Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> wrote:

> This patch adds a new page allocator interface via alloc_pages_bulk,
> __alloc_pages_bulk and __alloc_pages_bulk_nodemask. A caller requests
> a number of pages to be allocated and added to a list. They can be
> freed in bulk using free_hot_cold_page_list.
> 
> The API is not guaranteed to return the requested number of pages and
> may fail if the preferred allocation zone has limited free memory,
> the cpuset changes during the allocation or page debugging decides
> to fail an allocation. It's up to the caller to request more pages
> in batch if necessary.

I generally like it, thanks! :-)

> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> ---
>  include/linux/gfp.h | 23 ++++++++++++++
>  mm/page_alloc.c     | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 115 insertions(+)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 4175dca4ac39..1da3a9a48701 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -433,6 +433,29 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
>  	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
>  }
>  
> +unsigned long
> +__alloc_pages_bulk_nodemask(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, nodemask_t *nodemask,
> +			unsigned long nr_pages, struct list_head *alloc_list);
> +
> +static inline unsigned long
> +__alloc_pages_bulk(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, unsigned long nr_pages,
> +			struct list_head *list)
> +{
> +	return __alloc_pages_bulk_nodemask(gfp_mask, order, zonelist, NULL,
> +			nr_pages, list);
> +}
> +
> +static inline unsigned long
> +alloc_pages_bulk(gfp_t gfp_mask, unsigned int order,
> +			unsigned long nr_pages, struct list_head *list)
> +{
> +	int nid = numa_mem_id();
> +	return __alloc_pages_bulk(gfp_mask, order,
> +			node_zonelist(nid, gfp_mask), nr_pages, list);
> +}
> +
>  /*
>   * Allocate pages, preferring the node given as nid. The node must be valid and
>   * online. For more general interface, see alloc_pages_node().
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 01b09f9da288..307ad4299dec 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3887,6 +3887,98 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
>  EXPORT_SYMBOL(__alloc_pages_nodemask);
>  
>  /*
> + * This is a batched version of the page allocator that attempts to
> + * allocate nr_pages quickly from the preferred zone and add them to list.
> + * Note that there is no guarantee that nr_pages will be allocated although
> + * every effort will be made to allocate at least one. Unlike the core
> + * allocator, no special effort is made to recover from transient
> + * failures caused by changes in cpusets.
> + */
> +unsigned long
> +__alloc_pages_bulk_nodemask(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, nodemask_t *nodemask,
> +			unsigned long nr_pages, struct list_head *alloc_list)
> +{
> +	struct page *page;
> +	unsigned long alloced = 0;
> +	unsigned int alloc_flags = ALLOC_WMARK_LOW;
> +	struct zone *zone;
> +	struct per_cpu_pages *pcp;
> +	struct list_head *pcp_list;
> +	unsigned long flags;
> +	int migratetype;
> +	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
> +	struct alloc_context ac = { };
> +	bool cold = ((gfp_mask & __GFP_COLD) != 0);
> +
> +	/* If there are already pages on the list, don't bother */
> +	if (!list_empty(alloc_list))
> +		return 0;
> +
> +	/* Only handle bulk allocation of order-0 */
> +	if (order)
> +		goto failed;
> +
> +	gfp_mask &= gfp_allowed_mask;
> +	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
> +		return 0;
> +
> +	finalise_ac(gfp_mask, order, &ac);
> +	if (!ac.preferred_zoneref)
> +		return 0;
> +
> +	/*
> +	 * Only attempt a batch allocation if watermarks on the preferred zone
> +	 * are safe.
> +	 */
> +	zone = ac.preferred_zoneref->zone;
> +	if (!zone_watermark_fast(zone, order, zone->watermark[ALLOC_WMARK_HIGH] + nr_pages,
> +				 zonelist_zone_idx(ac.preferred_zoneref), alloc_flags))
> +		goto failed;
> +
> +	/* Attempt the batch allocation */
> +	migratetype = ac.migratetype;
> +
> +	local_irq_save(flags);

It would be a win if we could either use local_irq_{disable,enable} or
preempt_{disable,enable} here, by dictating it can only be used from
irq-safe context (like you did in patch 3).

> +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> +	pcp_list = &pcp->lists[migratetype];
> +
> +	while (nr_pages) {
> +		page = __rmqueue_pcplist(zone, order, gfp_mask, migratetype,
> +					 cold, pcp, pcp_list);
> +		if (!page)
> +			break;
> +
> +		nr_pages--;
> +		alloced++;
> +		list_add(&page->lru, alloc_list);
> +	}
> +
> +	if (!alloced) {
> +		local_irq_restore(flags);
> +		preempt_enable();

The preempt_enable() here looks wrong; there is no matching
preempt_disable() in this function (only local_irq_save() above).

> +		goto failed;
> +	}
> +
> +	__count_zid_vm_events(PGALLOC, zone_idx(zone), alloced);
> +	zone_statistics(zone, zone, gfp_mask);
> +
> +	local_irq_restore(flags);
> +
> +	return alloced;
> +
> +failed:
> +	page = __alloc_pages_nodemask(gfp_mask, order, zonelist, nodemask);
> +	if (page) {
> +		alloced++;
> +		list_add(&page->lru, alloc_list);
> +	}
> +
> +	return alloced;
> +}
> +EXPORT_SYMBOL(__alloc_pages_bulk_nodemask);
> +
> +/*
>   * Common helper functions.
>   */
>  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
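
For illustration, a minimal sketch (not part of the patch) of how a caller
might use the proposed interface: allocate a batch of order-0 pages with
alloc_pages_bulk() and hand them back via the existing
free_hot_cold_page_list(). The function name demo_bulk_alloc() and the batch
size of 16 are made up for this example.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/mm.h>

/* Hypothetical caller, only to show the intended calling convention. */
static void demo_bulk_alloc(void)
{
	LIST_HEAD(page_list);
	unsigned long allocated;
	struct page *page;

	/*
	 * May return fewer than 16 pages (or none at all); the caller is
	 * expected to retry or fall back to single-page allocations.
	 */
	allocated = alloc_pages_bulk(GFP_KERNEL, 0, 16, &page_list);
	if (!allocated)
		return;

	/* Pages are linked via page->lru, as done in the patch. */
	list_for_each_entry(page, &page_list, lru) {
		/* ... use page_address(page) ... */
	}

	/* Return the whole batch in one call (cold == false). */
	free_hot_cold_page_list(&page_list, false);
}

The point of the sketch is simply that the caller owns the list, must cope
with a short allocation, and can free the batch with one call instead of a
per-page loop.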