Hi Mel,

Just one question below.

On Wed, Aug 08, 2012 at 08:08:42PM +0100, Mel Gorman wrote:
> While compaction is migrating pages to free up large contiguous blocks for
> allocation it races with other allocation requests that may steal these
> blocks or break them up. This patch alters direct compaction to capture a
> suitable free page as soon as it becomes available to reduce this race. It
> uses similar logic to split_free_page() to ensure that watermarks are
> still obeyed.
> 
> Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
> Reviewed-by: Rik van Riel <riel@xxxxxxxxxx>
> ---
>  include/linux/compaction.h |    4 +--
>  include/linux/mm.h         |    1 +
>  mm/compaction.c            |   71 +++++++++++++++++++++++++++++++++++++-------
>  mm/internal.h              |    1 +
>  mm/page_alloc.c            |   63 +++++++++++++++++++++++++++++----------
>  5 files changed, 111 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index 51a90b7..5673459 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
>  extern int fragmentation_index(struct zone *zone, unsigned int order);
>  extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
>  			int order, gfp_t gfp_mask, nodemask_t *mask,
> -			bool sync);
> +			bool sync, struct page **page);
>  extern int compact_pgdat(pg_data_t *pgdat, int order);
>  extern unsigned long compaction_suitable(struct zone *zone, int order);
>  
> @@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
>  #else
>  static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
>  			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> -			bool sync)
> +			bool sync, struct page **page)
>  {
>  	return COMPACT_CONTINUE;
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b36d08c..0812e86 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -454,6 +454,7 @@ void put_pages_list(struct list_head *pages);
>  
>  void split_page(struct page *page, unsigned int order);
>  int split_free_page(struct page *page);
> +int capture_free_page(struct page *page, int alloc_order, int migratetype);
>  
>  /*
>   * Compound pages have a destructor function. Provide a
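
(Side note for anyone following along: the "similar logic to split_free_page()"
mentioned in the changelog is the watermark check at the top of that function.
From memory of the current tree, so treat this as a sketch rather than the
exact code, it is roughly:

	/* Obey watermarks as if the page was being allocated */
	watermark = low_wmark_pages(zone) + (1 << order);
	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
		return 0;

capture_free_page() below keeps that check, so a captured page is only handed
back if taking it still leaves the zone above its low watermark for that
order.)
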
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 95ca967..63af8d2 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -50,6 +50,41 @@ static inline bool migrate_async_suitable(int migratetype)
>  	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
>  }
>  
> +static void compact_capture_page(struct compact_control *cc)
> +{
> +	unsigned long flags;
> +	int mtype;
> +
> +	if (!cc->page || *cc->page)
> +		return;
> +
> +	/* Speculatively examine the free lists without zone lock */
> +	for (mtype = 0; mtype < MIGRATE_PCPTYPES; mtype++) {
> +		int order;
> +		for (order = cc->order; order < MAX_ORDER; order++) {
> +			struct page *page;
> +			struct free_area *area;
> +			area = &(cc->zone->free_area[order]);
> +			if (list_empty(&area->free_list[mtype]))
> +				continue;
> +
> +			/* Take the lock and attempt capture of the page */
> +			spin_lock_irqsave(&cc->zone->lock, flags);
> +			if (!list_empty(&area->free_list[mtype])) {
> +				page = list_entry(area->free_list[mtype].next,
> +							struct page, lru);
> +				if (capture_free_page(page, cc->order, mtype)) {
> +					spin_unlock_irqrestore(&cc->zone->lock,
> +									flags);
> +					*cc->page = page;
> +					return;
> +				}
> +			}
> +			spin_unlock_irqrestore(&cc->zone->lock, flags);
> +		}
> +	}
> +}
> +
>  /*
>   * Isolate free pages onto a private freelist. Caller must hold zone->lock.
>   * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
> @@ -561,7 +596,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
>  static int compact_finished(struct zone *zone,
>  			    struct compact_control *cc)
>  {
> -	unsigned int order;
>  	unsigned long watermark;
>  
>  	if (fatal_signal_pending(current))
> @@ -586,14 +620,22 @@ static int compact_finished(struct zone *zone,
>  		return COMPACT_CONTINUE;
>  
>  	/* Direct compactor: Is a suitable page free? */
> -	for (order = cc->order; order < MAX_ORDER; order++) {
> -		/* Job done if page is free of the right migratetype */
> -		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
> -			return COMPACT_PARTIAL;
> -
> -		/* Job done if allocation would set block type */
> -		if (order >= pageblock_order && zone->free_area[order].nr_free)
> +	if (cc->page) {
> +		/* Was a suitable page captured? */
> +		if (*cc->page)
>  			return COMPACT_PARTIAL;
> +	} else {
> +		unsigned int order;
> +		for (order = cc->order; order < MAX_ORDER; order++) {
> +			struct free_area *area = &zone->free_area[cc->order];
> +			/* Job done if page is free of the right migratetype */
> +			if (!list_empty(&area->free_list[cc->migratetype]))
> +				return COMPACT_PARTIAL;
> +
> +			/* Job done if allocation would set block type */
> +			if (cc->order >= pageblock_order && area->nr_free)
> +				return COMPACT_PARTIAL;
> +		}
>  	}
>  
>  	return COMPACT_CONTINUE;
> @@ -708,6 +750,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
>  				goto out;
>  			}
>  		}
> +
> +		/* Capture a page now if it is a suitable size */

Why do we only capture a page when the migratetype is MIGRATE_MOVABLE?
If there is a reason, it should be added as a comment.
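
To be concrete, something along these lines above the check would do (this is
only a sketch of the shape of the comment; the actual text has to come from
whatever the real reason is):

	/*
	 * Only capture a free page for MIGRATE_MOVABLE requests because
	 * <reason goes here>.
	 */
	if (cc->migratetype == MIGRATE_MOVABLE)
		compact_capture_page(cc);
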
> +		if (cc->migratetype == MIGRATE_MOVABLE)
> +			compact_capture_page(cc);
>  	}
>  
>  out:
> @@ -720,7 +766,7 @@ out:
>  
>  static unsigned long compact_zone_order(struct zone *zone,
>  					int order, gfp_t gfp_mask,
> -					bool sync)
> +					bool sync, struct page **page)
>  {
>  	struct compact_control cc = {
>  		.nr_freepages = 0,
> @@ -729,6 +775,7 @@ static unsigned long compact_zone_order(struct zone *zone,
>  		.migratetype = allocflags_to_migratetype(gfp_mask),
>  		.zone = zone,
>  		.sync = sync,
> +		.page = page,
>  	};
>  	INIT_LIST_HEAD(&cc.freepages);
>  	INIT_LIST_HEAD(&cc.migratepages);
> @@ -750,7 +797,7 @@ int sysctl_extfrag_threshold = 500;
>   */
>  unsigned long try_to_compact_pages(struct zonelist *zonelist,
>  			int order, gfp_t gfp_mask, nodemask_t *nodemask,
> -			bool sync)
> +			bool sync, struct page **page)
>  {
>  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
>  	int may_enter_fs = gfp_mask & __GFP_FS;
> @@ -770,7 +817,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
>  								nodemask) {
>  		int status;
>  
> -		status = compact_zone_order(zone, order, gfp_mask, sync);
> +		status = compact_zone_order(zone, order, gfp_mask, sync, page);
>  		rc = max(status, rc);
>  
>  		/* If a normal allocation would succeed, stop compacting */
> @@ -825,6 +872,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
>  	struct compact_control cc = {
>  		.order = order,
>  		.sync = false,
> +		.page = NULL,
>  	};
>  
>  	return __compact_pgdat(pgdat, &cc);
> @@ -835,6 +883,7 @@ static int compact_node(int nid)
>  	struct compact_control cc = {
>  		.order = -1,
>  		.sync = true,
> +		.page = NULL,
>  	};
>  
>  	return __compact_pgdat(NODE_DATA(nid), &cc);
> diff --git a/mm/internal.h b/mm/internal.h
> index 2ba87fb..9156714 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -124,6 +124,7 @@ struct compact_control {
>  	int order;			/* order a direct compactor needs */
>  	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
>  	struct zone *zone;
> +	struct page **page;		/* Page captured of requested size */
>  };
>  
>  unsigned long
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 4a4f921..adc3aa8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1374,16 +1374,11 @@ void split_page(struct page *page, unsigned int order)
>  }
>  
>  /*
> - * Similar to split_page except the page is already free. As this is only
> - * being used for migration, the migratetype of the block also changes.
> - * As this is called with interrupts disabled, the caller is responsible
> - * for calling arch_alloc_page() and kernel_map_page() after interrupts
> - * are enabled.
> - *
> - * Note: this is probably too low level an operation for use in drivers.
> - * Please consult with lkml before using this in your driver.
> + * Similar to the split_page family of functions except that the page
> + * required at the given order and being isolated now to prevent races
> + * with parallel allocators
>   */
> -int split_free_page(struct page *page)
> +int capture_free_page(struct page *page, int alloc_order, int migratetype)
>  {
>  	unsigned int order;
>  	unsigned long watermark;
> @@ -1405,10 +1400,11 @@ int split_free_page(struct page *page)
>  	rmv_page_order(page);
>  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
>  
> -	/* Split into individual pages */
> -	set_page_refcounted(page);
> -	split_page(page, order);
> +	if (alloc_order != order)
> +		expand(zone, page, alloc_order, order,
> +			&zone->free_area[order], migratetype);
>  
> +	/* Set the pageblock if the captured page is at least a pageblock */
>  	if (order >= pageblock_order - 1) {
>  		struct page *endpage = page + (1 << order) - 1;
>  		for (; page < endpage; page += pageblock_nr_pages) {
> @@ -1419,7 +1415,35 @@ int split_free_page(struct page *page)
>  		}
>  	}
>  
> -	return 1 << order;
> +	return 1UL << order;
> +}
> +
> +/*
> + * Similar to split_page except the page is already free. As this is only
> + * being used for migration, the migratetype of the block also changes.
> + * As this is called with interrupts disabled, the caller is responsible
> + * for calling arch_alloc_page() and kernel_map_page() after interrupts
> + * are enabled.
> + *
> + * Note: this is probably too low level an operation for use in drivers.
> + * Please consult with lkml before using this in your driver.
> + */
> +int split_free_page(struct page *page)
> +{
> +	unsigned int order;
> +	int nr_pages;
> +
> +	BUG_ON(!PageBuddy(page));
> +	order = page_order(page);
> +
> +	nr_pages = capture_free_page(page, order, 0);
> +	if (!nr_pages)
> +		return 0;
> +
> +	/* Split into individual pages */
> +	set_page_refcounted(page);
> +	split_page(page, order);
> +	return nr_pages;
>  }
>  
>  /*
> @@ -2065,7 +2089,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  	bool *deferred_compaction,
>  	unsigned long *did_some_progress)
>  {
> -	struct page *page;
> +	struct page *page = NULL;
>  
>  	if (!order)
>  		return NULL;
> @@ -2077,10 +2101,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  
>  	current->flags |= PF_MEMALLOC;
>  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
> -						nodemask, sync_migration);
> +						nodemask, sync_migration, &page);
>  	current->flags &= ~PF_MEMALLOC;
> -	if (*did_some_progress != COMPACT_SKIPPED) {
>  
> +	/* If compaction captured a page, prep and use it */
> +	if (page) {
> +		prep_new_page(page, order, gfp_mask);
> +		goto got_page;
> +	}
> +
> +	if (*did_some_progress != COMPACT_SKIPPED) {
>  		/* Page migration frees to the PCP lists but we want merging */
>  		drain_pages(get_cpu());
>  		put_cpu();
> @@ -2090,6 +2120,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  					alloc_flags, preferred_zone,
>  					migratetype);
>  		if (page) {
> +got_page:
>  			preferred_zone->compact_considered = 0;
>  			preferred_zone->compact_defer_shift = 0;
>  			if (order >= preferred_zone->compact_order_failed)
> -- 
> 1.7.9.2
> 

-- 
Kind regards,
Minchan Kim