On Wed, 23 Nov 2011, Miao Xie wrote:

> Yes, what you said is right.
> But in fact, on the kernel where MAX_NUMNODES <= BITS_PER_LONG, the same problem
> can also occur.
>
>       task1                   task1's mems    task2
>       alloc page              2-3
>         alloc on node1? NO    2-3
>                               2-3             change mems from 2-3 to 1-2
>                               1-2               rebind task1's mpol
>                               1-2                 set new bits
>                               1-2             change mems from 0-1 to 0
>                               1-2               rebind task1's mpol
>                               0-1                 set new bits
>         alloc on node2? NO    0-1
>         ...
>       can't alloc page
>         goto oom

(A simplified user-space sketch of this interleaving is appended at the end
of this mail.)

One of the major reasons changing cpuset.mems can take >30s is lengthy delays
in the page allocator: it continuously loops, attempting reclaim or killing a
thread and then retrying the allocation, all while get_mems_allowed() is held.
I think we should be able to optimize this by dropping get_mems_allowed() when
it's not required and moving it into get_page_from_freelist(), which is the
only thing that cares about cpuset_zone_allowed_softwall().  Something like
this:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1643,17 +1643,29 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
                struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
-               struct zone *preferred_zone, int migratetype)
+               struct zone **preferred_zone, int migratetype)
 {
        struct zoneref *z;
        struct page *page = NULL;
        int classzone_idx;
        struct zone *zone;
-       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+       nodemask_t *allowednodes = NULL;
        int zlc_active = 0;             /* set if using zonelist_cache */
        int did_zlc_setup = 0;          /* just call zlc_setup() one time */
 
-       classzone_idx = zone_idx(preferred_zone);
+       get_mems_allowed();
+       /*
+        * preferred_zone must come from an allowed node if the allocation is
+        * constrained to either a mempolicy (nodemask != NULL) or otherwise
+        * limited by cpusets.
+        */
+       if (alloc_flags & ALLOC_CPUSET)
+               allowednodes = nodemask ? : &cpuset_current_mems_allowed;
+
+       first_zones_zonelist(zonelist, high_zoneidx, allowednodes,
+                                                       preferred_zone);
+       classzone_idx = zone_idx(*preferred_zone);
+       allowednodes = NULL;
 zonelist_scan:
        /*
         * Scan zonelist, looking for a zone with enough free.
@@ -1717,7 +1729,7 @@ zonelist_scan:
                }
 
 try_this_zone:
-               page = buffered_rmqueue(preferred_zone, zone, order,
+               page = buffered_rmqueue(*preferred_zone, zone, order,
                                                gfp_mask, migratetype);
                if (page)
                        break;
@@ -1731,6 +1743,7 @@ this_zone_full:
                zlc_active = 0;
                goto zonelist_scan;
        }
+       put_mems_allowed();
 
        return page;
 }
@@ -1832,7 +1845,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
-       nodemask_t *nodemask, struct zone *preferred_zone,
+       nodemask_t *nodemask, struct zone **preferred_zone,
        int migratetype)
 {
        struct page *page;
@@ -1885,13 +1898,13 @@ out:
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
-       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
        int migratetype, unsigned long *did_some_progress,
        bool sync_migration)
 {
        struct page *page;
 
-       if (!order || compaction_deferred(preferred_zone))
+       if (!order || compaction_deferred(*preferred_zone))
                return NULL;
 
        current->flags |= PF_MEMALLOC;
@@ -1909,8 +1922,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                                alloc_flags, preferred_zone,
                                migratetype);
                if (page) {
-                       preferred_zone->compact_considered = 0;
-                       preferred_zone->compact_defer_shift = 0;
+                       (*preferred_zone)->compact_considered = 0;
+                       (*preferred_zone)->compact_defer_shift = 0;
                        count_vm_event(COMPACTSUCCESS);
                        return page;
                }
@@ -1921,7 +1934,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 * but not enough to satisfy watermarks.
                 */
                count_vm_event(COMPACTFAIL);
-               defer_compaction(preferred_zone);
+               defer_compaction(*preferred_zone);
 
                cond_resched();
        }
@@ -1932,7 +1945,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
-       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
        int migratetype, unsigned long *did_some_progress,
        bool sync_migration)
 {
@@ -1944,7 +1957,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
-       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
        int migratetype, unsigned long *did_some_progress)
 {
        struct page *page = NULL;
@@ -2001,7 +2014,7 @@ retry:
 static inline struct page *
 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
-       nodemask_t *nodemask, struct zone *preferred_zone,
+       nodemask_t *nodemask, struct zone **preferred_zone,
        int migratetype)
 {
        struct page *page;
@@ -2012,7 +2025,8 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                        preferred_zone, migratetype);
 
                if (!page && gfp_mask & __GFP_NOFAIL)
-                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(*preferred_zone, BLK_RW_ASYNC,
+                                               HZ/50);
        } while (!page && (gfp_mask & __GFP_NOFAIL));
 
        return page;
@@ -2075,7 +2089,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
-       nodemask_t *nodemask, struct zone *preferred_zone,
+       nodemask_t *nodemask, struct zone **preferred_zone,
        int migratetype)
 {
        const gfp_t wait = gfp_mask & __GFP_WAIT;
@@ -2110,7 +2124,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 restart:
        if (!(gfp_mask & __GFP_NO_KSWAPD))
                wake_all_kswapd(order, zonelist, high_zoneidx,
-                                               zone_idx(preferred_zone));
+                                               zone_idx(*preferred_zone));
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -2119,14 +2133,6 @@ restart:
         */
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
-       /*
-        * Find the true preferred zone if the allocation is unconstrained by
-        * cpusets.
-        */
-       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
-               first_zones_zonelist(zonelist, high_zoneidx, NULL,
-                                       &preferred_zone);
-
 rebalance:
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
@@ -2220,7 +2226,7 @@ rebalance:
        pages_reclaimed += did_some_progress;
        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
-               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(*preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
        } else {
                /*
@@ -2277,25 +2283,17 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
-       get_mems_allowed();
-       /* The preferred zone is used for statistics later */
-       first_zones_zonelist(zonelist, high_zoneidx,
-                               nodemask ? : &cpuset_current_mems_allowed,
-                               &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
-
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
-                       preferred_zone, migratetype);
-       if (unlikely(!page))
+                       &preferred_zone, migratetype);
+       if (unlikely(!page)) {
+               if (!preferred_zone)
+                       return NULL;
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
-                               preferred_zone, migratetype);
-       put_mems_allowed();
+                               &preferred_zone, migratetype);
+       }
 
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
        return page;

This would significantly reduce the amount of time it takes to write to
cpuset.mems because we drop get_mems_allowed() between allocation attempts.
We really, really want to do this anyway because it's possible that a cpuset
is being expanded to a larger set of nodes and those new nodes are
inaccessible to concurrent memory allocations because the page allocator is
holding get_mems_allowed() while looping, trying to find more memory.

Comments?
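
To make Miao's interleaving above easier to play with, here is a minimal
user-space sketch of it.  This is only an analogy under simplified
assumptions (a plain bitmask plus C11 atomics standing in for mems_allowed
and the mpol rebind; all names here are made up), not the kernel code path:
the "allocator" re-reads the mask before each per-node check while the
"updater" keeps moving it between non-empty sets, so every check can fail
even though the mask is never empty.

/* race_sketch.c: toy analogy only, not kernel code.  Build: gcc -pthread */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NODE(n)         (1UL << (n))
#define MAX_NODE        4

/* stand-in for task1's mems_allowed, initially nodes 2-3 */
static _Atomic unsigned long mems_allowed = NODE(2) | NODE(3);
static _Atomic int stop;

/* stand-in for task2 repeatedly writing cpuset.mems: 1-2, 0-1, 2-3, ... */
static void *updater(void *arg)
{
        const unsigned long masks[] = {
                NODE(1) | NODE(2), NODE(0) | NODE(1), NODE(2) | NODE(3),
        };
        int i = 0;

        (void)arg;
        while (!atomic_load(&stop)) {
                atomic_store(&mems_allowed, masks[i]);
                i = (i + 1) % 3;
        }
        return NULL;
}

/* stand-in for task1's zonelist scan: reload the mask for every node tried */
static void *allocator(void *arg)
{
        int node;

        (void)arg;
        for (node = 0; node < MAX_NODE; node++) {
                if (atomic_load(&mems_allowed) & NODE(node)) {
                        printf("alloc on node%d? YES\n", node);
                        return NULL;
                }
                printf("alloc on node%d? NO\n", node);
        }
        printf("can't alloc page, goto oom\n");
        return NULL;
}

int main(void)
{
        pthread_t u, a;

        pthread_create(&u, NULL, updater, NULL);
        pthread_create(&a, NULL, allocator, NULL);
        pthread_join(a, NULL);
        atomic_store(&stop, 1);
        pthread_join(u, NULL);
        return 0;
}

Whether a given run actually hits the oom path is timing dependent, but it
shows why checking nodes one at a time against a mask that can change between
checks is fragile even when MAX_NUMNODES <= BITS_PER_LONG.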
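
On the cpuset.mems write latency itself, here is a toy model (again an
analogy, not the kernel implementation: the writer below just spins on a
flag, and the fake_* helpers are made up) of why the scope of the
get_mems_allowed() critical section is what determines how long a writer can
be held off:

/* scope_sketch.c: toy model of reader critical-section scope vs. writer
 * latency.  Build: gcc -pthread */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* stand-in for a task sitting between get_mems_allowed()/put_mems_allowed() */
static _Atomic int reader_in_critical_section;

static void fake_get_mems_allowed(void)
{
        atomic_store(&reader_in_critical_section, 1);
}

static void fake_put_mems_allowed(void)
{
        atomic_store(&reader_in_critical_section, 0);
}

/* stand-in for a write to cpuset.mems: it can't finish while a reader holds on */
static void *cpuset_mems_writer(void *arg)
{
        (void)arg;
        while (atomic_load(&reader_in_critical_section))
                usleep(1000);
        puts("cpuset.mems write finished");
        return NULL;
}

int main(void)
{
        pthread_t w;

        /* Today: one critical section wraps the whole allocation, including
         * direct reclaim and the oom killer, so the writer stalls for all of
         * it. */
        fake_get_mems_allowed();
        pthread_create(&w, NULL, cpuset_mems_writer, NULL);
        sleep(3);               /* pretend this is the reclaim/oom loop */
        fake_put_mems_allowed();
        pthread_join(w, NULL);

        /* Proposed: take and drop it around each zonelist scan instead, so
         * the writer gets a window between allocation attempts. */
        fake_get_mems_allowed();
        /* ... one get_page_from_freelist() pass ... */
        fake_put_mems_allowed();

        return 0;
}

With the patch above, the second pattern is what the page allocator would do:
a writer to cpuset.mems only has to wait out a single zonelist scan rather
than reclaim, compaction, or the oom killer.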