On Tue, 22 Nov 2011 19:32:40 -0800 (PST), David Rientjes wrote:

[SNIP]

> One of the major reasons why changing cpuset.mems can take >30s is because
> of lengthy delays in the page allocator because it continuously loops
> while trying reclaim or killing a thread and trying to allocate a page.
>
> I think we should be able to optimize this by dropping it when it's not
> required and moving it to get_page_from_freelist() which is the only thing
> that cares about cpuset_zone_allowed_softwall().
>
> Something like this:
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1643,17 +1643,29 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
>  static struct page *
>  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
>                 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
> -               struct zone *preferred_zone, int migratetype)
> +               struct zone **preferred_zone, int migratetype)
>  {
>         struct zoneref *z;
>         struct page *page = NULL;
>         int classzone_idx;
>         struct zone *zone;
> -       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
> +       nodemask_t *allowednodes = NULL;
>         int zlc_active = 0;             /* set if using zonelist_cache */
>         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
>
> -       classzone_idx = zone_idx(preferred_zone);
> +       get_mems_allowed();
> +       /*
> +        * preferred_zone must come from an allowed node if the allocation is
> +        * constrained to either a mempolicy (nodemask != NULL) or otherwise
> +        * limited by cpusets.
> +        */
> +       if (alloc_flags & ALLOC_CPUSET)
> +               allowednodes = nodemask ? : &cpuset_current_mems_allowed;
> +
> +       first_zones_zonelist(zonelist, high_zoneidx, allowednodes,
> +                       preferred_zone);
> +       classzone_idx = zone_idx(*preferred_zone);
> +       allowednodes = NULL;
>  zonelist_scan:
>         /*
>          * Scan zonelist, looking for a zone with enough free.
> @@ -1717,7 +1729,7 @@ zonelist_scan:
>                 }
>
>  try_this_zone:
> -               page = buffered_rmqueue(preferred_zone, zone, order,
> +               page = buffered_rmqueue(*preferred_zone, zone, order,
>                                 gfp_mask, migratetype);
>                 if (page)
>                         break;
> @@ -1731,6 +1743,7 @@ this_zone_full:
>                 zlc_active = 0;
>                 goto zonelist_scan;
>         }
> +       put_mems_allowed();
>         return page;
>  }
>
> @@ -1832,7 +1845,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
>  static inline struct page *
>  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
>         struct zonelist *zonelist, enum zone_type high_zoneidx,
> -       nodemask_t *nodemask, struct zone *preferred_zone,
> +       nodemask_t *nodemask, struct zone **preferred_zone,
>         int migratetype)
>  {
>         struct page *page;
> @@ -1885,13 +1898,13 @@ out:
>  static struct page *
>  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>         struct zonelist *zonelist, enum zone_type high_zoneidx,
> -       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> +       nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
>         int migratetype, unsigned long *did_some_progress,
>         bool sync_migration)
>  {
>         struct page *page;
>
> -       if (!order || compaction_deferred(preferred_zone))
> +       if (!order || compaction_deferred(*preferred_zone))
>                 return NULL;
>
>         current->flags |= PF_MEMALLOC;
> @@ -1909,8 +1922,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>                                 alloc_flags, preferred_zone,
>                                 migratetype);
>                 if (page) {
> -                       preferred_zone->compact_considered = 0;
> -                       preferred_zone->compact_defer_shift = 0;
> +                       (*preferred_zone)->compact_considered = 0;
> +                       (*preferred_zone)->compact_defer_shift = 0;
>                         count_vm_event(COMPACTSUCCESS);
>                         return page;
>                 }
> @@ -1921,7 +1934,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>                  * but not enough to satisfy watermarks.
>                  */
>                 count_vm_event(COMPACTFAIL);
> -               defer_compaction(preferred_zone);
> +               defer_compaction(*preferred_zone);
>
>                 cond_resched();
>         }
> @@ -1932,7 +1945,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  static inline struct page *
>  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>         struct zonelist *zonelist, enum zone_type high_zoneidx,
> -       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> +       nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
>         int migratetype, unsigned long *did_some_progress,
>         bool sync_migration)
>  {
> @@ -1944,7 +1957,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  static inline struct page *
>  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
>         struct zonelist *zonelist, enum zone_type high_zoneidx,
> -       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
> +       nodemask_t *nodemask, int alloc_flags, struct zone **preferred_zone,
>         int migratetype, unsigned long *did_some_progress)
>  {
>         struct page *page = NULL;
> @@ -2001,7 +2014,7 @@ retry:
>  static inline struct page *
>  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
>         struct zonelist *zonelist, enum zone_type high_zoneidx,
> -       nodemask_t *nodemask, struct zone *preferred_zone,
> +       nodemask_t *nodemask, struct zone **preferred_zone,
>         int migratetype)
>  {
>         struct page *page;
> @@ -2012,7 +2025,8 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
>                         preferred_zone, migratetype);
>
>                 if (!page && gfp_mask & __GFP_NOFAIL)
> -                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
> +                       wait_iff_congested(*preferred_zone, BLK_RW_ASYNC,
> +                                               HZ/50);
>         } while (!page && (gfp_mask & __GFP_NOFAIL));
>
>         return page;
> @@ -2075,7 +2089,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
>  static inline struct page *
>  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>         struct zonelist *zonelist, enum zone_type high_zoneidx,
> -       nodemask_t *nodemask, struct zone *preferred_zone,
> +       nodemask_t *nodemask, struct zone **preferred_zone,
>         int migratetype)
>  {
>         const gfp_t wait = gfp_mask & __GFP_WAIT;
> @@ -2110,7 +2124,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  restart:
>         if (!(gfp_mask & __GFP_NO_KSWAPD))
>                 wake_all_kswapd(order, zonelist, high_zoneidx,
> -                                               zone_idx(preferred_zone));
> +                                               zone_idx(*preferred_zone));
>
>         /*
>          * OK, we're below the kswapd watermark and have kicked background
> @@ -2119,14 +2133,6 @@ restart:
>          */
>         alloc_flags = gfp_to_alloc_flags(gfp_mask);
>
> -       /*
> -        * Find the true preferred zone if the allocation is unconstrained by
> -        * cpusets.
> -        */
> -       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
> -               first_zones_zonelist(zonelist, high_zoneidx, NULL,
> -                               &preferred_zone);
> -
>  rebalance:
>         /* This is the last chance, in general, before the goto nopage. */
>         page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
> @@ -2220,7 +2226,7 @@ rebalance:
>         pages_reclaimed += did_some_progress;
>         if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
>                 /* Wait for some write requests to complete then retry */
> -               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
> +               wait_iff_congested(*preferred_zone, BLK_RW_ASYNC, HZ/50);
>                 goto rebalance;
>         } else {
>                 /*
> @@ -2277,25 +2283,17 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
>         if (unlikely(!zonelist->_zonerefs->zone))
>                 return NULL;
>
> -       get_mems_allowed();
> -       /* The preferred zone is used for statistics later */
> -       first_zones_zonelist(zonelist, high_zoneidx,
> -                               nodemask ? : &cpuset_current_mems_allowed,
> -                               &preferred_zone);
> -       if (!preferred_zone) {
> -               put_mems_allowed();
> -               return NULL;
> -       }
>
>         /* First allocation attempt */
>         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
>                         zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
> -                       preferred_zone, migratetype);
> -       if (unlikely(!page))
> +                       &preferred_zone, migratetype);
> +       if (unlikely(!page)) {
> +               if (!preferred_zone)
> +                       return NULL;
>                 page = __alloc_pages_slowpath(gfp_mask, order,
>                                 zonelist, high_zoneidx, nodemask,
> -                               preferred_zone, migratetype);
> -               put_mems_allowed();
> +                               &preferred_zone, migratetype);
> +       }
>
>         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
>         return page;
>
> This would significantly reduce the amount of time that it takes to write
> to cpuset.mems because we drop get_mems_allowed() between allocation
> attempts.  We really, really want to do this anyway because it's possible
> that a cpuset is being expanded to a larger set of nodes and they are
> inaccessible to concurrent memory allocations because the page allocator
> is holding get_mems_allowed() while looping and trying to find more
> memory.
>
> Comments?

This is a good idea.  But I worry that OOM will be hit more easily, because
we still do direct reclaim and compaction according to mems_allowed, which
may have changed underneath the task by the time those attempts fail.

I think we can fix that by using a sequence counter so that a task notices
that its mems_allowed has changed and then retries direct reclaim and
compaction (a rough sketch of this idea follows at the end of this message).
The other option is to make a task ignore its mems_allowed when it does
direct reclaim and compaction for the second time.  Either way, we can drop
get_mems_allowed() from as many places as possible.

Thanks
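
To make the first alternative concrete, here is a minimal, self-contained
sketch of the sequence-counter idea in plain C11 (userspace atomics rather
than the kernel's primitives; every name in it, such as mems_allowed_seq,
cpuset_change_mems, try_alloc and alloc_with_retry, is invented for the
example, and locking of the nodemask itself is omitted):

#include <stdatomic.h>
#include <stdlib.h>

/* Simplified stand-in for a task: a nodemask plus a change sequence. */
struct task {
        atomic_uint mems_allowed_seq;   /* bumped whenever mems_allowed is rewritten */
        unsigned long mems_allowed;     /* nodemask, reduced to a bitmap for the sketch */
};

/* Writer side (cpuset.mems update): install the new mask, then bump the sequence. */
void cpuset_change_mems(struct task *tsk, unsigned long new_mask)
{
        tsk->mems_allowed = new_mask;
        atomic_fetch_add_explicit(&tsk->mems_allowed_seq, 1, memory_order_release);
}

/* Stand-in for one allocation attempt constrained by the current mems_allowed. */
static void *try_alloc(struct task *tsk, size_t size)
{
        if (!tsk->mems_allowed)         /* no usable nodes: the attempt fails */
                return NULL;
        return malloc(size);
}

/*
 * Reader side: sample the sequence before the constrained attempt (which in
 * the real allocator would include direct reclaim and compaction).  If the
 * attempt fails and the sequence has moved, mems_allowed changed underneath
 * us, so retry with the new mask instead of treating the failure as OOM.
 */
void *alloc_with_retry(struct task *tsk, size_t size)
{
        unsigned int seq;
        void *p;

        do {
                seq = atomic_load_explicit(&tsk->mems_allowed_seq,
                                           memory_order_acquire);
                p = try_alloc(tsk, size);
        } while (!p && seq != atomic_load_explicit(&tsk->mems_allowed_seq,
                                                   memory_order_acquire));

        return p;       /* NULL here means failure against an up-to-date mems_allowed */
}

In the kernel the loop body would presumably be the existing slowpath, so an
allocation only falls through to the OOM killer after a pass whose
mems_allowed was not changed behind its back.  The second alternative above
could reuse the same structure by simply dropping the constraint, rather
than re-reading it, on the final retry.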