On 02/14/2017 03:44 PM, Anshuman Khandual wrote:
> On 02/14/2017 01:58 PM, Vlastimil Babka wrote:
>> On 02/10/2017 11:06 AM, Anshuman Khandual wrote:
>>> This implements allocation isolation for CDM nodes in the buddy allocator
>>> by discarding CDM memory zones at all times, except when the gfp flag has
>>> __GFP_THISNODE set or the nodemask contains CDM nodes while being non-NULL
>>> (explicit allocation requests in the kernel or user process MPOL_BIND
>>> policy based requests).
>>>
>>> Signed-off-by: Anshuman Khandual <khandual@xxxxxxxxxxxxxxxxxx>
>>> ---
>>>  mm/page_alloc.c | 16 ++++++++++++++++
>>>  1 file changed, 16 insertions(+)
>>>
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 84d61bb..392c24a 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -64,6 +64,7 @@
>>>  #include <linux/page_owner.h>
>>>  #include <linux/kthread.h>
>>>  #include <linux/memcontrol.h>
>>> +#include <linux/node.h>
>>>
>>>  #include <asm/sections.h>
>>>  #include <asm/tlbflush.h>
>>> @@ -2908,6 +2909,21 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>>>  		struct page *page;
>>>  		unsigned long mark;
>>>
>>> +		/*
>>> +		 * CDM nodes get skipped if the requested gfp flag
>>> +		 * does not have __GFP_THISNODE set or the nodemask
>>> +		 * does not have any CDM nodes in case the nodemask
>>> +		 * is non NULL (explicit allocation requests from
>>> +		 * kernel or user process MPOL_BIND policy which has
>>> +		 * CDM nodes).
>>> +		 */
>>> +		if (is_cdm_node(zone->zone_pgdat->node_id)) {
>>> +			if (!(gfp_mask & __GFP_THISNODE)) {
>>> +				if (!ac->nodemask)
>>> +					continue;
>>> +			}
>>> +		}
>>
>> With the current cpuset implementation, this will have a subtle corner
>> case when allocating from a cpuset that allows the CDM node, and there
>> is no (task or vma) mempolicy applied for the allocation. In the fast
>> path (__alloc_pages_nodemask()) we'll set ac->nodemask to
>> current->mems_allowed, so your code will wrongly assume that this
>> ac->nodemask is a policy that allows the CDM node. Probably not what you
>> want?
>
> You are right, it's a problem and not what we want. We can make the
> function get_page_from_freelist() take another parameter "orig_nodemask"
> which gets passed into __alloc_pages_nodemask() in the first place. So
> inside the zonelist iterator we can compare orig_nodemask with the current
> ac.nodemask to figure out whether cpuset swapping of the nodemask happened
> and skip the CDM node if necessary. That's a viable solution IMHO.

Hello Vlastimil,

As I mentioned yesterday, this solution works and has been tested to verify
that there is no allocation leak into CDM even after cpuset_enabled() is
turned ON by changing the /sys/fs/cgroup/cpuset/ setup. The major part of
the change is just adding an additional parameter to get_page_from_freelist()
and updating all of its call sites.

- Anshuman

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 392c24a..9f41e0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2894,11 +2894,15 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  */
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
-						const struct alloc_context *ac)
+		const struct alloc_context *ac, nodemask_t *orig_mask)
 {
 	struct zoneref *z = ac->preferred_zoneref;
 	struct zone *zone;
 	struct pglist_data *last_pgdat_dirty_limit = NULL;
+	bool cpuset_fallback = false;
+
+	if (ac->nodemask != orig_mask)
+		cpuset_fallback = true;

 	/*
 	 * Scan zonelist, looking for a zone with enough free.
@@ -2919,6 +2923,9 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 		 */
 		if (is_cdm_node(zone->zone_pgdat->node_id)) {
 			if (!(gfp_mask & __GFP_THISNODE)) {
+				if (cpuset_fallback)
+					continue;
+
 				if (!ac->nodemask)
 					continue;
 			}
@@ -3066,7 +3073,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)

 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
-	const struct alloc_context *ac, unsigned long *did_some_progress)
+	const struct alloc_context *ac, unsigned long *did_some_progress, nodemask_t *orig_mask)
 {
 	struct oom_control oc = {
 		.zonelist = ac->zonelist,
@@ -3095,7 +3102,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	 * we're still under heavy pressure.
 	 */
 	page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
-					ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+					ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac, orig_mask);
 	if (page)
 		goto out;

@@ -3131,14 +3138,14 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 		if (gfp_mask & __GFP_NOFAIL) {
 			page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac, orig_mask);
 			/*
 			 * fallback to ignore cpuset restriction if our nodes
 			 * are depleted
 			 */
 			if (!page)
 				page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS, ac);
+					ALLOC_NO_WATERMARKS, ac, orig_mask);
 		}
 	}
 out:
@@ -3157,7 +3164,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		enum compact_priority prio, enum compact_result *compact_result)
+		enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
 {
 	struct page *page;

@@ -3178,7 +3185,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	 */
 	count_vm_event(COMPACTSTALL);

-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);

 	if (page) {
 		struct zone *zone = page_zone(page);
@@ -3263,7 +3270,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		enum compact_priority prio, enum compact_result *compact_result)
+		enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
 {
 	*compact_result = COMPACT_SKIPPED;
 	return NULL;
@@ -3330,7 +3337,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		unsigned long *did_some_progress)
+		unsigned long *did_some_progress, nodemask_t *orig_mask)
 {
 	struct page *page = NULL;
 	bool drained = false;

@@ -3340,7 +3347,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 		return NULL;

 retry:
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);

 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -3533,7 +3540,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)

 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
-						struct alloc_context *ac)
+		struct alloc_context *ac, nodemask_t *orig_mask)
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 	struct page *page = NULL;
@@ -3597,7 +3604,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	 * The adjusted alloc_flags might result in immediate success, so try
 	 * that first
 	 */
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 	if (page)
 		goto got_pg;

@@ -3612,7 +3619,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 		page = __alloc_pages_direct_compact(gfp_mask, order,
 						alloc_flags, ac,
 						INIT_COMPACT_PRIORITY,
-						&compact_result);
+						&compact_result, orig_mask);
 		if (page)
 			goto got_pg;

@@ -3661,7 +3668,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	}

 	/* Attempt with potentially adjusted zonelist and alloc_flags */
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 	if (page)
 		goto got_pg;

@@ -3697,13 +3704,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)

 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
-							&did_some_progress);
+					&did_some_progress, orig_mask);
 	if (page)
 		goto got_pg;

 	/* Try direct compaction and then allocating */
 	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
-					compact_priority, &compact_result);
+					compact_priority, &compact_result, orig_mask);
 	if (page)
 		goto got_pg;

@@ -3750,7 +3757,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 		goto retry_cpuset;

 	/* Reclaim has failed us, start killing things */
-	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress, orig_mask);
 	if (page)
 		goto got_pg;

@@ -3842,7 +3849,7 @@ struct page *
 	}

 	/* First allocation attempt */
-	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac, nodemask);
 	if (likely(page))
 		goto out;

@@ -3861,7 +3868,7 @@ struct page *
 	if (unlikely(ac.nodemask != nodemask))
 		ac.nodemask = nodemask;

-	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+	page = __alloc_pages_slowpath(alloc_mask, order, &ac, nodemask);

 out:
 	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
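
For anyone skimming the thread, here is a tiny user-space sketch (an
illustration only, not kernel code; skip_cdm_zone, cdm_node, gfp_thisnode
and struct alloc_ctx are made-up stand-ins for is_cdm_node(),
__GFP_THISNODE and the alloc_context/nodemask_t machinery) of the decision
the patched zonelist iterator now makes: a CDM zone stays eligible only
when the request has __GFP_THISNODE or still carries the caller's original
nodemask, while a nodemask substituted from cpuset mems_allowed
(ac->nodemask != orig_mask) no longer counts as an explicit request.

/* Standalone model of the CDM skip decision, compile with any C99 compiler. */
#include <stdbool.h>
#include <stdio.h>

struct alloc_ctx {
	const unsigned long *nodemask;	/* effective mask used by the iterator */
};

/*
 * Skip a CDM zone unless the request asked for the node explicitly
 * (__GFP_THISNODE) or carries the caller's original nodemask. A mask
 * swapped in by the cpuset fast path (ac->nodemask != orig_mask) is
 * treated as a fallback, not as an explicit request.
 */
static bool skip_cdm_zone(bool cdm_node, bool gfp_thisnode,
			  const struct alloc_ctx *ac,
			  const unsigned long *orig_mask)
{
	if (!cdm_node)
		return false;
	if (gfp_thisnode)
		return false;
	if (ac->nodemask != orig_mask)	/* cpuset swapped the nodemask */
		return true;
	return ac->nodemask == NULL;	/* no explicit policy mask either */
}

int main(void)
{
	unsigned long policy_mask = 0x5;	/* pretend MPOL_BIND mask */
	unsigned long cpuset_mask = 0x7;	/* pretend cpuset mems_allowed */

	struct alloc_ctx explicit_req = { .nodemask = &policy_mask };
	struct alloc_ctx cpuset_req   = { .nodemask = &cpuset_mask };
	struct alloc_ctx plain_req    = { .nodemask = NULL };

	/* Explicit MPOL_BIND request: CDM zone stays eligible (skip=0). */
	printf("MPOL_BIND:      skip=%d\n",
	       skip_cdm_zone(true, false, &explicit_req, &policy_mask));
	/* Mask substituted by cpuset in the fast path: CDM zone is skipped. */
	printf("cpuset swap:    skip=%d\n",
	       skip_cdm_zone(true, false, &cpuset_req, NULL));
	/* Plain allocation with __GFP_THISNODE: CDM zone stays eligible. */
	printf("__GFP_THISNODE: skip=%d\n",
	       skip_cdm_zone(true, true, &plain_req, NULL));
	return 0;
}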