On 10/24/19 8:59 PM, David Rientjes wrote: >> diff --git a/mm/mempolicy.c b/mm/mempolicy.c >> index 4ae967bcf954..2c48146f3ee2 100644 >> --- a/mm/mempolicy.c >> +++ b/mm/mempolicy.c >> @@ -2129,18 +2129,20 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, >> nmask = policy_nodemask(gfp, pol); >> if (!nmask || node_isset(hpage_node, *nmask)) { >> mpol_cond_put(pol); >> + /* >> + * First, try to allocate THP only on local node, but >> + * don't reclaim unnecessarily, just compact. >> + */ >> page = __alloc_pages_node(hpage_node, >> - gfp | __GFP_THISNODE, order); >> + gfp | __GFP_THISNODE | __GFP_NORETRY, order); >> >> /* >> - * If hugepage allocations are configured to always >> - * synchronous compact or the vma has been madvised >> - * to prefer hugepage backing, retry allowing remote >> - * memory as well. >> + * If that fails, allow both compaction and reclaim, >> + * but on all nodes. >> */ >> - if (!page && (gfp & __GFP_DIRECT_RECLAIM)) >> + if (!page) >> page = __alloc_pages_node(hpage_node, >> - gfp | __GFP_NORETRY, order); >> + gfp, order); >> >> goto out; >> } > Hi Vlastimil, > > For the default case where thp enabled is not set to "always" and the VMA I assume you meant "defrag" instead of "enabled". > is not madvised for MADV_HUGEPAGE, how does this prefer to return node > local pages rather than remote hugepages? The idea is to optimize for > access latency when the vma has not been explicitly madvised. Right, you mentioned this before IIRC, and I forgot. How about this? We could also be smarter and consolidate to a single attempt if there's actually just a single NUMA node, but that can be optimized later. 
----8<---- From 4c3a2217d0ee5ead00b1443010d07c664b6ac645 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka <vbabka@xxxxxxx> Date: Tue, 1 Oct 2019 14:20:58 +0200 Subject: [PATCH] mm, thp: tweak reclaim/compaction effort of local-only and all-node allocations THP page faults now attempt a __GFP_THISNODE allocation first, which should only compact existing free memory, followed by another attempt that can allocate from any node using reclaim/compaction effort specified by global defrag setting and madvise. This patch makes the following changes to the scheme: - before the patch, the first allocation relies on a check for pageblock order and __GFP_IO to prevent excessive reclaim. This however also affects the second attempt, which is not limited to a single node. Instead of that, reuse the existing check for costly order __GFP_NORETRY allocations, and make sure the first THP attempt uses __GFP_NORETRY. As a side-effect, all costly order __GFP_NORETRY allocations will bail out if compaction needs reclaim, while previously they only bailed out when compaction was deferred due to previous failures. This should still be acceptable within the __GFP_NORETRY semantics. - before the patch, the second allocation attempt (on all nodes) was passing __GFP_NORETRY. This is redundant as the check for pageblock order (discussed above) was stronger. It's also contrary to madvise(MADV_HUGEPAGE) which means some effort to allocate THP is requested. After this patch, the second attempt passes neither __GFP_THISNODE nor __GFP_NORETRY. To sum up, THP page faults now try the following attempts: 1. local node only THP allocation with no reclaim, just compaction. 2. for madvised VMAs or when synchronous compaction is enabled always - THP allocation from any node with effort determined by global defrag setting and VMA madvise 3. 
fallback to base pages on any node Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx> --- mm/mempolicy.c | 10 +++++++--- mm/page_alloc.c | 24 +++++------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ae967bcf954..ed6fbc5b1e20 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2129,18 +2129,22 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); + /* + * First, try to allocate THP only on local node, but + * don't reclaim unnecessarily, just compact. + */ page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); + gfp | __GFP_THISNODE | __GFP_NORETRY, order); /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote - * memory as well. + * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) page = __alloc_pages_node(hpage_node, - gfp | __GFP_NORETRY, order); + gfp, order); goto out; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecc3dbad606b..36d7d852f7b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4473,8 +4473,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (page) goto got_pg; - if (order >= pageblock_order && (gfp_mask & __GFP_IO) && - !(gfp_mask & __GFP_RETRY_MAYFAIL)) { + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes some THP page fault allocations + */ + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If allocating entire pageblock(s) and compaction * failed because all zones are below low watermarks @@ -4495,23 +4498,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (compact_result == COMPACT_SKIPPED || compact_result == COMPACT_DEFERRED) goto nopage; - } - - /* - * Checks for costly allocations with __GFP_NORETRY, which - * includes 
THP page fault allocations - */ - if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * If compaction is deferred for high-order allocations, - * it is because sync compaction recently failed. If - * this is the case and the caller requested a THP - * allocation, we do not want to heavily disrupt the - * system, so we fail the allocation instead of entering - * direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; /* * Looks like reclaim/compaction is worth trying, but -- 2.23.0