[PATCH 1/3] hugetlb: add nodemask arg to huge page alloc, free and surplus adjust fcns

Against: 25jun09 mmotm atop the "hugetlb: balance freeing..." series

In preparation for constraining huge page allocation and freeing by the
controlling task's numa mempolicy, add a "nodes_allowed" nodemask pointer
to the allocate, free and surplus adjustment functions.  For now, pass
NULL to indicate default behavior, i.e., use node_online_map.  A
subsequent patch will derive a non-default mask from the controlling
task's numa mempolicy.

Note the "cleanup" in alloc_bootmem_huge_page(): always advance the next
nid, even if the allocation succeeds.  I believe that this is correct
behavior, and I'll replace this code in the next patch, which assumes this
behavior.  However, perhaps the current code is correct: we only want to
advance bootmem huge page allocation to the next node when we've exhausted
all huge pages on the current hstate "next_node_to_alloc".  Anyone who
understands the rationale for this: please advise.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 mm/hugetlb.c |   51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)
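For illustration only (not part of the diff below): a minimal sketch of how
a caller might use the new argument once a mempolicy-derived mask exists.
example_grow_pool() is a hypothetical helper, not something this series
adds; passing NULL for nodes_allowed keeps the current behavior of
round-robining over node_online_map.

	/* hypothetical caller, for illustration only */
	static void example_grow_pool(struct hstate *h, unsigned long count,
					nodemask_t *nodes_allowed)
	{
		/* nodes_allowed == NULL => default: walk node_online_map */
		while (persistent_huge_pages(h) < count) {
			if (!alloc_fresh_huge_page(h, nodes_allowed))
				break;	/* no node in the mask could satisfy us */
		}
	}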
Index: linux-2.6.31-rc1-mmotm-090625-1549/mm/hugetlb.c
===================================================================
--- linux-2.6.31-rc1-mmotm-090625-1549.orig/mm/hugetlb.c	2009-07-07 09:58:13.000000000 -0400
+++ linux-2.6.31-rc1-mmotm-090625-1549/mm/hugetlb.c	2009-07-07 09:58:17.000000000 -0400
@@ -631,17 +631,22 @@ static struct page *alloc_fresh_huge_pag
  * if we just successfully allocated a hugepage so that
  * the next caller gets hugepages on the next node.
  */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h,
+					nodemask_t *nodes_allowed)
 {
 	int next_nid;
-	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
+
+	if (!nodes_allowed)
+		nodes_allowed = &node_online_map;
+
+	next_nid = next_node(h->next_nid_to_alloc, *nodes_allowed);
 	if (next_nid == MAX_NUMNODES)
-		next_nid = first_node(node_online_map);
+		next_nid = first_node(*nodes_allowed);
 	h->next_nid_to_alloc = next_nid;
 	return next_nid;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
 	struct page *page;
 	int start_nid;
@@ -655,7 +660,7 @@ static int alloc_fresh_huge_page(struct
 		page = alloc_fresh_huge_page_node(h, next_nid);
 		if (page)
 			ret = 1;
-		next_nid = hstate_next_node_to_alloc(h);
+		next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	} while (!page && next_nid != start_nid);
 
 	if (ret)
@@ -670,12 +675,16 @@ static int alloc_fresh_huge_page(struct
  * helper for free_pool_huge_page() - find next node
  * from which to free a huge page
  */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 {
 	int next_nid;
-	next_nid = next_node(h->next_nid_to_free, node_online_map);
+
+	if (!nodes_allowed)
+		nodes_allowed = &node_online_map;
+
+	next_nid = next_node(h->next_nid_to_free, *nodes_allowed);
 	if (next_nid == MAX_NUMNODES)
-		next_nid = first_node(node_online_map);
+		next_nid = first_node(*nodes_allowed);
 	h->next_nid_to_free = next_nid;
 	return next_nid;
 }
@@ -686,7 +695,8 @@ static int hstate_next_node_to_free(stru
  * balanced over allowed nodes.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+						 bool acct_surplus)
 {
 	int start_nid;
 	int next_nid;
@@ -715,7 +725,7 @@ static int free_pool_huge_page(struct hs
 			update_and_free_page(h, page);
 			ret = 1;
 		}
-		next_nid = hstate_next_node_to_free(h);
+		next_nid = hstate_next_node_to_free(h, nodes_allowed);
 	} while (!ret && next_nid != start_nid);
 
 	return ret;
@@ -917,7 +927,7 @@ static void return_unused_surplus_pages(
 	 * on-line nodes for us and will handle the hstate accounting.
 	 */
 	while (nr_pages--) {
-		if (!free_pool_huge_page(h, 1))
+		if (!free_pool_huge_page(h, NULL, 1))
 			break;
 	}
 }
@@ -1030,6 +1040,7 @@ int __weak alloc_bootmem_huge_page(struc
 				NODE_DATA(h->next_nid_to_alloc),
 				huge_page_size(h), huge_page_size(h), 0);
 
+		hstate_next_node_to_alloc(h, NULL);	/* always advance nid */
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1039,7 +1050,6 @@ int __weak alloc_bootmem_huge_page(struc
 			m = addr;
 			goto found;
 		}
-		hstate_next_node_to_alloc(h);
 		nr_nodes--;
 	}
 	return 0;
@@ -1083,7 +1093,7 @@ static void __init hugetlb_hstate_alloc_
 		if (h->order >= MAX_ORDER) {
 			if (!alloc_bootmem_huge_page(h))
 				break;
-		} else if (!alloc_fresh_huge_page(h))
+		} else if (!alloc_fresh_huge_page(h, NULL))
 			break;
 	}
 	h->max_huge_pages = i;
@@ -1158,7 +1168,8 @@ static inline void try_to_free_low(struc
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+				int delta)
 {
 	int start_nid, next_nid;
 	int ret = 0;
@@ -1174,7 +1185,7 @@ static int adjust_pool_surplus(struct hs
 	do {
 		int nid = next_nid;
 		if (delta < 0) {
-			next_nid = hstate_next_node_to_alloc(h);
+			next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 			/*
 			 * To shrink on this node, there must be a surplus page
 			 */
@@ -1182,7 +1193,7 @@ static int adjust_pool_surplus(struct hs
 				continue;
 		}
 		if (delta > 0) {
-			next_nid = hstate_next_node_to_free(h);
+			next_nid = hstate_next_node_to_free(h, nodes_allowed);
 			/*
 			 * Surplus cannot exceed the total number of pages
 			 */
@@ -1221,7 +1232,7 @@ static unsigned long set_max_huge_pages(
 	 */
 	spin_lock(&hugetlb_lock);
 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, -1))
+		if (!adjust_pool_surplus(h, NULL, -1))
 			break;
 	}
 
@@ -1232,7 +1243,7 @@ static unsigned long set_max_huge_pages(
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
-		ret = alloc_fresh_huge_page(h);
+		ret = alloc_fresh_huge_page(h, NULL);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -1258,11 +1269,11 @@ static unsigned long set_max_huge_pages(
 	min_count = max(count, min_count);
 	try_to_free_low(h, min_count);
 	while (min_count < persistent_huge_pages(h)) {
-		if (!free_pool_huge_page(h, 0))
+		if (!free_pool_huge_page(h, NULL, 0))
 			break;
 	}
 	while (count < persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, 1))
+		if (!adjust_pool_surplus(h, NULL, 1))
 			break;
 	}
 out:
--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html