cc linux-mm@xxxxxxxxx

yaozhenguo <yaozhenguo1@xxxxxxxxx> wrote on Thu, Sep 2, 2021 at 2:59 PM:
>
> We can specify the number of hugepages to allocate at boot, but at
> present those hugepages are balanced across all NUMA nodes. In some
> scenarios we only need hugepages on one node. For example, DPDK needs
> hugepages on the same node as the NIC, and if DPDK needs four 1G
> hugepages on node1 while the system has 16 NUMA nodes, we must reserve
> 64 hugepages on the kernel cmdline even though only four of them are
> used; the rest have to be freed after boot. If system memory is low
> (for example, 64G), that is an impossible task. So, extend the
> hugepages kernel parameter so that the number of hugepages to allocate
> at boot can be specified per node. For example, adding the following
> parameters:
>
> hugepagesz=1G hugepages=0:1,1:3
>
> will allocate 1 hugepage on node0 and 3 hugepages on node1.
>
> Signed-off-by: yaozhenguo <yaozhenguo1@xxxxxxxxx>
> ---
> v3: 1. Skip gigantic hugepage allocation if hugetlb_cma is enabled.
>     2. Fix wrong behavior for parameter: hugepagesz=2M hugepages=2 hugepages=5.
>     3. Update hugetlbpage.rst.
>     4. Fix side effects that v2 brought in.
>     5. Add cond_resched() in hugetlb_hstate_alloc_pages_onenode().
> ---
>  .../admin-guide/kernel-parameters.txt        |   8 +-
>  Documentation/admin-guide/mm/hugetlbpage.rst |  12 +-
>  include/linux/hugetlb.h                      |   1 +
>  mm/hugetlb.c                                 | 116 ++++++++++++++++--
>  4 files changed, 126 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index bdb22006f..64a128924 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1588,9 +1588,11 @@
>  			the number of pages of hugepagesz to be allocated.
>  			If this is the first HugeTLB parameter on the command
>  			line, it specifies the number of pages to allocate for
> -			the default huge page size. See also
> -			Documentation/admin-guide/mm/hugetlbpage.rst.
> -			Format: <integer>
> +			the default huge page size. Using node format, it
> +			specifies the number of huge pages on a specific node.
> +			See also Documentation/admin-guide/mm/hugetlbpage.rst.
> +			Format: <integer> or (node format)
> +				<node>:<integer>[,<node>:<integer>]
>
>  	hugepagesz=
>  			[HW] The size of the HugeTLB pages. This is used in
> diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
> index 8abaeb144..bc5f674ff 100644
> --- a/Documentation/admin-guide/mm/hugetlbpage.rst
> +++ b/Documentation/admin-guide/mm/hugetlbpage.rst
> @@ -128,7 +128,9 @@ hugepages
>  	implicitly specifies the number of huge pages of default size to
>  	allocate. If the number of huge pages of default size is implicitly
>  	specified, it can not be overwritten by a hugepagesz,hugepages
> -	parameter pair for the default size.
> +	parameter pair for the default size. This parameter also has a
> +	node format. Using node format, it specifies the number of huge
> +	pages on a specific node.
>
>  	For example, on an architecture with 2M default huge page size::
>
> @@ -138,6 +140,14 @@ hugepages
>  	indicating that the hugepages=512 parameter is ignored. If a hugepages
>  	parameter is preceded by an invalid hugepagesz parameter, it will
>  	be ignored.
> +
> +	Node format example::
> +
> +		hugepagesz=2M hugepages=0:1,1:2
> +
> +	This will allocate 1 2M hugepage on node0 and 2 2M hugepages on
> +	node1. If the node number exceeds the maximum node, the parameter
> +	will be ignored.
>
>  default_hugepagesz
>  	Specify the default huge page size. This parameter can
>  	only be specified once on the command line. default_hugepagesz can
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index f7ca1a387..5939ecd4f 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -605,6 +605,7 @@ struct hstate {
>  	unsigned long nr_overcommit_huge_pages;
>  	struct list_head hugepage_activelist;
>  	struct list_head hugepage_freelists[MAX_NUMNODES];
> +	unsigned int max_huge_pages_node[MAX_NUMNODES];
>  	unsigned int nr_huge_pages_node[MAX_NUMNODES];
>  	unsigned int free_huge_pages_node[MAX_NUMNODES];
>  	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index dfc940d52..317f8fa21 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -66,6 +66,7 @@ static struct hstate * __initdata parsed_hstate;
>  static unsigned long __initdata default_hstate_max_huge_pages;
>  static bool __initdata parsed_valid_hugepagesz = true;
>  static bool __initdata parsed_default_hugepagesz;
> +static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
>
>  /*
>   * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
> @@ -2842,10 +2843,75 @@ static void __init gather_bootmem_prealloc(void)
>  	}
>  }
>
> +static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
> +{
> +	unsigned long i;
> +	char buf[32];
> +
> +	for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
> +		if (hstate_is_gigantic(h)) {
> +			struct huge_bootmem_page *m;
> +			void *addr;
> +
> +			addr = memblock_alloc_try_nid_raw(
> +					huge_page_size(h), huge_page_size(h),
> +					0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
> +			if (!addr)
> +				break;
> +			m = addr;
> +			BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
> +			/*
> +			 * Put them into a private list first because mem_map
> +			 * is not up yet
> +			 */
> +			INIT_LIST_HEAD(&m->list);
> +			list_add(&m->list, &huge_boot_pages);
> +			m->hstate = h;
> +		} else {
> +			struct page *page;
> +
> +			gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
> +
> +			page = alloc_fresh_huge_page(h, gfp_mask, nid,
> +					&node_states[N_MEMORY], NULL);
> +			if (!page)
> +				break;
> +			put_page(page); /* free it into the hugepage allocator */
> +		}
> +		cond_resched();
> +	}
> +	if (i == h->max_huge_pages_node[nid])
> +		return;
> +
> +	string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
> +	pr_warn("HugeTLB: allocating %u of page size %s failed node%d.  Only allocated %lu hugepages.\n",
> +		h->max_huge_pages_node[nid], buf, nid, i);
> +	h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
> +	h->max_huge_pages_node[nid] = i;
> +}
> +
>  static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>  {
>  	unsigned long i;
>  	nodemask_t *node_alloc_noretry;
> +	bool hugetlb_node_set = false;
> +
> +	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
> +	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
> +		pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
> +		return;
> +	}
> +
> +	/* do node alloc */
> +	for (i = 0; i < nodes_weight(node_states[N_MEMORY]); i++) {
> +		if (h->max_huge_pages_node[i] > 0) {
> +			hugetlb_hstate_alloc_pages_onenode(h, i);
> +			hugetlb_node_set = true;
> +		}
> +	}
> +
> +	if (hugetlb_node_set)
> +		return;
>
>  	if (!hstate_is_gigantic(h)) {
>  		/*
> @@ -2867,10 +2933,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>
>  	for (i = 0; i < h->max_huge_pages; ++i) {
>  		if (hstate_is_gigantic(h)) {
> -			if (hugetlb_cma_size) {
> -				pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
> -				goto free;
> -			}
>  			if (!alloc_bootmem_huge_page(h))
>  				break;
>  		} else if (!alloc_pool_huge_page(h,
> @@ -2887,7 +2949,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>  			h->max_huge_pages, buf, i);
>  		h->max_huge_pages = i;
>  	}
> -free:
>  	kfree(node_alloc_noretry);
>  }
>
> @@ -3580,6 +3641,10 @@ static int __init hugetlb_init(void)
>  				default_hstate_max_huge_pages;
>  		}
>  	}
> +	for (i = 0; i < nodes_weight(node_states[N_MEMORY]); i++)
> +		if (default_hugepages_in_node[i] > 0)
> +			default_hstate.max_huge_pages_node[i] =
> +				default_hugepages_in_node[i];
>
>  	hugetlb_cma_check();
>  	hugetlb_init_hstates();
> @@ -3649,6 +3714,10 @@ static int __init hugepages_setup(char *s)
>  {
>  	unsigned long *mhp;
>  	static unsigned long *last_mhp;
> +	unsigned int node = NUMA_NO_NODE;
> +	int count;
> +	unsigned long tmp;
> +	char *p = s;
>
>  	if (!parsed_valid_hugepagesz) {
>  		pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
> @@ -3672,8 +3741,37 @@ static int __init hugepages_setup(char *s)
>  		return 0;
>  	}
>
> -	if (sscanf(s, "%lu", mhp) <= 0)
> -		*mhp = 0;
> +	while (*p) {
> +		count = 0;
> +		if (sscanf(p, "%lu%n", &tmp, &count) != 1)
> +			goto invalid;
> +		/* Parameter is node format */
> +		if (p[count] == ':') {
> +			node = tmp;
> +			p += count + 1;
> +			if (node < 0 ||
> +			    node >= nodes_weight(node_states[N_MEMORY]))
> +				goto invalid;
> +			/* Parse hugepages */
> +			if (sscanf(p, "%lu%n", &tmp, &count) != 1)
> +				goto invalid;
> +			if (!hugetlb_max_hstate)
> +				default_hugepages_in_node[node] = tmp;
> +			else
> +				parsed_hstate->max_huge_pages_node[node] = tmp;
> +			*mhp += tmp;
> +			/* Go to parse next node */
> +			if (p[count] == ',')
> +				p += count + 1;
> +			else
> +				break;
> +		} else {
> +			if (p != s)
> +				goto invalid;
> +			*mhp = tmp;
> +			break;
> +		}
> +	}
>
>  	/*
>  	 * Global state is always initialized later in hugetlb_init.
> @@ -3686,6 +3784,10 @@ static int __init hugepages_setup(char *s)
>  	last_mhp = mhp;
>
>  	return 1;
> +
> +invalid:
> +	pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
> +	return 0;
>  }
>  __setup("hugepages=", hugepages_setup);
>
> --
> 2.27.0
>
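
One note for anyone testing this (not part of the patch): after booting
with the node format, the resulting per-node pool sizes can be read back
through the per-node sysfs files documented in hugetlbpage.rst. Below is
a minimal userspace sketch, assuming the hugepagesz=1G hugepages=0:1,1:3
example above on a machine with 1G hugepage support; the sysfs paths are
the existing interface, the program itself is only illustrative.

#include <stdio.h>

int main(void)
{
        char path[128];
        int nid;

        /* Check the two nodes used in the example above. */
        for (nid = 0; nid < 2; nid++) {
                unsigned long nr = 0;
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/devices/system/node/node%d/hugepages/"
                         "hugepages-1048576kB/nr_hugepages", nid);
                f = fopen(path, "r");
                if (!f)
                        continue; /* node absent or no 1G hugepage dir */
                if (fscanf(f, "%lu", &nr) == 1)
                        printf("node%d: %lu 1G hugepages\n", nid, nr);
                fclose(f);
        }
        return 0;
}

With the example parameters, node0 should report 1 and node1 should
report 3, which is a quick way to confirm the parser credited the pages
to the intended nodes.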