> On Jan 24, 2024, at 18:52, Gang Li <gang.li@xxxxxxxxx> wrote: > > On 2024/1/24 17:23, Muchun Song wrote: >> On 2024/1/18 20:39, Gang Li wrote: >>> Optimizing the initialization speed of 1G huge pages through >>> parallelization. >>> >>> 1G hugetlbs are allocated from bootmem, a process that is already >>> very fast and does not currently require optimization. Therefore, >>> we focus on parallelizing only the initialization phase in >>> `gather_bootmem_prealloc`. >>> >>> Here are some test results: >>> test no patch(ms) patched(ms) saved >>> ------------------- -------------- ------------- -------- >>> 256c2t(4 node) 1G 4745 2024 57.34% >> What does "256c2t" mean? > > A machine with 256 core and 2T memory. A little confusing. I thought 256c2 was a hexadecimal number. The unit of memory should be a capital T. We should add a simple explanation about this. > >>> 128c1t(2 node) 1G 3358 1712 49.02% >>> 12t 1G 77000 18300 76.23% I am curious how many NUMA nodes this system has. I suspect it is more than one. 
>>> >>> Signed-off-by: Gang Li <gang.li@xxxxxxxxx> >>> Tested-by: David Rientjes <rientjes@xxxxxxxxxx> >>> --- >>> include/linux/hugetlb.h | 2 +- >>> mm/hugetlb.c | 42 +++++++++++++++++++++++++++++++++-------- >>> 2 files changed, 35 insertions(+), 9 deletions(-) >>> >>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h >>> index c1ee640d87b1..77b30a8c6076 100644 >>> --- a/include/linux/hugetlb.h >>> +++ b/include/linux/hugetlb.h >>> @@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, >>> struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); >>> extern int sysctl_hugetlb_shm_group; >>> -extern struct list_head huge_boot_pages; >>> +extern struct list_head huge_boot_pages[MAX_NUMNODES]; >>> /* arch callbacks */ >>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c >>> index 9b348ba418f5..2f4b77630ada 100644 >>> --- a/mm/hugetlb.c >>> +++ b/mm/hugetlb.c >>> @@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) >>> #endif >>> static unsigned long hugetlb_cma_size __initdata; >>> -__initdata LIST_HEAD(huge_boot_pages); >>> +__initdata struct list_head huge_boot_pages[MAX_NUMNODES]; >>> /* for command line parsing */ >>> static struct hstate * __initdata parsed_hstate; >>> @@ -3301,7 +3301,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid) >>> int __alloc_bootmem_huge_page(struct hstate *h, int nid) >>> { >>> struct huge_bootmem_page *m = NULL; /* initialize for clang */ >>> - int nr_nodes, node; >>> + int nr_nodes, node = nid; >> Why not use nid directly in the following list_add()? > > `node` may be changed in `for_each_node_mask_to_alloc`. Got it. 
> >>> /* do node specific alloc */ >>> if (nid != NUMA_NO_NODE) { >>> @@ -3339,7 +3339,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) >>> huge_page_size(h) - PAGE_SIZE); >>> /* Put them into a private list first because mem_map is not up yet */ >>> INIT_LIST_HEAD(&m->list); >>> - list_add(&m->list, &huge_boot_pages); >>> + list_add(&m->list, &huge_boot_pages[node]); >>> m->hstate = h; >>> return 1; >>> } >>> @@ -3390,8 +3390,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, >>> /* Send list for bulk vmemmap optimization processing */ >>> hugetlb_vmemmap_optimize_folios(h, folio_list); >>> - /* Add all new pool pages to free lists in one lock cycle */ >>> - spin_lock_irqsave(&hugetlb_lock, flags); >>> list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { >>> if (!folio_test_hugetlb_vmemmap_optimized(folio)) { >>> /* >>> @@ -3404,23 +3402,27 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, >>> HUGETLB_VMEMMAP_RESERVE_PAGES, >>> pages_per_huge_page(h)); >>> } >>> + /* Subdivide locks to achieve better parallel performance * >>> + spin_lock_irqsave(&hugetlb_lock, flags); >>> __prep_account_new_huge_page(h, folio_nid(folio)); >>> enqueue_hugetlb_folio(h, folio); >>> + spin_unlock_irqrestore(&hugetlb_lock, flags); >>> } >>> - spin_unlock_irqrestore(&hugetlb_lock, flags); >>> } >>> /* >>> * Put bootmem huge pages into the standard lists after mem_map is up. >>> * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. >>> */ >>> -static void __init gather_bootmem_prealloc(void) >>> +static void __init __gather_bootmem_prealloc(unsigned long start, unsigned long end, void *arg) >> This function name could be gather_bootmem_prealloc_node. > > LGTM. 
> >>> + >>> { >>> + int nid = start; >>> LIST_HEAD(folio_list); >>> struct huge_bootmem_page *m; >>> struct hstate *h = NULL, *prev_h = NULL; >>> - list_for_each_entry(m, &huge_boot_pages, list) { >>> + list_for_each_entry(m, &huge_boot_pages[nid], list) { >>> struct page *page = virt_to_page(m); >>> struct folio *folio = (void *)page; >>> @@ -3453,6 +3455,22 @@ static void __init gather_bootmem_prealloc(void) >>> prep_and_add_bootmem_folios(h, &folio_list); >>> } >>> +static void __init gather_bootmem_prealloc(void) >>> +{ >>> + struct padata_mt_job job = { >>> + .thread_fn = __gather_bootmem_prealloc, >>> + .fn_arg = NULL, >>> + .start = 0, >>> + .size = num_node_state(N_MEMORY), >>> + .align = 1, >>> + .min_chunk = 1, >>> + .max_threads = num_node_state(N_MEMORY), >>> + .numa_aware = true, >>> + }; >>> + >>> + padata_do_multithreaded(&job); >>> +} >>> + >>> static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) >>> { >>> unsigned long i; >>> @@ -3602,6 +3620,14 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) >>> return; >>> } >>> + /* hugetlb_hstate_alloc_pages will be called many times, init huge_boot_pages once*/ >> s/init/initialize/g >> And you are missing a blank right before "*/". > > OK > >>> + if (huge_boot_pages[0].next == NULL) { >> It is not intuitive. I'd like to use an 'initialized' variable > > Would it make the code look a bit redundant? What is redundant? > >> to indicate whether it has been initialized. BTW, it can be >> marked as __initdata. >> > > OK