On 2/11/22 07:54, Muchun Song wrote:
> On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@xxxxxxxxxx> wrote:
> [...]
>>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
>> -                                       struct vmem_altmap *altmap)
>> +                                       struct vmem_altmap *altmap,
>> +                                       struct page *block)
>
> Why not use the name of "reuse" instead of "block"?
> Seems like "reuse" is more clear.
>

Good idea, let me rename that to @reuse.

>>  {
>>         pte_t *pte = pte_offset_kernel(pmd, addr);
>>         if (pte_none(*pte)) {
>>                 pte_t entry;
>>                 void *p;
>>
>> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>> -               if (!p)
>> -                       return NULL;
>> +               if (!block) {
>> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>> +                       if (!p)
>> +                               return NULL;
>> +               } else {
>> +                       /*
>> +                        * When a PTE/PMD entry is freed from the init_mm
>> +                        * there's a free_pages() call to this page allocated
>> +                        * above. Thus this get_page() is paired with the
>> +                        * put_page_testzero() on the freeing path.
>> +                        * This can only be called by certain ZONE_DEVICE paths,
>> +                        * and through vmemmap_populate_compound_pages() when
>> +                        * slab is available.
>> +                        */
>> +                       get_page(block);
>> +                       p = page_to_virt(block);
>> +               }
>>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
>>                 set_pte_at(&init_mm, addr, pte, entry);
>>         }
>> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>>  }
>>
>>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>> -                                             struct vmem_altmap *altmap)
>> +                                             struct vmem_altmap *altmap,
>> +                                             struct page *reuse, struct page **page)
>
> We can remove the last argument (struct page **page) if we change
> the return type to "pte_t *". More simple, don't you think?
>

Hmmm, perhaps it is simpler, especially provided the only error code is
ENOMEM. Albeit perhaps what we want is a `struct page *` rather than a
pte. A rough sketch of what I have in mind is appended at the end of
this mail.

>>  {
>>         pgd_t *pgd;
>>         p4d_t *p4d;
>> @@ -629,11 +645,13 @@ static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>>         pmd = vmemmap_pmd_populate(pud, addr, node);
>>         if (!pmd)
>>                 return -ENOMEM;
>> -       pte = vmemmap_pte_populate(pmd, addr, node, altmap);
>> +       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
>>         if (!pte)
>>                 return -ENOMEM;
>>         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>>
>> +       if (page)
>> +               *page = pte_page(*pte);
>>         return 0;
>>  }
>>
>> @@ -644,10 +662,120 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>         int rc;
>>
>>         for (; addr < end; addr += PAGE_SIZE) {
>> -               rc = vmemmap_populate_address(addr, node, altmap);
>> +               rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
>>                 if (rc)
>>                         return rc;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int __meminit vmemmap_populate_range(unsigned long start,
>> +                                           unsigned long end,
>> +                                           int node, struct page *page)
>> +{
>> +       unsigned long addr = start;
>> +       int rc;
>>
>> +       for (; addr < end; addr += PAGE_SIZE) {
>> +               rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
>> +               if (rc)
>> +                       return rc;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
>> +                                                 struct page **page)
>> +{
>> +       return vmemmap_populate_address(addr, node, NULL, NULL, page);
>> +}
>> +
>> +/*
>> + * For compound pages bigger than section size (e.g. x86 1G compound
>> + * pages with 2M subsection size) fill the rest of sections as tail
>> + * pages.
>> + *
>> + * Note that memremap_pages() resets @nr_range value and will increment
>> + * it after each successful range onlining. Thus the value of @nr_range
>> + * at section memmap populate corresponds to the in-progress range
>> + * being onlined here.
>> + */
>> +static bool __meminit reuse_compound_section(unsigned long start_pfn,
>> +                                            struct dev_pagemap *pgmap)
>> +{
>> +       unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
>> +       unsigned long offset = start_pfn -
>> +               PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
>> +
>> +       return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
>> +}
>> +
>> +static struct page * __meminit compound_section_tail_page(unsigned long addr)
>> +{
>> +       pte_t *ptep;
>> +
>> +       addr -= PAGE_SIZE;
>> +
>> +       /*
>> +        * Assuming sections are populated sequentially, the previous section's
>> +        * page data can be reused.
>> +        */
>> +       ptep = pte_offset_kernel(pmd_off_k(addr), addr);
>> +       if (!ptep)
>> +               return NULL;
>> +
>> +       return pte_page(*ptep);
>> +}
>> +
>> +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
>> +                                                     unsigned long start,
>> +                                                     unsigned long end, int node,
>> +                                                     struct dev_pagemap *pgmap)
>> +{
>> +       unsigned long size, addr;
>> +
>> +       if (reuse_compound_section(start_pfn, pgmap)) {
>> +               struct page *page;
>> +
>> +               page = compound_section_tail_page(start);
>> +               if (!page)
>> +                       return -ENOMEM;
>> +
>> +               /*
>> +                * Reuse the page that was populated in the prior iteration
>> +                * with just tail struct pages.
>> +                */
>> +               return vmemmap_populate_range(start, end, node, page);
>> +       }
>> +
>> +       size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
>> +       for (addr = start; addr < end; addr += size) {
>> +               unsigned long next = addr, last = addr + size;
>> +               struct page *block;
>> +               int rc;
>> +
>> +               /* Populate the head page vmemmap page */
>> +               rc = vmemmap_populate_page(addr, node, NULL);
>> +               if (rc)
>> +                       return rc;
>> +
>> +               /* Populate the tail pages vmemmap page */
>> +               block = NULL;
>> +               next = addr + PAGE_SIZE;
>> +               rc = vmemmap_populate_page(next, node, &block);
>> +               if (rc)
>> +                       return rc;
>> +
>> +               /*
>> +                * Reuse the previous page for the rest of tail pages
>> +                * See layout diagram in Documentation/vm/vmemmap_dedup.rst
>> +                */
>> +               next += PAGE_SIZE;
>> +               rc = vmemmap_populate_range(next, last, node, block);
>> +               if (rc)
>> +                       return rc;
>>         }
>>
>>         return 0;
>> @@ -659,12 +787,18 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>>  {
>>         unsigned long start = (unsigned long) pfn_to_page(pfn);
>>         unsigned long end = start + nr_pages * sizeof(struct page);
>> +       int r;
>>
>>         if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
>>                          !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>>                 return NULL;
>>
>> -       if (vmemmap_populate(start, end, nid, altmap))
>> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
>
> Should we add a judgment like "is_power_of_2(sizeof(struct page))" since
> this optimization is only applied when the size of the struct page does not
> cross page boundaries?

Totally missed that -- let me make that adjustment. Can I ask on which
architectures/conditions this happens?
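
For reference, this is roughly how I would fold that check in -- a sketch
only, with a placeholder helper name (vmemmap_can_optimize_compound() is
just what I'm calling it here; is_power_of_2() comes from <linux/log2.h>),
so the final condition/placement may end up looking different:

static bool __meminit vmemmap_can_optimize_compound(struct dev_pagemap *pgmap,
                                                    struct vmem_altmap *altmap)
{
        /* The dedup only works if struct page doesn't straddle a page boundary */
        return is_power_of_2(sizeof(struct page)) &&
               pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap;
}

with __populate_section_memmap() then doing:

        if (vmemmap_can_optimize_compound(pgmap, altmap))
                r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
        else
                r = vmemmap_populate(start, end, nid, altmap);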
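
And regarding the vmemmap_populate_address() return type change suggested
above, the rough sketch I was alluding to -- purely illustrative, keeping
the helper names from this patch, with the struct page derived in
vmemmap_populate_page() via pte_page():

static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
                                                  struct vmem_altmap *altmap,
                                                  struct page *reuse)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = vmemmap_pgd_populate(addr, node);
        if (!pgd)
                return NULL;
        p4d = vmemmap_p4d_populate(pgd, addr, node);
        if (!p4d)
                return NULL;
        pud = vmemmap_pud_populate(p4d, addr, node);
        if (!pud)
                return NULL;
        pmd = vmemmap_pmd_populate(pud, addr, node);
        if (!pmd)
                return NULL;
        pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
        if (!pte)
                return NULL;
        vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

        return pte;
}

static int __meminit vmemmap_populate_page(unsigned long addr, int node,
                                           struct page **page)
{
        pte_t *pte = vmemmap_populate_address(addr, node, NULL, NULL);

        if (!pte)
                return -ENOMEM;
        if (page)
                *page = pte_page(*pte);
        return 0;
}

Callers that only care about success/failure (vmemmap_populate_basepages(),
vmemmap_populate_range()) would then just test the returned pointer for NULL
and map that to -ENOMEM.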