Re: [PATCH v5 4/5] mm/sparse-vmemmap: improve memory savings for compound devmaps

On 2/11/22 07:54, Muchun Song wrote:
> On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@xxxxxxxxxx> wrote:
> [...]
>>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
>> -                                      struct vmem_altmap *altmap)
>> +                                      struct vmem_altmap *altmap,
>> +                                      struct page *block)
> 
> Why not use the name "reuse" instead of "block"?
> "reuse" seems clearer to me.
> 
Good idea, let me rename that to @reuse.
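Roughly, with the rename applied on top of the hunk above (untested sketch, same logic as the patch):

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
				       struct vmem_altmap *altmap,
				       struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p)
				return NULL;
		} else {
			/*
			 * Paired with the put_page_testzero() on the
			 * freeing path, as per the comment above.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
		}
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
}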

>>  {
>>         pte_t *pte = pte_offset_kernel(pmd, addr);
>>         if (pte_none(*pte)) {
>>                 pte_t entry;
>>                 void *p;
>>
>> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>> -               if (!p)
>> -                       return NULL;
>> +               if (!block) {
>> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>> +                       if (!p)
>> +                               return NULL;
>> +               } else {
>> +                       /*
>> +                        * When a PTE/PMD entry is freed from the init_mm
>> +                        * there's a free_pages() call to this page allocated
>> +                        * above. Thus this get_page() is paired with the
>> +                        * put_page_testzero() on the freeing path.
>> +                        * This can only be called by certain ZONE_DEVICE paths,
>> +                        * and through vmemmap_populate_compound_pages() when
>> +                        * slab is available.
>> +                        */
>> +                       get_page(block);
>> +                       p = page_to_virt(block);
>> +               }
>>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
>>                 set_pte_at(&init_mm, addr, pte, entry);
>>         }
>> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>>  }
>>
>>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>> -                                             struct vmem_altmap *altmap)
>> +                                             struct vmem_altmap *altmap,
>> +                                             struct page *reuse, struct page **page)
> 
> We can remove the last argument (struct page **page) if we change
> the return type to "pte_t *". Simpler, don't you think?
> 

Hmmm, perhaps it is simpler, especially given that the only error code is ENOMEM.

Though perhaps what we want is a `struct page *` rather than a pte.
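Something like this maybe (rough, untested sketch; callers would then test the
returned page for NULL instead of -ENOMEM):

static struct page * __meminit vmemmap_populate_address(unsigned long addr, int node,
							struct vmem_altmap *altmap,
							struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = vmemmap_pgd_populate(addr, node);
	if (!pgd)
		return NULL;
	p4d = vmemmap_p4d_populate(pgd, addr, node);
	if (!p4d)
		return NULL;
	pud = vmemmap_pud_populate(p4d, addr, node);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_populate(pud, addr, node);
	if (!pmd)
		return NULL;
	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
	if (!pte)
		return NULL;
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte_page(*pte);
}

vmemmap_populate_range()/vmemmap_populate_basepages() would then map a NULL
return back to -ENOMEM where they still need an errno.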

>>  {
>>         pgd_t *pgd;
>>         p4d_t *p4d;
>> @@ -629,11 +645,13 @@ static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>>         pmd = vmemmap_pmd_populate(pud, addr, node);
>>         if (!pmd)
>>                 return -ENOMEM;
>> -       pte = vmemmap_pte_populate(pmd, addr, node, altmap);
>> +       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
>>         if (!pte)
>>                 return -ENOMEM;
>>         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>>
>> +       if (page)
>> +               *page = pte_page(*pte);
>>         return 0;
>>  }
>>
>> @@ -644,10 +662,120 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>>         int rc;
>>
>>         for (; addr < end; addr += PAGE_SIZE) {
>> -               rc = vmemmap_populate_address(addr, node, altmap);
>> +               rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
>>                 if (rc)
>>                         return rc;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static int __meminit vmemmap_populate_range(unsigned long start,
>> +                                           unsigned long end,
>> +                                           int node, struct page *page)
>> +{
>> +       unsigned long addr = start;
>> +       int rc;
>>
>> +       for (; addr < end; addr += PAGE_SIZE) {
>> +               rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
>> +               if (rc)
>> +                       return rc;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
>> +                                                 struct page **page)
>> +{
>> +       return vmemmap_populate_address(addr, node, NULL, NULL, page);
>> +}
>> +
>> +/*
>> + * For compound pages bigger than section size (e.g. x86 1G compound
>> + * pages with 2M subsection size) fill the rest of sections as tail
>> + * pages.
>> + *
>> + * Note that memremap_pages() resets @nr_range value and will increment
>> + * it after each successful range onlining. Thus the value of @nr_range
>> + * at section memmap populate corresponds to the in-progress range
>> + * being onlined here.
>> + */
>> +static bool __meminit reuse_compound_section(unsigned long start_pfn,
>> +                                            struct dev_pagemap *pgmap)
>> +{
>> +       unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
>> +       unsigned long offset = start_pfn -
>> +               PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
>> +
>> +       return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
>> +}
>> +
>> +static struct page * __meminit compound_section_tail_page(unsigned long addr)
>> +{
>> +       pte_t *ptep;
>> +
>> +       addr -= PAGE_SIZE;
>> +
>> +       /*
>> +        * Assuming sections are populated sequentially, the previous section's
>> +        * page data can be reused.
>> +        */
>> +       ptep = pte_offset_kernel(pmd_off_k(addr), addr);
>> +       if (!ptep)
>> +               return NULL;
>> +
>> +       return pte_page(*ptep);
>> +}
>> +
>> +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
>> +                                                    unsigned long start,
>> +                                                    unsigned long end, int node,
>> +                                                    struct dev_pagemap *pgmap)
>> +{
>> +       unsigned long size, addr;
>> +
>> +       if (reuse_compound_section(start_pfn, pgmap)) {
>> +               struct page *page;
>> +
>> +               page = compound_section_tail_page(start);
>> +               if (!page)
>> +                       return -ENOMEM;
>> +
>> +               /*
>> +                * Reuse the page that was populated in the prior iteration
>> +                * with just tail struct pages.
>> +                */
>> +               return vmemmap_populate_range(start, end, node, page);
>> +       }
>> +
>> +       size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
>> +       for (addr = start; addr < end; addr += size) {
>> +               unsigned long next = addr, last = addr + size;
>> +               struct page *block;
>> +               int rc;
>> +
>> +               /* Populate the head page vmemmap page */
>> +               rc = vmemmap_populate_page(addr, node, NULL);
>> +               if (rc)
>> +                       return rc;
>> +
>> +               /* Populate the tail pages vmemmap page */
>> +               block = NULL;
>> +               next = addr + PAGE_SIZE;
>> +               rc = vmemmap_populate_page(next, node, &block);
>> +               if (rc)
>> +                       return rc;
>> +
>> +               /*
>> +                * Reuse the previous page for the rest of tail pages
>> +                * See layout diagram in Documentation/vm/vmemmap_dedup.rst
>> +                */
>> +               next += PAGE_SIZE;
>> +               rc = vmemmap_populate_range(next, last, node, block);
>> +               if (rc)
>> +                       return rc;
>>         }
>>
>>         return 0;
>> @@ -659,12 +787,18 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>>  {
>>         unsigned long start = (unsigned long) pfn_to_page(pfn);
>>         unsigned long end = start + nr_pages * sizeof(struct page);
>> +       int r;
>>
>>         if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
>>                 !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>>                 return NULL;
>>
>> -       if (vmemmap_populate(start, end, nid, altmap))
>> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
> 
> Should we add a check like "is_power_of_2(sizeof(struct page))", since
> this optimization only applies when struct pages do not cross page
> boundaries?

Totally missed that -- let me make that adjustment.
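I.e. something along these lines in __populate_section_memmap() (untested;
is_power_of_2() comes from <linux/log2.h>):

	/*
	 * Only take the compound devmap path when struct pages don't
	 * cross page boundaries.
	 */
	if (is_power_of_2(sizeof(struct page)) &&
	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
	else
		r = vmemmap_populate(start, end, nid, altmap);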

Can I ask on which architectures/under which conditions this happens?


