Re: [External] RE: [PATCH v4 09/21] mm/hugetlb: Free the vmemmap pages associated with each hugetlb page

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Nov 17, 2020 at 5:55 PM Song Bao Hua (Barry Song)
<song.bao.hua@xxxxxxxxxxxxx> wrote:
>
>
>
> > -----Original Message-----
> > From: owner-linux-mm@xxxxxxxxx [mailto:owner-linux-mm@xxxxxxxxx] On
> > Behalf Of Muchun Song
> > Sent: Saturday, November 14, 2020 12:00 AM
> > To: corbet@xxxxxxx; mike.kravetz@xxxxxxxxxx; tglx@xxxxxxxxxxxxx;
> > mingo@xxxxxxxxxx; bp@xxxxxxxxx; x86@xxxxxxxxxx; hpa@xxxxxxxxx;
> > dave.hansen@xxxxxxxxxxxxxxx; luto@xxxxxxxxxx; peterz@xxxxxxxxxxxxx;
> > viro@xxxxxxxxxxxxxxxxxx; akpm@xxxxxxxxxxxxxxxxxxxx; paulmck@xxxxxxxxxx;
> > mchehab+huawei@xxxxxxxxxx; pawan.kumar.gupta@xxxxxxxxxxxxxxx;
> > rdunlap@xxxxxxxxxxxxx; oneukum@xxxxxxxx; anshuman.khandual@xxxxxxx;
> > jroedel@xxxxxxx; almasrymina@xxxxxxxxxx; rientjes@xxxxxxxxxx;
> > willy@xxxxxxxxxxxxx; osalvador@xxxxxxx; mhocko@xxxxxxxx
> > Cc: duanxiongchun@xxxxxxxxxxxxx; linux-doc@xxxxxxxxxxxxxxx;
> > linux-kernel@xxxxxxxxxxxxxxx; linux-mm@xxxxxxxxx;
> > linux-fsdevel@xxxxxxxxxxxxxxx; Muchun Song <songmuchun@xxxxxxxxxxxxx>
> > Subject: [PATCH v4 09/21] mm/hugetlb: Free the vmemmap pages associated
> > with each hugetlb page
> >
> > When we allocate a hugetlb page from the buddy, we should free the
> > unused vmemmap pages associated with it. We can do that in the
> > prep_new_huge_page().
> >
> > Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>
> > ---
> >  arch/x86/include/asm/hugetlb.h          |   9 ++
> >  arch/x86/include/asm/pgtable_64_types.h |   8 ++
> >  mm/hugetlb.c                            |  16 +++
> >  mm/hugetlb_vmemmap.c                    | 188
> > ++++++++++++++++++++++++++++++++
> >  mm/hugetlb_vmemmap.h                    |   5 +
> >  5 files changed, 226 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> > index 1721b1aadeb1..c601fe042832 100644
> > --- a/arch/x86/include/asm/hugetlb.h
> > +++ b/arch/x86/include/asm/hugetlb.h
> > @@ -4,6 +4,15 @@
> >
> >  #include <asm/page.h>
> >  #include <asm-generic/hugetlb.h>
> > +#include <asm/pgtable.h>
> > +
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#define vmemmap_pmd_huge vmemmap_pmd_huge
> > +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> > +{
> > +     return pmd_large(*pmd);
> > +}
> > +#endif
> >
> >  #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
> >
> > diff --git a/arch/x86/include/asm/pgtable_64_types.h
> > b/arch/x86/include/asm/pgtable_64_types.h
> > index 52e5f5f2240d..bedbd2e7d06c 100644
> > --- a/arch/x86/include/asm/pgtable_64_types.h
> > +++ b/arch/x86/include/asm/pgtable_64_types.h
> > @@ -139,6 +139,14 @@ extern unsigned int ptrs_per_p4d;
> >  # define VMEMMAP_START               __VMEMMAP_BASE_L4
> >  #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
> >
> > +/*
> > + * VMEMMAP_SIZE - allows the whole linear region to be covered by
> > + *                a struct page array.
> > + */
> > +#define VMEMMAP_SIZE         (1UL << (__VIRTUAL_MASK_SHIFT -
> > PAGE_SHIFT - \
> > +                                      1 + ilog2(sizeof(struct page))))
> > +#define VMEMMAP_END          (VMEMMAP_START + VMEMMAP_SIZE)
> > +
> >  #define VMALLOC_END          (VMALLOC_START + (VMALLOC_SIZE_TB <<
> > 40) - 1)
> >
> >  #define MODULES_VADDR                (__START_KERNEL_map +
> > KERNEL_IMAGE_SIZE)
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index f88032c24667..a0ce6f33a717 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1499,6 +1499,14 @@ void free_huge_page(struct page *page)
> >
> >  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >  {
> > +     free_huge_page_vmemmap(h, page);
> > +     /*
> > +      * Because we store preallocated pages on @page->lru,
> > +      * vmemmap_pgtable_free() must be called before the
> > +      * initialization of @page->lru in INIT_LIST_HEAD().
> > +      */
> > +     vmemmap_pgtable_free(page);
> > +
> >       INIT_LIST_HEAD(&page->lru);
> >       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> >       set_hugetlb_cgroup(page, NULL);
> > @@ -1751,6 +1759,14 @@ static struct page *alloc_fresh_huge_page(struct
> > hstate *h,
> >       if (!page)
> >               return NULL;
> >
> > +     if (vmemmap_pgtable_prealloc(h, page)) {
> > +             if (hstate_is_gigantic(h))
> > +                     free_gigantic_page(page, huge_page_order(h));
> > +             else
> > +                     put_page(page);
> > +             return NULL;
> > +     }
> > +
> >       if (hstate_is_gigantic(h))
> >               prep_compound_gigantic_page(page, huge_page_order(h));
> >       prep_new_huge_page(h, page, page_to_nid(page));
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index 332c131c01a8..937562a15f1e 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -74,6 +74,7 @@
> >  #include <linux/pagewalk.h>
> >  #include <linux/mmzone.h>
> >  #include <linux/list.h>
> > +#include <linux/bootmem_info.h>
> >  #include <asm/pgalloc.h>
> >  #include "hugetlb_vmemmap.h"
> >
> > @@ -86,6 +87,8 @@
> >   * reserve at least 2 pages as vmemmap areas.
> >   */
> >  #define RESERVE_VMEMMAP_NR           2U
> > +#define RESERVE_VMEMMAP_SIZE         (RESERVE_VMEMMAP_NR <<
> > PAGE_SHIFT)
> > +#define TAIL_PAGE_REUSE                      -1
> >
> >  #ifndef VMEMMAP_HPAGE_SHIFT
> >  #define VMEMMAP_HPAGE_SHIFT          HPAGE_SHIFT
> > @@ -97,6 +100,21 @@
> >
> >  #define page_huge_pte(page)          ((page)->pmd_huge_pte)
> >
> > +#define vmemmap_hpage_addr_end(addr, end)                             \
> > +({                                                                    \
> > +     unsigned long __boundary;                                        \
> > +     __boundary = ((addr) + VMEMMAP_HPAGE_SIZE) &
> > VMEMMAP_HPAGE_MASK; \
> > +     (__boundary - 1 < (end) - 1) ? __boundary : (end);               \
> > +})
> > +
> > +#ifndef vmemmap_pmd_huge
> > +#define vmemmap_pmd_huge vmemmap_pmd_huge
> > +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> > +{
> > +     return pmd_huge(*pmd);
> > +}
> > +#endif
> > +
> >  static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
> >  {
> >       return h->nr_free_vmemmap_pages;
> > @@ -158,6 +176,176 @@ int vmemmap_pgtable_prealloc(struct hstate *h,
> > struct page *page)
> >       return -ENOMEM;
> >  }
> >
> > +/*
> > + * Walk a vmemmap address to the pmd it maps.
> > + */
> > +static pmd_t *vmemmap_to_pmd(unsigned long page)
> > +{
> > +     pgd_t *pgd;
> > +     p4d_t *p4d;
> > +     pud_t *pud;
> > +     pmd_t *pmd;
> > +
> > +     if (page < VMEMMAP_START || page >= VMEMMAP_END)
> > +             return NULL;
> > +
> > +     pgd = pgd_offset_k(page);
> > +     if (pgd_none(*pgd))
> > +             return NULL;
> > +     p4d = p4d_offset(pgd, page);
> > +     if (p4d_none(*p4d))
> > +             return NULL;
> > +     pud = pud_offset(p4d, page);
> > +
> > +     if (pud_none(*pud) || pud_bad(*pud))
> > +             return NULL;
> > +     pmd = pmd_offset(pud, page);
> > +
> > +     return pmd;
> > +}
> > +
> > +static inline spinlock_t *vmemmap_pmd_lock(pmd_t *pmd)
> > +{
> > +     return pmd_lock(&init_mm, pmd);
> > +}
> > +
> > +static inline int freed_vmemmap_hpage(struct page *page)
> > +{
> > +     return atomic_read(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline int freed_vmemmap_hpage_inc(struct page *page)
> > +{
> > +     return atomic_inc_return_relaxed(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline int freed_vmemmap_hpage_dec(struct page *page)
> > +{
> > +     return atomic_dec_return_relaxed(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline void free_vmemmap_page_list(struct list_head *list)
> > +{
> > +     struct page *page, *next;
> > +
> > +     list_for_each_entry_safe(page, next, list, lru) {
> > +             list_del(&page->lru);
> > +             free_vmemmap_page(page);
> > +     }
> > +}
> > +
> > +static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
> > +                                      unsigned long start,
> > +                                      unsigned long end,
> > +                                      struct list_head *free_pages)
> > +{
> > +     /* Make the tail pages are mapped read-only. */
> > +     pgprot_t pgprot = PAGE_KERNEL_RO;
> > +     pte_t entry = mk_pte(reuse, pgprot);
> > +     unsigned long addr;
> > +
> > +     for (addr = start; addr < end; addr += PAGE_SIZE, ptep++) {
> > +             struct page *page;
> > +             pte_t old = *ptep;
> > +
> > +             VM_WARN_ON(!pte_present(old));
> > +             page = pte_page(old);
> > +             list_add(&page->lru, free_pages);
> > +
> > +             set_pte_at(&init_mm, addr, ptep, entry);
> > +     }
> > +}
> > +
> > +static void __free_huge_page_pmd_vmemmap(struct hstate *h, pmd_t *pmd,
> > +                                      unsigned long addr,
> > +                                      struct list_head *free_pages)
> > +{
> > +     unsigned long next;
> > +     unsigned long start = addr + RESERVE_VMEMMAP_SIZE;
> > +     unsigned long end = addr + vmemmap_pages_size_per_hpage(h);
> > +     struct page *reuse = NULL;
> > +
> > +     addr = start;
> > +     do {
> > +             pte_t *ptep;
> > +
> > +             ptep = pte_offset_kernel(pmd, addr);
> > +             if (!reuse)
> > +                     reuse = pte_page(ptep[TAIL_PAGE_REUSE]);
> > +
> > +             next = vmemmap_hpage_addr_end(addr, end);
> > +             __free_huge_page_pte_vmemmap(reuse, ptep, addr, next,
> > +                                          free_pages);
> > +     } while (pmd++, addr = next, addr != end);
> > +
> > +     flush_tlb_kernel_range(start, end);
> > +}
> > +
> > +static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long
> > addr)
>
> Hi Muchun,
>
> Are you going to restore the pmd mapping after you free the hugetlb? I mean,
> When you free continuous 128MB hugetlb pages with 2MB size, will you
> redo the PMD vmemmap since 2MB PMD can just contain the page struct of
> 128MB memory?

Now we only restore the pmd mapping for the 1GB HugeTLB page. For the
2MB HugeTLB page, we do not(I haven't figured out how to handle it gracefully).

>
> If no, wouldn't it be simpler to only use base pages while populating vmemmap?
> I mean, once we enable the Kconfig option you add for VMEMMAP_FREE, we
> only use base pages to place "page struct" but not split PMD into base pages
> afterwards.
>
> One negative side effect might be that base pages are also used for those pages
> which won't be hugetlb later. but if most pages of host will be hugetlb for
> guest and SPDK, it shouldn't hurt too much.

Yeah, I agree with you. If the user uses a lot of HugeTLB pages(e.g.
SPDK/Guest),
it shouldn't hurt too much. And using base pages while populating vmemmap also
can decrease the overhead(of splitting PMD). In the end, if we don’t
come up with
a more suitable solution to deal with it(mentioned above for 2MB HugeTLB page).
Maybe this is also an idea.

Thanks.

>
> Or at least this can be done for hugetlb reserved by cmdline?
>
> > +{
> > +     int i;
> > +     pgprot_t pgprot = PAGE_KERNEL;
> > +     struct mm_struct *mm = &init_mm;
> > +     struct page *page;
> > +     pmd_t old_pmd, _pmd;
> > +
> > +     old_pmd = READ_ONCE(*pmd);
> > +     page = pmd_page(old_pmd);
> > +     pmd_populate_kernel(mm, &_pmd, pte_p);
> > +
> > +     for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
> > +             pte_t entry, *pte;
> > +
> > +             entry = mk_pte(page + i, pgprot);
> > +             pte = pte_offset_kernel(&_pmd, addr);
> > +             VM_BUG_ON(!pte_none(*pte));
> > +             set_pte_at(mm, addr, pte, entry);
> > +     }
> > +
> > +     /* make pte visible before pmd */
> > +     smp_wmb();
> > +     pmd_populate_kernel(mm, pmd, pte_p);
> > +}
> > +
> > +static void split_vmemmap_huge_page(struct page *head, pmd_t *pmd)
> > +{
> > +     struct page *pte_page, *t_page;
> > +     unsigned long start = (unsigned long)head & VMEMMAP_HPAGE_MASK;
> > +     unsigned long addr = start;
> > +
> > +     list_for_each_entry_safe(pte_page, t_page, &head->lru, lru) {
> > +             list_del(&pte_page->lru);
> > +             VM_BUG_ON(freed_vmemmap_hpage(pte_page));
> > +             split_vmemmap_pmd(pmd++, page_to_virt(pte_page), addr);
> > +             addr += VMEMMAP_HPAGE_SIZE;
> > +     }
> > +
> > +     flush_tlb_kernel_range(start, addr);
> > +}
> > +
> > +void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > +{
> > +     pmd_t *pmd;
> > +     spinlock_t *ptl;
> > +     LIST_HEAD(free_pages);
> > +
> > +     if (!free_vmemmap_pages_per_hpage(h))
> > +             return;
> > +
> > +     pmd = vmemmap_to_pmd((unsigned long)head);
> > +     BUG_ON(!pmd);
> > +
> > +     ptl = vmemmap_pmd_lock(pmd);
> > +     if (vmemmap_pmd_huge(pmd))
> > +             split_vmemmap_huge_page(head, pmd);
> > +
> > +     __free_huge_page_pmd_vmemmap(h, pmd, (unsigned long)head,
> > &free_pages);
> > +     freed_vmemmap_hpage_inc(pmd_page(*pmd));
> > +     spin_unlock(ptl);
> > +
> > +     free_vmemmap_page_list(&free_pages);
> > +}
> > +
> >  void __init hugetlb_vmemmap_init(struct hstate *h)
> >  {
> >       unsigned int order = huge_page_order(h);
> > diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> > index 2a72d2f62411..fb8b77659ed5 100644
> > --- a/mm/hugetlb_vmemmap.h
> > +++ b/mm/hugetlb_vmemmap.h
> > @@ -15,6 +15,7 @@
> >  void __init hugetlb_vmemmap_init(struct hstate *h);
> >  int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page);
> >  void vmemmap_pgtable_free(struct page *page);
> > +void free_huge_page_vmemmap(struct hstate *h, struct page *head);
> >  #else
> >  static inline void hugetlb_vmemmap_init(struct hstate *h)
> >  {
> > @@ -28,5 +29,9 @@ static inline int vmemmap_pgtable_prealloc(struct hstate
> > *h, struct page *page)
> >  static inline void vmemmap_pgtable_free(struct page *page)
> >  {
> >  }
> > +
> > +static inline void free_huge_page_vmemmap(struct hstate *h, struct page
> > *head)
> > +{
> > +}
> >  #endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> >  #endif /* _LINUX_HUGETLB_VMEMMAP_H */
> > --
> > 2.11.0
> >
>
> Thanks
> Barry
>


-- 
Yours,
Muchun




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux