On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@xxxxxxxxxxxxx> wrote:
>
> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@xxxxxxxxxx> wrote:
> >
> >
> > On Sun, 17 Jan 2021, Muchun Song wrote:
> >
> > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> > > index ce4be1fa93c2..3b146d5949f3 100644
> > > --- a/mm/sparse-vmemmap.c
> > > +++ b/mm/sparse-vmemmap.c
> > > @@ -29,6 +29,7 @@
> > >  #include <linux/sched.h>
> > >  #include <linux/pgtable.h>
> > >  #include <linux/bootmem_info.h>
> > > +#include <linux/delay.h>
> > >
> > >  #include <asm/dma.h>
> > >  #include <asm/pgalloc.h>
> > > @@ -40,7 +41,8 @@
> > >   * @remap_pte:	called for each non-empty PTE (lowest-level) entry.
> > >   * @reuse_page:	the page which is reused for the tail vmemmap pages.
> > >   * @reuse_addr:	the virtual address of the @reuse_page page.
> > > - * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
> > > + * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
> > > + *			or is mapped from.
> > >   */
> > >  struct vmemmap_remap_walk {
> > >  	void (*remap_pte)(pte_t *pte, unsigned long addr,
> > > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
> > >  	struct list_head *vmemmap_pages;
> > >  };
> > >
> > > +/* The gfp mask of allocating vmemmap page */
> > > +#define GFP_VMEMMAP_PAGE	\
> > > +	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
> > > +
> >
> > This is unnecessary, just use the gfp mask directly in the allocator.
>
> Will do. Thanks.
>
> > >  static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
> > >  			      unsigned long end,
> > >  			      struct vmemmap_remap_walk *walk)
> > > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
> > >  	free_vmemmap_page_list(&vmemmap_pages);
> > >  }
> > >
> > > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> > > +				struct vmemmap_remap_walk *walk)
> > > +{
> > > +	pgprot_t pgprot = PAGE_KERNEL;
> > > +	struct page *page;
> > > +	void *to;
> > > +
> > > +	BUG_ON(pte_page(*pte) != walk->reuse_page);
> > > +
> > > +	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> > > +	list_del(&page->lru);
> > > +	to = page_to_virt(page);
> > > +	copy_page(to, (void *)walk->reuse_addr);
> > > +
> > > +	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> > > +}
> > > +
> > > +static void alloc_vmemmap_page_list(struct list_head *list,
> > > +				    unsigned long start, unsigned long end)
> > > +{
> > > +	unsigned long addr;
> > > +
> > > +	for (addr = start; addr < end; addr += PAGE_SIZE) {
> > > +		struct page *page;
> > > +		int nid = page_to_nid((const void *)addr);
> > > +
> > > +retry:
> > > +		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
> > > +		if (unlikely(!page)) {
> > > +			msleep(100);
> > > +			/*
> > > +			 * We should retry infinitely, because we cannot
> > > +			 * handle allocation failures. Once we allocate
> > > +			 * vmemmap pages successfully, then we can free
> > > +			 * a HugeTLB page.
> > > +			 */
> > > +			goto retry;
> >
> > Ugh, I don't think this will work, there's no guarantee that we'll ever
> > succeed and now we can't free a 2MB hugepage because we cannot allocate a
> > 4KB page.  We absolutely have to ensure we make forward progress here.
>
> This can trigger an OOM when there is no memory and kill someone to release
> some memory. Right?
>
> >
> > We're going to be freeing the hugetlb page after this succeeds, can we
> > not use part of the hugetlb page that we're freeing for this memory
> > instead?
>
> It seems like a good idea. We can try to allocate memory first; if successful,
> just use the new page to remap (it can reduce memory fragmentation).
> If not, we can use part of the hugetlb page to remap. What's your opinion
> about this?

What if the HugeTLB page is a gigantic page which is allocated from
CMA? In this case, we cannot use part of the hugetlb page to remap.
Right?

> > > +		}
> > > +		list_add_tail(&page->lru, list);
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
> > > + *			  to the page which is from the @vmemmap_pages
> > > + *			  respectively.
> > > + * @start:	start address of the vmemmap virtual address range.
> > > + * @end:	end address of the vmemmap virtual address range.
> > > + * @reuse:	reuse address.
> > > + */
> > > +void vmemmap_remap_alloc(unsigned long start, unsigned long end,
> > > +			 unsigned long reuse)
> > > +{
> > > +	LIST_HEAD(vmemmap_pages);
> > > +	struct vmemmap_remap_walk walk = {
> > > +		.remap_pte	= vmemmap_restore_pte,
> > > +		.reuse_addr	= reuse,
> > > +		.vmemmap_pages	= &vmemmap_pages,
> > > +	};
> > > +
> > > +	might_sleep();
> > > +
> > > +	/* See the comment in the vmemmap_remap_free(). */
> > > +	BUG_ON(start - reuse != PAGE_SIZE);
> > > +
> > > +	alloc_vmemmap_page_list(&vmemmap_pages, start, end);
> > > +	vmemmap_remap_range(reuse, end, &walk);
> > > +}
> > > +
> > >  /*
> > >   * Allocate a block of memory to be used to back the virtual memory map
> > >   * or to back the page tables that are used to create the mapping.
> > > --
> > > 2.11.0
> > >
> >
>
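To make David's review points concrete, here is a minimal sketch (not the code that was merged, and not anyone's proposal verbatim) of an allocation path that uses the gfp mask directly in the allocator and guarantees forward progress by failing instead of retrying forever; the int return value and the assumption that the caller simply keeps the HugeTLB page unfreed on -ENOMEM are illustrative choices, and the snippet relies on the headers already included by mm/sparse-vmemmap.c:

static int alloc_vmemmap_page_list(struct list_head *list,
				   unsigned long start, unsigned long end)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
			 __GFP_THISNODE;
	struct page *page, *next;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		/*
		 * @addr points into the vmemmap, i.e. at a struct page, so
		 * allocate the backing page on the node of the memory that
		 * struct page describes.
		 */
		int nid = page_to_nid((struct page *)addr);

		/*
		 * __GFP_RETRY_MAYFAIL tries hard but is still allowed to
		 * fail, so this loop cannot get stuck: on failure the
		 * caller keeps the HugeTLB page instead of freeing it.
		 */
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	/* Give back whatever was allocated before the failure. */
	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		__free_page(page);
	}
	return -ENOMEM;
}

If the fallback of carving the vmemmap pages out of the HugeTLB page being freed is pursued instead, the gigantic-page case raised above is the sticking point: a gigantic page allocated from CMA has to be handed back to its CMA area as one contiguous block (via cma_release()), so its constituent pages cannot be kept behind as vmemmap pages.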