"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: > This is in preparation to update radix to implement vmemmap optimization > for devdax. Below are the rules w.r.t radix vmemmap mapping > > 1. First try to map things using PMD (2M) > 2. With altmap if altmap cross-boundary check returns true, fall back to > PAGE_SIZE > 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to > PAGE_SIZE > > On removing vmemmap mapping, check if every subsection that is using the > vmemmap area is invalid. If found to be invalid, that implies we can safely > free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 > because with 64K page size, we need to do the above check even at the > PAGE_SIZE granularity. > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> > --- > arch/powerpc/include/asm/book3s/64/radix.h | 2 + > arch/powerpc/include/asm/pgtable.h | 3 + > arch/powerpc/mm/book3s64/radix_pgtable.c | 319 +++++++++++++++++++-- > arch/powerpc/mm/init_64.c | 26 +- > 4 files changed, 319 insertions(+), 31 deletions(-) > > diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h > index 8cdff5a05011..87d4c1e62491 100644 > --- a/arch/powerpc/include/asm/book3s/64/radix.h > +++ b/arch/powerpc/include/asm/book3s/64/radix.h > @@ -332,6 +332,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned long start, > unsigned long phys); > int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, > int node, struct vmem_altmap *altmap); > +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, > + struct vmem_altmap *altmap); > extern void radix__vmemmap_remove_mapping(unsigned long start, > unsigned long page_size); > > diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h > index 9972626ddaf6..6d4cd2ebae6e 100644 > --- a/arch/powerpc/include/asm/pgtable.h > +++ b/arch/powerpc/include/asm/pgtable.h > @@ -168,6 +168,9 @@ static inline bool is_ioremap_addr(const void *x) > > struct seq_file; > void arch_report_meminfo(struct seq_file *m); > +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); > +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, > + unsigned long page_size); > #endif /* CONFIG_PPC64 */ > > #endif /* __ASSEMBLY__ */ > diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c > index d7e2dd3d4add..ef886fab643d 100644 > --- a/arch/powerpc/mm/book3s64/radix_pgtable.c > +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c > @@ -742,8 +742,57 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) > p4d_clear(p4d); > } > > +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) > +{ > + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); > + > + return !vmemmap_populated(start, PMD_SIZE); > +} > + > +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) > +{ > + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); > + > + return !vmemmap_populated(start, PAGE_SIZE); > + > +} > + > +static void __meminit free_vmemmap_pages(struct page *page, > + struct vmem_altmap *altmap, > + int order) > +{ > + unsigned int nr_pages = 1 << order; > + > + if (altmap) { > + unsigned long alt_start, alt_end; > + unsigned long base_pfn = page_to_pfn(page); > + > + /* > + * with 1G vmemmap mmaping we can have things setup > + * such that even though atlmap is specified we never > + * used altmap. 
> + */ > + alt_start = altmap->base_pfn; > + alt_end = altmap->base_pfn + altmap->reserve + > + altmap->free + altmap->alloc + altmap->align; > + > + if (base_pfn >= alt_start && base_pfn < alt_end) { > + vmem_altmap_free(altmap, nr_pages); > + return; > + } > + } > + > + if (PageReserved(page)) { > + /* allocated from memblock */ > + while (nr_pages--) > + free_reserved_page(page++); > + } else > + free_pages((unsigned long)page_address(page), order); > +} > + > static void remove_pte_table(pte_t *pte_start, unsigned long addr, > - unsigned long end, bool direct) > + unsigned long end, bool direct, > + struct vmem_altmap *altmap) > { > unsigned long next, pages = 0; > pte_t *pte; > @@ -757,24 +806,23 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, > if (!pte_present(*pte)) > continue; > > - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { > - /* > - * The vmemmap_free() and remove_section_mapping() > - * codepaths call us with aligned addresses. > - */ > - WARN_ONCE(1, "%s: unaligned range\n", __func__); > - continue; > + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { > + if (!direct) > + free_vmemmap_pages(pte_page(*pte), altmap, 0); > + pte_clear(&init_mm, addr, pte); > + pages++; > + } else if (!direct && vmemmap_page_is_unused(addr, next)) { > + free_vmemmap_pages(pte_page(*pte), altmap, 0); > + pte_clear(&init_mm, addr, pte); > } > - > - pte_clear(&init_mm, addr, pte); > - pages++; > } > if (direct) > update_page_count(mmu_virtual_psize, -pages); > } > > static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, > - unsigned long end, bool direct) > + unsigned long end, bool direct, > + struct vmem_altmap *altmap) > { > unsigned long next, pages = 0; > pte_t *pte_base; > @@ -788,18 +836,21 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, > continue; > > if (pmd_is_leaf(*pmd)) { > - if (!IS_ALIGNED(addr, PMD_SIZE) || > - !IS_ALIGNED(next, PMD_SIZE)) { > - WARN_ONCE(1, "%s: unaligned range\n", __func__); > - continue; > + if (IS_ALIGNED(addr, PMD_SIZE) && > + IS_ALIGNED(next, PMD_SIZE)) { > + if (!direct) > + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); > + pte_clear(&init_mm, addr, (pte_t *)pmd); > + pages++; > + } else if (vmemmap_pmd_is_unused(addr, next)) { > + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); > + pte_clear(&init_mm, addr, (pte_t *)pmd); > } > - pte_clear(&init_mm, addr, (pte_t *)pmd); > - pages++; > continue; > } > > pte_base = (pte_t *)pmd_page_vaddr(*pmd); > - remove_pte_table(pte_base, addr, next, direct); > + remove_pte_table(pte_base, addr, next, direct, altmap); > free_pte_table(pte_base, pmd); > } > if (direct) > @@ -807,7 +858,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, > } > > static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, > - unsigned long end, bool direct) > + unsigned long end, bool direct, > + struct vmem_altmap *altmap) > { > unsigned long next, pages = 0; > pmd_t *pmd_base; > @@ -832,15 +884,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, > } > > pmd_base = pud_pgtable(*pud); > - remove_pmd_table(pmd_base, addr, next, direct); > + remove_pmd_table(pmd_base, addr, next, direct, altmap); > free_pmd_table(pmd_base, pud); > } > if (direct) > update_page_count(MMU_PAGE_1G, -pages); > } > > -static void __meminit remove_pagetable(unsigned long start, unsigned long end, > - bool direct) > +static void __meminit > +remove_pagetable(unsigned long 
> +		 struct vmem_altmap *altmap)
>  {
>  	unsigned long addr, next;
>  	pud_t *pud_base;
> @@ -869,7 +922,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end,
>  		}
>
>  		pud_base = p4d_pgtable(*p4d);
> -		remove_pud_table(pud_base, addr, next, direct);
> +		remove_pud_table(pud_base, addr, next, direct, altmap);
>  		free_pud_table(pud_base, p4d);
>  	}
>
> @@ -892,7 +945,7 @@ int __meminit radix__create_section_mapping(unsigned long start,
>
>  int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
>  {
> -	remove_pagetable(start, end, true);
> +	remove_pagetable(start, end, true, NULL);
>  	return 0;
>  }
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> @@ -924,10 +977,224 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
>  	return 0;
>  }
>
> +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
> +				unsigned long addr, unsigned long next)
> +{
> +	int large = pmd_large(*pmd);
> +
> +	if (pmd_large(*pmd))

We already have the value of pmd_large() in the "large" variable, so can't we just use "if (large)" here?

> +		vmemmap_verify((pte_t *)pmd, node, addr, next);

Maybe we can use the pmdp_ptep() helper here, the one we used in the 1st patch? Also, shouldn't the function argument be named pmdp instead of pmd?

> +
> +	return large;
> +}
> +
> +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
> +			       unsigned long addr, unsigned long next)
> +{
> +	pte_t entry;
> +	pte_t *ptep = pmdp_ptep(pmdp);
> +
> +	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
> +	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
> +	set_pte_at(&init_mm, addr, ptep, entry);
> +	asm volatile("ptesync": : :"memory");
> +
> +	vmemmap_verify(ptep, node, addr, next);
> +}
> +
> +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> +						     struct vmem_altmap *altmap,
> +						     struct page *reuse)
> +{
> +	pte_t *pte = pte_offset_kernel(pmd, addr);
> +
> +	if (pte_none(*pte)) {
> +		pte_t entry;
> +		void *p;
> +
> +		if (!reuse) {
> +			/*
> +			 * make sure we don't create altmap mappings
> +			 * covering things outside the device.
> +			 */
> +			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
> +				altmap = NULL;
> +
> +			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> +			if (!p) {
> +				if (altmap)
> +					p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
> +				if (!p)
> +					return NULL;
> +			}

The if conditions above are quite confusing on a first read. Can we do this instead? Did I get it right?

	if (!p && altmap)
		p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
	if (!p)
		return NULL;
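For clarity, here is how the whole allocation path in radix__vmemmap_pte_populate() would then read. This is just a sketch of the suggestion above, untested, and it reuses the altmap_cross_boundary() and vmemmap_alloc_block_buf() helpers from this patch; it is meant to be behaviourally identical to the nested version:

		if (!reuse) {
			/*
			 * Make sure we don't create altmap mappings
			 * covering things outside the device.
			 */
			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
				altmap = NULL;

			/* Try the device altmap first (if one applies here)... */
			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			/*
			 * ...and fall back to regular memory only when the
			 * first attempt used an altmap and it was exhausted.
			 */
			if (!p && altmap)
				p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
			if (!p)
				return NULL;
		}

-ritesh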