Re: [RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

Oscar Salvador <osalvador@xxxxxxx> · Fri, 24 May 2024 12:02:09 +0200

On Fri, May 17, 2024 at 09:00:01PM +0200, Christophe Leroy wrote:
> In order to fit better with standard Linux page tables layout, add
> support for 8M pages using contiguous PTE entries in a standard
> page table. Page tables will then be populated with 1024 similar
> entries and two PMD entries will point to that page table.
> 
> The PMD entries also get a flag to tell it is addressing an 8M page,
> this is required for the HW tablewalk assistance.
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>

I guess that this will slightly change if you remove patch#1 and patch#2
as you said you will.
So I will not comment on the overall design because I do not know how it will
look afterwards, but here are just some things that caught my eye:

> --- a/arch/powerpc/include/asm/hugetlb.h
> +++ b/arch/powerpc/include/asm/hugetlb.h
> @@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
>  static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>  					    unsigned long addr, pte_t *ptep)
>  {
> -	return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
> +	pmd_t *pmdp = (pmd_t *)ptep;
> +	pte_t pte;
> +
> +	if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {

There are quite some places where you do the "pmd_off" to check whether that
is a 8MB entry.
I think it would make some sense to have some kind of macro/function to make
it clearer what we are checking against.
e.g:

 #define pmd_is_SZ_8M(mm, addr, pmdp) (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
 (or whatever name you see fit)

then you would just need

 if (IS_ENABLED(CONFIG_PPC_8xx) && pmd_is_SZ_8M(mm, addr, pmdp))

Because I see that is also scattered in 8xx code.

> +		pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1));
> +		pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);

I have this fresh one because I recently read about 8xx pagetables, but not sure
how my memory will survive this, so maybe throw a little comment in there that
we are pointing the two pmds to the area.

Also, the way we pass the parameters here to pte_update() is a bit awkward.
Ideally we should be using some meaningful names?

 clr_all_bits = ~0UL
 set_bits = 0
 bool is_huge = true

 pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr_all_bits, set_bits, is_huge)

or something along those lines

> -static inline int check_and_get_huge_psize(int shift)
> -{
> -	return shift_to_mmu_psize(shift);
> +	if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))

Here you could also use the pmd_is_SZ_8M()

> +		ptep = pte_offset_kernel(pmdp, 0);
> +	return ptep_get(ptep);
>  }
>  
>  #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> @@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
>  static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>  				  pte_t *ptep, unsigned long sz)
>  {
> -	pte_update(mm, addr, ptep, ~0UL, 0, 1);
> +	pmd_t *pmdp = (pmd_t *)ptep;
> +
> +	if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> +		pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1);
> +		pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
> +	} else {
> +		pte_update(mm, addr, ptep, ~0UL, 0, 1);
> +	}

Could we not leverage this in huge_ptep_get_and_clear()?
AFAICS,

 huge_ptep_get_and_clear(mm, addr, pte_t *p)
 {
      pte_t pte = pte_val(*p);

      huge_pte_clear(mm, addr, p);
      return pte;
 }

Or maybe it is not that easy if different powerpc platforms provide their own.
It might be worth checking though.

>  }
>  
>  #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> @@ -63,7 +50,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
>  	unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0)));
>  	unsigned long set = pte_val(pte_wrprotect(__pte(0)));
>  
> -	pte_update(mm, addr, ptep, clr, set, 1);
> +	pmd_t *pmdp = (pmd_t *)ptep;
> +
> +	if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> +		pte_update(mm, addr, pte_offset_kernel(pmdp, 0), clr, set, 1);
> +		pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr, set, 1);
> +	} else {
> +		pte_update(mm, addr, ptep, clr, set, 1);

I would replace the "1" with "is_huge" or "huge", as is done in
__ptep_set_access_flags, something that makes it more clear without the need
to check pte_update().

>  #endif /* _ASM_POWERPC_PGALLOC_32_H */
> diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> index 07df6b664861..b05cc4f87713 100644
> --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
...
> - * For other page sizes, we have a single entry in the table.
> + * For 8M pages, we have 1024 entries as if it was
> + * 4M pages, but they are flagged as 8M pages for the hardware.

Maybe drop a comment that a single PMD entry is worth 4MB, so

> + * For 4k pages, we have a single entry in the table.
>   */
> -static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
> -static int hugepd_ok(hugepd_t hpd);
> -
>  static inline int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
>  {
>  	if (!huge)
>  		return PAGE_SIZE / SZ_4K;
> -	else if (hugepd_ok(*((hugepd_t *)pmd)))
> -		return 1;
> +	else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M)
> +		return SZ_4M / SZ_4K;

this becomes more intuitive.

> +static inline void pmd_populate_kernel_size(struct mm_struct *mm, pmd_t *pmdp,
> +					    pte_t *pte, unsigned long sz)
> +{
> +	if (sz == SZ_8M)
> +		*pmdp = __pmd(__pa(pte) | _PMD_PRESENT | _PMD_PAGE_8M);
> +	else
> +		*pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
> +}
> +
> +static inline void pmd_populate_size(struct mm_struct *mm, pmd_t *pmdp,
> +				     pgtable_t pte_page, unsigned long sz)
> +{
> +	if (sz == SZ_8M)
> +		*pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
> +	else
> +		*pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT);
> +}

In patch#1 you mentioned this will change with the removal of patch#1
and patch#2.

> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -183,9 +183,6 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>  	if (!hpdp)
>  		return NULL;
>  
> -	if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
> -		return pte_alloc_huge(mm, (pmd_t *)hpdp, addr, sz);
> -
>  	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
>  
>  	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
> @@ -198,10 +195,18 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>  pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>  		      unsigned long addr, unsigned long sz)
>  {
> +	pmd_t *pmd = pmd_off(mm, addr);
> +
>  	if (sz < PMD_SIZE)
> -		return pte_alloc_huge(mm, pmd_off(mm, addr), addr, sz);
> +		return pte_alloc_huge(mm, pmd, addr, sz);
>  
> -	return NULL;
> +	if (sz != SZ_8M)
> +		return NULL;
> +	if (!pte_alloc_huge(mm, pmd, addr, sz))
> +		return NULL;
> +	if (!pte_alloc_huge(mm, pmd + 1, addr, sz))
> +		return NULL;
> +	return (pte_t *)pmd;

I think that having the check for invalid huge page sizes upfront would
make more sense, maybe just a matter of taste.

 /* Unsupported size */
 if (sz > PMD_SIZE && sz != SZ_8M)
     return NULL;

 if (sz < PMD_SIZE)
    ...
 /* 8MB huge pages */
 ...

 return (pte_t *) pmd;

Also, I am not a big fan of the two separate pte_alloc_huge() for pmd#0+pmd#1,
and I am thinking we might want to hide that within a function and drop a
comment in there explaining why we are updating both pmds.

> diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
> index d93433e26ded..99f656b3f9f3 100644
> --- a/arch/powerpc/mm/nohash/8xx.c
> +++ b/arch/powerpc/mm/nohash/8xx.c
> @@ -48,20 +48,6 @@ unsigned long p_block_mapped(phys_addr_t pa)
>  	return 0;
>  }
>  
> -static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va)
> -{
> -	if (hpd_val(*pmdp) == 0) {
> -		pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
> -
> -		if (!ptep)
> -			return NULL;
> -
> -		hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M);
> -		hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M);
> -	}
> -	return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
> -}
> -
>  static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
>  					     pgprot_t prot, int psize, bool new)

Am I blind or do we never use the 'new' parameter?
I checked the tree and it seems we always pass it 'true'.

arch/powerpc/mm/nohash/8xx.c:		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
arch/powerpc/mm/nohash/8xx.c:		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
arch/powerpc/mm/nohash/8xx.c:		err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
arch/powerpc/mm/nohash/8xx.c:
__early_map_kernel_hugepage(VIRT_IMMR_BASE, PHYS_IMMR_BASE, PAGE_KERNEL_NCG, MMU_PAGE_512K, true);

I think we can drop the 'new' and the block code that tries to handle
it?

> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index acdf64c9b93e..59f0d7706d2f 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c

> +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> +		     pte_t pte, unsigned long sz)
> +{
> +	pmd_t *pmdp = pmd_off(mm, addr);
> +
> +	pte = set_pte_filter(pte, addr);
> +
> +	if (sz == SZ_8M) {
> +		__set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte));
> +		__set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M);

You also mentioned that this would slightly change after you drop
patch#1 and patch#2.
The only comment I have right now would be to add a little comment
explaining the layout (the replication of 1024 entries), or just
something like "see comment from number_of_cells_per_pte".

-- 
Oscar Salvador
SUSE Labs