On 02/18/23 00:27, James Houghton wrote: > These functions are used to allocate new PTEs below the hstate PTE. This > will be used by hugetlb_walk_step, which implements stepping forwards in > a HugeTLB high-granularity page table walk. > > The reasons that we don't use the standard pmd_alloc/pte_alloc* > functions are: > 1) This prevents us from accidentally overwriting swap entries or > attempting to use swap entries as present non-leaf PTEs (see > pmd_alloc(); we assume that !pte_none means pte_present and > non-leaf). > 2) Locking hugetlb PTEs can be different than regular PTEs. (Although, as > implemented right now, locking is the same.) > 3) We can maintain compatibility with CONFIG_HIGHPTE. That is, HugeTLB > HGM won't use HIGHPTE, but the kernel can still be built with it, > and other mm code will use it. > > When GENERAL_HUGETLB supports P4D-based hugepages, we will need to > implement hugetlb_pud_alloc to implement hugetlb_walk_step. > > Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index eeacadf3272b..9d839519c875 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -72,6 +72,11 @@ unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte) > > bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte, pte_t pte); > > +pmd_t *hugetlb_alloc_pmd(struct mm_struct *mm, struct hugetlb_pte *hpte, > + unsigned long addr); > +pte_t *hugetlb_alloc_pte(struct mm_struct *mm, struct hugetlb_pte *hpte, > + unsigned long addr); > + > struct hugepage_subpool { > spinlock_t lock; > long count; > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 6c74adff43b6..bb424cdf79e4 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -483,6 +483,120 @@ static bool has_same_uncharge_info(struct file_region *rg, > #endif > } > > +/* > + * hugetlb_alloc_pmd -- Allocate or find a PMD beneath a PUD-level hpte. 
> + * > + * This is meant to be used to implement hugetlb_walk_step when one must > + * step down to a PMD. Different architectures may implement hugetlb_walk_step > + * differently, but hugetlb_alloc_pmd and hugetlb_alloc_pte are architecture- > + * independent. > + * > + * Returns: > + * On success: the pointer to the PMD. This should be placed into a > + * hugetlb_pte. @hpte is not changed. > + * ERR_PTR(-EINVAL): hpte is not PUD-level > + * ERR_PTR(-EEXIST): there is a non-leaf and non-empty PUD in @hpte I often get this confused, should this really be 'non-leaf'? Because, ... > + * ERR_PTR(-ENOMEM): could not allocate the new PMD > + */ > +pmd_t *hugetlb_alloc_pmd(struct mm_struct *mm, struct hugetlb_pte *hpte, > + unsigned long addr) > +{ > + spinlock_t *ptl = hugetlb_pte_lockptr(hpte); > + pmd_t *new; > + pud_t *pudp; > + pud_t pud; > + > + if (hpte->level != HUGETLB_LEVEL_PUD) > + return ERR_PTR(-EINVAL); > + > + pudp = (pud_t *)hpte->ptep; > +retry: > + pud = READ_ONCE(*pudp); > + if (likely(pud_present(pud))) > + return unlikely(pud_leaf(pud)) > + ? ERR_PTR(-EEXIST) > + : pmd_offset(pudp, addr); ... it seems we return -EEXIST in the pud_leaf case. -- Mike Kravetz > + else if (!pud_none(pud)) > + /* > + * Not present and not none means that a swap entry lives here, > + * and we can't get rid of it. > + */ > + return ERR_PTR(-EEXIST); > + > + new = pmd_alloc_one(mm, addr); > + if (!new) > + return ERR_PTR(-ENOMEM); > + > + spin_lock(ptl); > + if (!pud_same(pud, *pudp)) { > + spin_unlock(ptl); > + pmd_free(mm, new); > + goto retry; > + } > + > + mm_inc_nr_pmds(mm); > + smp_wmb(); /* See comment in pmd_install() */ > + pud_populate(mm, pudp, new); > + spin_unlock(ptl); > + return pmd_offset(pudp, addr); > +} > + > +/* > + * hugetlb_alloc_pte -- Allocate a PTE beneath a pmd_none PMD-level hpte. > + * > + * See the comment above hugetlb_alloc_pmd. 
> + */ > +pte_t *hugetlb_alloc_pte(struct mm_struct *mm, struct hugetlb_pte *hpte, > + unsigned long addr) > +{ > + spinlock_t *ptl = hugetlb_pte_lockptr(hpte); > + pgtable_t new; > + pmd_t *pmdp; > + pmd_t pmd; > + > + if (hpte->level != HUGETLB_LEVEL_PMD) > + return ERR_PTR(-EINVAL); > + > + pmdp = (pmd_t *)hpte->ptep; > +retry: > + pmd = READ_ONCE(*pmdp); > + if (likely(pmd_present(pmd))) > + return unlikely(pmd_leaf(pmd)) > + ? ERR_PTR(-EEXIST) > + : pte_offset_kernel(pmdp, addr); > + else if (!pmd_none(pmd)) > + /* > + * Not present and not none means that a swap entry lives here, > + * and we can't get rid of it. > + */ > + return ERR_PTR(-EEXIST); > + > + /* > + * With CONFIG_HIGHPTE, calling `pte_alloc_one` directly may result > + * in page tables being allocated in high memory, needing a kmap to > + * access. Instead, we call __pte_alloc_one directly with > + * GFP_PGTABLE_USER to prevent these PTEs being allocated in high > + * memory. > + */ > + new = __pte_alloc_one(mm, GFP_PGTABLE_USER); > + if (!new) > + return ERR_PTR(-ENOMEM); > + > + spin_lock(ptl); > + if (!pmd_same(pmd, *pmdp)) { > + spin_unlock(ptl); > + pgtable_pte_page_dtor(new); > + __free_page(new); > + goto retry; > + } > + > + mm_inc_nr_ptes(mm); > + smp_wmb(); /* See comment in pmd_install() */ > + pmd_populate(mm, pmdp, new); > + spin_unlock(ptl); > + return pte_offset_kernel(pmdp, addr); > +} > + > static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) > { > struct file_region *nrg, *prg; > -- > 2.39.2.637.g21b0678d19-goog >