On 31.07.24 14:21, David Hildenbrand wrote:
We recently made GUP's common page table walking code to also walk hugetlb
VMAs without most hugetlb special-casing, preparing for the future of
having less hugetlb-specific page table walking code in the codebase.
Turns out that we missed one page table locking detail: page table locking
for hugetlb folios that are not mapped using a single PMD/PUD.
James, Peter,
the following seems to get the job done. Thoughts?
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e462205400d..776dc3914d9e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -938,10 +938,40 @@ static inline bool htlb_allow_alloc_fallback(int reason)
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
- if (huge_page_size(h) == PMD_SIZE)
+ unsigned long size = huge_page_size(h);
+
+ VM_WARN_ON(size == PAGE_SIZE);
+
+ /*
+ * hugetlb must use the exact same PT locks as core-mm page table
+ * walkers would. When modifying a PTE table, hugetlb must take the
+ * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD
+ * PT lock etc.
+ *
+ * The expectation is that any hugetlb folio smaller than a PMD is
+ * always mapped into a single PTE table and that any hugetlb folio
+ * smaller than a PUD (but at least as big as a PMD) is always mapped
+ * into a single PMD table.
+ *
+ * If that does not hold for an architecture, then that architecture
+ * must disable split PT locks such that all *_lockptr() functions
+ * will give us the same result: the per-MM PT lock.
+ *
+ * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where
+ * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use the MM PT lock
+ * directly with a PMD hugetlb size, whereby core-mm would call
+ * pmd_lockptr() instead. However, in such configurations split PMD
+ * locks are disabled -- split locks don't make sense on a single
+ * PGDIR page table -- and the end result is the same.
+ */
+ if (size >= P4D_SIZE)
+ return &mm->page_table_lock;
+ else if (size >= PUD_SIZE)
+ return pud_lockptr(mm, (pud_t *) pte);
+ else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE))
return pmd_lockptr(mm, (pmd_t *) pte);
- VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
- return &mm->page_table_lock;
+ /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */
+ return ptep_lockptr(mm, pte);
}
#ifndef hugepages_supported
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a890a1731c14..bd219ac9c026 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2869,6 +2869,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}
+static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
+{
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
+ BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
+ return ptlock_ptr(virt_to_ptdesc(pte));
+}
+
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
/*
@@ -2893,6 +2900,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}
+static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
+{
+ return &mm->page_table_lock;
+}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
--
2.45.2
--
Cheers,
David / dhildenb