On 24/06/22 11:06 pm, James Houghton wrote:
This function is to be used to do a HugeTLB page table walk where we may
need to split a leaf-level huge PTE into a new page table level.
Consider the case where we want to install 4K inside an empty 1G page:
1. We walk to the PUD and notice that it is pte_none.
2. We split the PUD by calling `hugetlb_split_to_shift`, creating a
standard PUD that points to PMDs that are all pte_none.
3. We continue the PT walk to find the PMD. We split it just like we
split the PUD.
4. We find the PTE and give it back to the caller.
To avoid concurrent splitting operations on the same page table entry,
we require that the mapping rwsem is held for writing while collapsing
and for reading when doing a high-granularity PT walk.
Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
include/linux/hugetlb.h | 23 ++++++++++++++
mm/hugetlb.c | 67 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 605aa19d8572..321f5745d87f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1176,14 +1176,37 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
}
#endif /* CONFIG_HUGETLB_PAGE */
+enum split_mode {
+ HUGETLB_SPLIT_NEVER = 0,
+ HUGETLB_SPLIT_NONE = 1 << 0,
+ HUGETLB_SPLIT_PRESENT = 1 << 1,
huge
+ HUGETLB_SPLIT_ALWAYS = HUGETLB_SPLIT_NONE | HUGETLB_SPLIT_PRESENT,
+};
#ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
/* If HugeTLB high-granularity mappings are enabled for this VMA. */
bool hugetlb_hgm_enabled(struct vm_area_struct *vma);
+int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
+ struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned int desired_sz,
+ enum split_mode mode,
+ bool write_locked);
#else
static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
{
return false;
}
+static inline int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
+ struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned int desired_sz,
+ enum split_mode mode,
+ bool write_locked)
+{
+ return -EINVAL;
+}
#endif
static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eaffe7b4f67c..6e0c5fbfe32c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7166,6 +7166,73 @@ static int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *v
tlb_finish_mmu(&tlb);
return ret;
}
+
+/*
+ * Similar to huge_pte_alloc except that this can be used to create or walk
+ * high-granularity mappings. It will automatically split existing HugeTLB PTEs
+ * if required by @mode. The resulting HugeTLB PTE will be returned in @hpte.
+ *
+ * There are four options for @mode:
+ * - HUGETLB_SPLIT_NEVER - Never split.
+ * - HUGETLB_SPLIT_NONE - Split empty PTEs.
+ * - HUGETLB_SPLIT_PRESENT - Split present PTEs.
+ * - HUGETLB_SPLIT_ALWAYS - Split both empty and present PTEs.
+ */
+int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
+ struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned int desired_shift,
+ enum split_mode mode,
+ bool write_locked)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ bool has_write_lock = write_locked;
+ unsigned long desired_sz = 1UL << desired_shift;
+ int ret;
+
+ BUG_ON(!hpte);
+
+ if (has_write_lock)
+ i_mmap_assert_write_locked(mapping);
+ else
+ i_mmap_assert_locked(mapping);
+
+retry:
+ ret = 0;
+ hugetlb_pte_init(hpte);
+
+ ret = hugetlb_walk_to(mm, hpte, addr, desired_sz,
+ !(mode & HUGETLB_SPLIT_NONE));
Can hugetlb_walk_to change mappings when it is called in HUGETLB_SPLIT_NONE mode?
If so, should we ensure that we are holding the write lock here?
+ if (ret || hugetlb_pte_size(hpte) == desired_sz)
+ goto out;
+
+ if (
+ ((mode & HUGETLB_SPLIT_NONE) && hugetlb_pte_none(hpte)) ||
+ ((mode & HUGETLB_SPLIT_PRESENT) &&
+ hugetlb_pte_present_leaf(hpte))
+ ) {
+ if (!has_write_lock) {
+ i_mmap_unlock_read(mapping);
Should a lock upgrade be used here?
+ i_mmap_lock_write(mapping);
+ has_write_lock = true;
+ goto retry;
+ }
+ ret = hugetlb_split_to_shift(mm, vma, hpte, addr,
+ desired_shift);
+ }
+
+out:
+ if (has_write_lock && !write_locked) {
+ /* Drop the write lock. */
+ i_mmap_unlock_write(mapping);
+ i_mmap_lock_read(mapping);
Same here: should a lock downgrade be used?
+ has_write_lock = false;
+ goto retry;
+ }
+
+ return ret;
+}
#endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
/*