On 17.12.24 06:12, Alistair Popple wrote:
Currently DAX folio/page reference counts are managed differently to
normal pages. To allow these to be managed the same as normal pages
introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio
and take references as it would for a normally mapped page.
This is distinct from the current mechanism, vmf_insert_pfn_pud, which
simply inserts a special devmap PUD entry into the page table without
holding a reference to the page for the mapping.
Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
---
include/linux/huge_mm.h | 11 +++++-
mm/huge_memory.c | 96 ++++++++++++++++++++++++++++++++++++------
2 files changed, 95 insertions(+), 12 deletions(-)
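For context, a fs/dax-style PUD fault handler would presumably end up using the new helper along these lines (hypothetical caller, not part of this patch; my_lookup_pud_folio() is made up just to show the flow):

static vm_fault_t my_huge_fault_pud(struct vm_fault *vmf)
{
        struct folio *folio;
        bool write = vmf->flags & FAULT_FLAG_WRITE;

        /* However the filesystem/driver finds its PUD-sized folio ... */
        folio = my_lookup_pud_folio(vmf);       /* hypothetical helper */
        if (!folio)
                return VM_FAULT_FALLBACK;

        /* Maps the whole folio and takes the references, per the changelog. */
        return vmf_insert_folio_pud(vmf, folio, write);
}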
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 93e509b..012137b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_UNSUPPORTED,
@@ -458,6 +459,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
}
+static inline bool is_huge_zero_pud(pud_t pud)
+{
+ return false;
+}
+
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
@@ -604,6 +610,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return false;
}
+static inline bool is_huge_zero_pud(pud_t pud)
+{
+ return false;
+}
+
I'm really not a fan of these, because I assume we won't implement them any
time soon. (Who would waste 1 GiB or more on faster reading of zeroes?)
static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
{
return;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 120cd2c..5081808 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1482,19 +1482,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
pgprot_t prot = vma->vm_page_prot;
pud_t entry;
- spinlock_t *ptl;
- ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
- goto out_unlock;
+ return;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- goto out_unlock;
+ return;
}
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
@@ -1508,9 +1506,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
-
-out_unlock:
- spin_unlock(ptl);
}
/**
@@ -1528,6 +1523,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1545,10 +1541,55 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
track_pfn_insert(vma, &pgprot, pfn);
+ ptl = pud_lock(vma->vm_mm, vmf->pud);
insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ spin_unlock(ptl);
+
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
+ * @vmf: Structure describing the fault
+ * @folio: folio to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PUD_MASK;
+ pfn_t pfn = pfn_to_pfn_t(folio_pfn(folio));
+ pud_t *pud = vmf->pud;
+ pgprot_t prot = vma->vm_page_prot;
See below; pfn, prot and page can likely go.
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ struct page *page;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &prot, pfn);
Oh, why is that required? We are inserting a folio, yet we start messing
with VM_PAT on x86, which only applies to VM_PFNMAP mappings? :)
(See also the sketch after this function.)
+
+ ptl = pud_lock(mm, pud);
+ if (pud_none(*vmf->pud)) {
+ page = pfn_t_to_page(pfn);
Why are we suddenly working with that pfn_t witchcraft? :)
+ folio = page_folio(page);
Ehm, you already have the folio; it was passed into this function.
Why can't that simply be
folio_get(folio);
folio_add_file_rmap_pud(folio, folio_page(folio, 0), vma);
+ folio_get(folio);
+ folio_add_file_rmap_pud(folio, page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+ }
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ spin_unlock(ptl);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
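FWIW, putting the comments above together, I'd expect the whole thing to end
up looking roughly like the following. Completely untested sketch, assuming
insert_pfn_pud() keeps its pfn_t parameter as in this patch, and with the
track_pfn_insert() call dropped per my question above:

vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
                bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long addr = vmf->address & PUD_MASK;
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
                return VM_FAULT_SIGBUS;

        ptl = pud_lock(mm, vmf->pud);
        if (pud_none(*vmf->pud)) {
                /* Take the reference and rmap the folio before mapping it. */
                folio_get(folio);
                folio_add_file_rmap_pud(folio, folio_page(folio, 0), vma);
                add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
        }
        insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
                       write);
        spin_unlock(ptl);

        return VM_FAULT_NOPAGE;
}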
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -2146,7 +2187,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else if (is_huge_zero_pmd(orig_pmd)) {
- zap_deposited_table(tlb->mm, pmd);
+ if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
struct folio *folio = NULL;
@@ -2634,12 +2676,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
- /* No support for anonymous PUD pages yet */
- BUG();
+ struct page *page = NULL;
+ struct folio *folio;
+
+ /* No support for anonymous PUD pages or migration yet */
+ BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
Please use VM_WARN_ON_ONCE() instead.
+
+ page = pud_page(orig_pud);
+ folio = page_folio(page);
+ folio_remove_rmap_pud(folio, page, vma);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
Please drop that or do something like
VM_WARN_ON_ONCE(page != folio_page(folio, 0));
(see also the combined sketch after this hunk)
+ add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
}
return 1;
}
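I.e., with the two comments above applied, the tail of zap_huge_pud() would
look roughly like this (untested, just to illustrate the combination):

        } else {
                struct folio *folio;
                struct page *page;

                /* No support for anonymous PUD pages or migration yet */
                VM_WARN_ON_ONCE(vma_is_anonymous(vma) || !pud_present(orig_pud));

                page = pud_page(orig_pud);
                folio = page_folio(page);
                VM_WARN_ON_ONCE(page != folio_page(folio, 0));

                folio_remove_rmap_pud(folio, page, vma);
                add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);

                spin_unlock(ptl);
                tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
        }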
@@ -2647,6 +2701,8 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
+ pud_t old_pud;
+
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
@@ -2654,7 +2710,23 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush(vma, haddr, pud);
+ old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+ if (is_huge_zero_pud(old_pud))
+ return;
+
+ if (vma_is_dax(vma)) {
Maybe you want
if (!vma_is_dax(vma))
return;
to then reduce the indentation. I suspect all other splitting code (besides
anon memory) will want to do the same thing here in the future.
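I.e., something like this (sketch only; the DAX-specific body stays as in
your patch, just with one indentation level less):

        old_pud = pudp_huge_clear_flush(vma, haddr, pud);
        if (is_huge_zero_pud(old_pud))
                return;

        if (!vma_is_dax(vma))
                return;

        /* DAX-specific rmap/counter teardown from this patch goes here. */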
--
Cheers,
David / dhildenb