The number of page faults can be reduced by populating PTEs in batches.
The batch of PTEs being populated is not allowed to cross:
  - page table boundaries
  - the vma range
  - the large folio size
  - fault_around_bytes

fault_around_bytes allows the user to control the batch size if they
intend to do so.

Signed-off-by: Yin Fengwei <fengwei.yin@xxxxxxxxx>
---
* based on next-20230112

 mm/memory.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 56b571c83a0e..755e6e590481 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -104,6 +104,10 @@ EXPORT_SYMBOL(mem_map);
 #endif
 
 static vm_fault_t do_fault(struct vm_fault *vmf);
+static inline bool allowed_batched_set_ptes(struct vm_fault *vmf,
+		struct page *page);
+static void do_set_multi_ptes(struct vm_fault *vmf, struct page *page,
+		unsigned long addr);
 
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
@@ -4359,10 +4363,16 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 
 	/* Re-check under ptl */
 	if (likely(!vmf_pte_changed(vmf))) {
-		do_set_pte(vmf, page, vmf->address);
+		if (allowed_batched_set_ptes(vmf, page))
+			do_set_multi_ptes(vmf, page, vmf->address);
+		else {
+			do_set_pte(vmf, page, vmf->address);
 
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
+			/* no need to invalidate: a not-present page
+			 * won't be cached
+			 */
+			update_mmu_cache(vma, vmf->address, vmf->pte);
+		}
 
 		ret = 0;
 	} else {
@@ -4476,6 +4486,92 @@ static inline bool should_fault_around(struct vm_fault *vmf)
 	return fault_around_bytes >> PAGE_SHIFT > 1;
 }
 
+/* Return true if we can batch-populate PTEs for the fault, false otherwise */
+static inline bool allowed_batched_set_ptes(struct vm_fault *vmf,
+		struct page *page)
+{
+	struct folio *folio = page_folio(page);
+
+	if (uffd_disable_fault_around(vmf->vma))
+		return false;
+
+	if (!folio_test_large(folio))
+		return false;
+
+	/* TODO: Will revise after anon mapping supports folio */
+	if ((vmf->flags & FAULT_FLAG_WRITE) &&
+			!(vmf->vma->vm_flags & VM_SHARED))
+		return false;
+
+	return fault_around_bytes >> PAGE_SHIFT > 1;
+}
+
+static void do_set_multi_ptes(struct vm_fault *vmf, struct page *pg,
+		unsigned long addr)
+{
+	struct folio *folio = page_folio(pg);
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long size, mask, start, end, folio_start, folio_end;
+	int dist, first_idx, i = 0;
+	pte_t *pte;
+
+	/* in page table range */
+	start = ALIGN_DOWN(addr, PMD_SIZE);
+	end = ALIGN(addr, PMD_SIZE);
+
+	/* in fault_around_bytes range */
+	size = READ_ONCE(fault_around_bytes);
+	mask = ~(size - 1) & PAGE_MASK;
+
+	/* in vma range */
+	start = max3(start, (addr & mask), vma->vm_start);
+	end = min3(end, (addr & mask) + size, vma->vm_end);
+
+	/* folio is locked and referenced. It will not be split or
+	 * removed from page cache in this function.
+	 */
+	folio_start = addr - (folio_page_idx(folio, pg) << PAGE_SHIFT);
+	folio_end = folio_start + (folio_nr_pages(folio) << PAGE_SHIFT);
+
+	/* in folio size range */
+	start = max(start, folio_start);
+	end = min(end, folio_end);
+
+	dist = (addr - start) >> PAGE_SHIFT;
+	first_idx = folio_page_idx(folio, pg) - dist;
+	pte = vmf->pte - dist;
+
+	do {
+		struct page *page = folio_page(folio, first_idx + i);
+		bool write = vmf->flags & FAULT_FLAG_WRITE;
+		bool prefault = page != pg;
+		pte_t entry;
+
+		if (!pte_none(*pte))
+			continue;
+
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
+
+		if (prefault)
+			folio_get(folio);
+
+		if (prefault && arch_wants_old_prefaulted_pte())
+			entry = pte_mkold(entry);
+		else
+			entry = pte_sw_mkyoung(entry);
+
+		if (write)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+		inc_mm_counter(vma->vm_mm, mm_counter_file(&folio->page));
+		page_add_file_rmap(page, vma, false);
+
+		set_pte_at(vma->vm_mm, start, pte, entry);
+		update_mmu_cache(vma, start, pte);
+	} while (pte++, start += PAGE_SIZE, i++, start < end);
+}
+
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
 {
 	vm_fault_t ret = 0;
-- 
2.30.2
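
Not part of the patch itself: below is a minimal userspace sketch of the window
clamping that do_set_multi_ptes() performs, so the four constraints from the
commit message can be checked by hand. All concrete values (PMD_SIZE, the page
size, the vma and folio bounds, fault_around_bytes) are assumed example values
for x86-64 with 4K pages and a 16-page folio; max3ul()/min3ul() merely stand in
for the kernel's max3()/min3() helpers.

#include <stdio.h>

/* Assumed example values: x86-64, 4K base pages, 2M PMD coverage. */
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PMD_SIZE	(1UL << 21)

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN(x, a)		ALIGN_DOWN((x) + (a) - 1, (a))

/* Stand-ins for the kernel's max3()/min3(). */
static unsigned long max3ul(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a > b ? a : b;
	return m > c ? m : c;
}

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned long addr = 0x7f0000012000UL;		/* faulting address */
	unsigned long fault_around_bytes = 65536;	/* default 64K window */
	unsigned long vm_start = 0x7f0000000000UL;	/* example vma bounds */
	unsigned long vm_end = 0x7f0000400000UL;
	unsigned long folio_start = 0x7f0000010000UL;	/* 16-page folio */
	unsigned long folio_end = folio_start + 16 * PAGE_SIZE;
	unsigned long start, end, mask;

	/* in page table range */
	start = ALIGN_DOWN(addr, PMD_SIZE);
	end = ALIGN(addr, PMD_SIZE);

	/* in fault_around_bytes range */
	mask = ~(fault_around_bytes - 1) & PAGE_MASK;

	/* in vma range */
	start = max3ul(start, addr & mask, vm_start);
	end = min3ul(end, (addr & mask) + fault_around_bytes, vm_end);

	/* in folio size range */
	start = start > folio_start ? start : folio_start;
	end = end < folio_end ? end : folio_end;

	printf("batch window: [%#lx, %#lx), %lu PTEs\n",
	       start, end, (end - start) >> PAGE_SHIFT);
	return 0;
}

With these example values the clamped window is [0x7f0000010000, 0x7f0000020000),
i.e. 16 PTEs, so the whole folio would be mapped on a single fault.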