A read fault causes the zero page to be mapped read-only. A subsequent write fault causes the zero page to be replaced with a zero-filled private anonymous page. Change the write fault behaviour to replace the zero page with a large anonymous folio, allocated using the same policy as if the write fault had happened without the previous read fault. Experimentation shows that reading multiple contiguous pages is extremely rare without interleaved writes, so we don't bother to map a large zero page. We just use the small zero page as a marker and expand the allocation at the write fault. Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx> --- mm/memory.c | 115 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 35 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 61cec97a57f3..fac686e9f895 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3110,6 +3110,23 @@ static inline int check_ptes_contig_ro(pte_t *pte, int nr, unsigned long pfn) return nr; } +/* + * Checks that all ptes are none except for the pte at offset, which should be + * entry. Returns index of first pte that does not meet expectations, or nr if + * all are correct. 
+ */ +static inline int check_ptes_none_or_entry(pte_t *pte, int nr, + pte_t entry, unsigned long offset) +{ + int ret; + + ret = check_ptes_none(pte, offset); + if (ret == offset && pte_same(pte[offset], entry)) + ret += 1 + check_ptes_none(pte + offset + 1, nr - offset - 1); + + return ret; +} + static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order) { /* @@ -3141,6 +3158,7 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order) pte_t *pte; pte_t *first_set = NULL; int ret; + unsigned long offset; if (has_transparent_hugepage()) { order = min(order, PMD_SHIFT - PAGE_SHIFT); @@ -3148,7 +3166,8 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order) for (; order > 1; order--) { nr = 1 << order; addr = ALIGN_DOWN(vmf->address, nr << PAGE_SHIFT); - pte = vmf->pte - ((vmf->address - addr) >> PAGE_SHIFT); + offset = ((vmf->address - addr) >> PAGE_SHIFT); + pte = vmf->pte - offset; /* Check vma bounds. */ if (addr < vma->vm_start || @@ -3163,8 +3182,9 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order) if (pte <= first_set) continue; - /* Need to check if all the ptes are none. */ - ret = check_ptes_none(pte, nr); + /* Need to check if all the ptes are none or entry. */ + ret = check_ptes_none_or_entry(pte, nr, + vmf->orig_pte, offset); if (ret == nr) break; @@ -3479,13 +3499,15 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct mmu_notifier_range range; int ret; pte_t orig_pte; - unsigned long addr = vmf->address; - int order = 0; - int pgcount = BIT(order); - unsigned long offset = 0; + unsigned long addr; + int order; + int pgcount; + unsigned long offset; unsigned long pfn; struct page *page; int i; + bool zero; + bool anon; delayacct_wpcopy_start(); @@ -3494,36 +3516,54 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (unlikely(anon_vma_prepare(vma))) goto oom; + /* + * Set the upper bound of the folio allocation order. 
If we hit a zero + * page, we allocate a folio with the same policy as allocation upon + * write fault. If we are copying an anon folio, then limit ourself to + * its order as we don't want to copy from multiple folios. For all + * other cases (e.g. file-mapped) CoW a single page. + */ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_folio = vma_alloc_movable_folio(vma, vmf->address, 0, true); - if (!new_folio) - goto oom; - } else { - if (old_folio && folio_test_anon(old_folio)) { - order = min_t(int, folio_order(old_folio), + zero = true; + anon = false; + order = max_anon_folio_order(vma); + } else if (old_folio && folio_test_anon(old_folio)) { + zero = false; + anon = true; + order = min_t(int, folio_order(old_folio), max_anon_folio_order(vma)); + } else { + zero = false; + anon = false; + order = 0; + } + retry: - /* - * Estimate the folio order to allocate. We are not - * under the ptl here so this estimate needs to be - * re-checked later once we have the lock. - */ - vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - order = calc_anon_folio_order_copy(vmf, old_folio, order); - pte_unmap(vmf->pte); - } + /* + * Estimate the folio order to allocate. We are not under the ptl here + * so this estimate needs to be re-checked later once we have the lock. + */ + if (zero || anon) { + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + order = zero ? calc_anon_folio_order_alloc(vmf, order) : + calc_anon_folio_order_copy(vmf, old_folio, order); + pte_unmap(vmf->pte); + } - new_folio = try_vma_alloc_movable_folio(vma, vmf->address, - order, false); - if (!new_folio) - goto oom; + /* Allocate the new folio. */ + new_folio = try_vma_alloc_movable_folio(vma, vmf->address, order, zero); + if (!new_folio) + goto oom; - /* We may have been granted less than we asked for. 
*/ - order = folio_order(new_folio); - pgcount = BIT(order); - addr = ALIGN_DOWN(vmf->address, pgcount << PAGE_SHIFT); - offset = ((vmf->address - addr) >> PAGE_SHIFT); + /* We may have been granted less than we asked for. */ + order = folio_order(new_folio); + pgcount = BIT(order); + addr = ALIGN_DOWN(vmf->address, pgcount << PAGE_SHIFT); + offset = ((vmf->address - addr) >> PAGE_SHIFT); + pfn = pte_pfn(vmf->orig_pte) - offset; + /* Copy contents. */ + if (!zero) { if (likely(old_folio)) ret = __wp_page_copy_user_range(&new_folio->page, vmf->page - offset, @@ -3561,8 +3601,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte(s) - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - pfn = pte_pfn(vmf->orig_pte) - offset; - if (likely(check_ptes_contig_ro(vmf->pte, pgcount, pfn) == pgcount)) { + + if (zero) + ret = check_ptes_none_or_entry(vmf->pte, pgcount, + vmf->orig_pte, offset); + else + ret = check_ptes_contig_ro(vmf->pte, pgcount, pfn); + + if (likely(ret == pgcount)) { if (old_folio) { if (!folio_test_anon(old_folio)) { VM_BUG_ON(order != 0); @@ -3570,8 +3616,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } } else { - VM_BUG_ON(order != 0); - inc_mm_counter(mm, MM_ANONPAGES); + add_mm_counter(mm, MM_ANONPAGES, pgcount); } flush_cache_range(vma, addr, addr + (pgcount << PAGE_SHIFT)); -- 2.25.1