Ryan Roberts <ryan.roberts@xxxxxxx> writes:

<snip>

> /*
>  * On some architectures hardware does not set page access bit when accessing
>  * memory page, it is responsibility of software setting this bit. It brings
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a5497..8a87a488950c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
>  	return 0;
>  }
>
> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> +				       struct page *page, pte_t *pte,
> +				       unsigned long addr, unsigned long end,
> +				       pte_t ptent, bool enforce_uffd_wp,
> +				       int *dirty_nr, int *writable_nr)
> +{
> +	int floops;
> +	int i;
> +	unsigned long pfn;
> +	bool prot_none;
> +	bool uffd_wp;
> +
> +	if (!folio_test_large(folio))
> +		return 1;
> +
> +	/*
> +	 * Loop either to `end` or to end of folio if its contiguously mapped,
> +	 * whichever is smaller.
> +	 */
> +	floops = (end - addr) >> PAGE_SHIFT;
> +	floops = min_t(int, floops,
> +		       folio_pfn(folio_next(folio)) - page_to_pfn(page));

Much better, thanks for addressing my comments here.

> +
> +	pfn = page_to_pfn(page);
> +	prot_none = pte_protnone(ptent);
> +	uffd_wp = pte_uffd_wp(ptent);
> +
> +	*dirty_nr = !!pte_dirty(ptent);
> +	*writable_nr = !!pte_write(ptent);
> +
> +	pfn++;
> +	pte++;
> +
> +	for (i = 1; i < floops; i++) {
> +		ptent = ptep_get(pte);
> +
> +		if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
> +		    prot_none != pte_protnone(ptent) ||
> +		    (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
> +			break;
> +
> +		if (pte_dirty(ptent))
> +			(*dirty_nr)++;
> +		if (pte_write(ptent))
> +			(*writable_nr)++;
> +
> +		pfn++;
> +		pte++;
> +	}
> +
> +	return i;
> +}
> +
>  /*
> - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
> - * is required to copy this pte.
> + * Copy set of contiguous ptes. Returns number of ptes copied if succeeded
> + * (always gte 1), or -EAGAIN if one preallocated page is required to copy the
> + * first pte.
>   */
>  static inline int
> -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> -		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
> -		 struct folio **prealloc)
> +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> +		  pte_t *dst_pte, pte_t *src_pte,
> +		  unsigned long addr, unsigned long end,
> +		  int *rss, struct folio **prealloc)
>  {
>  	struct mm_struct *src_mm = src_vma->vm_mm;
>  	unsigned long vm_flags = src_vma->vm_flags;
>  	pte_t pte = ptep_get(src_pte);
>  	struct page *page;
>  	struct folio *folio;
> +	int nr = 1;
> +	bool anon = false;
> +	bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
> +	int nr_dirty = !!pte_dirty(pte);
> +	int nr_writable = !!pte_write(pte);
> +	int i, ret;
>
>  	page = vm_normal_page(src_vma, addr, pte);
> -	if (page)
> +	if (page) {
>  		folio = page_folio(page);
> -	if (page && folio_test_anon(folio)) {
> -		/*
> -		 * If this page may have been pinned by the parent process,
> -		 * copy the page immediately for the child so that we'll always
> -		 * guarantee the pinned page won't be randomly replaced in the
> -		 * future.
> -		 */
> -		folio_get(folio);
> -		if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
> -			/* Page may be pinned, we have to copy. */
> -			folio_put(folio);
> -			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> -						 addr, rss, prealloc, page);
> +		anon = folio_test_anon(folio);
> +		nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
> +						pte, enforce_uffd_wp, &nr_dirty,
> +						&nr_writable);
> +		folio_ref_add(folio, nr);
> +
> +		for (i = 0; i < nr; i++, page++) {
> +			if (anon) {
> +				/*
> +				 * If this page may have been pinned by the
> +				 * parent process, copy the page immediately for
> +				 * the child so that we'll always guarantee the
> +				 * pinned page won't be randomly replaced in the
> +				 * future.
> +				 */
> +				if (unlikely(page_try_dup_anon_rmap(
> +						page, false, src_vma))) {
> +					if (i != 0)
> +						break;
> +					/* Page may be pinned, we have to copy. */
> +					folio_ref_sub(folio, nr);
> +					ret = copy_present_page(
> +						dst_vma, src_vma, dst_pte,
> +						src_pte, addr, rss, prealloc,
> +						page);
> +					return ret == 0 ? 1 : ret;
> +				}
> +				rss[MM_ANONPAGES]++;
> +				VM_BUG_ON(PageAnonExclusive(page));
> +			} else {
> +				page_dup_file_rmap(page, false);
> +				rss[mm_counter_file(page)]++;
> +			}
>  		}
> -		rss[MM_ANONPAGES]++;
> -	} else if (page) {
> -		folio_get(folio);
> -		page_dup_file_rmap(page, false);
> -		rss[mm_counter_file(page)]++;
> -	}
>
> -	/*
> -	 * If it's a COW mapping, write protect it both
> -	 * in the parent and the child
> -	 */
> -	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
> -		ptep_set_wrprotect(src_mm, addr, src_pte);
> -		pte = pte_wrprotect(pte);
> +		if (i < nr) {
> +			folio_ref_sub(folio, nr - i);
> +			nr = i;
> +		}
>  	}
> -	VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
>
>  	/*
> -	 * If it's a shared mapping, mark it clean in
> -	 * the child
> +	 * If it's a shared mapping, mark it clean and write protected in the
> +	 * child, and rely on a write fault to fix up the permissions. This
> +	 * allows determining batch size without having to consider RO/RW
> +	 * permissions. As an optimization, skip wrprotect if all ptes in the
> +	 * batch have the same permissions.
> +	 *
> +	 * If its a private (CoW) mapping, mark it dirty in the child if _any_
> +	 * of the parent mappings in the block were marked dirty. The contiguous
> +	 * block of mappings are all backed by the same folio, so if any are
> +	 * dirty then the whole folio is dirty. This allows determining batch
> +	 * size without having to consider the dirty bit. Further, write protect
> +	 * it both in the parent and the child so that a future write will cause
> +	 * a CoW. As as an optimization, skip the wrprotect if all the ptes in
> +	 * the batch are already readonly.
>  	 */
> -	if (vm_flags & VM_SHARED)
> +	if (vm_flags & VM_SHARED) {
>  		pte = pte_mkclean(pte);
> -	pte = pte_mkold(pte);
> +		if (nr_writable > 0 && nr_writable < nr)
> +			pte = pte_wrprotect(pte);
> +	} else {
> +		if (nr_dirty)
> +			pte = pte_mkdirty(pte);
> +		if (nr_writable) {
> +			ptep_set_wrprotects(src_mm, addr, src_pte, nr);
> +			pte = pte_wrprotect(pte);
> +		}
> +	}
>
> -	if (!userfaultfd_wp(dst_vma))
> +	pte = pte_mkold(pte);
> +	pte = pte_clear_soft_dirty(pte);
> +	if (!enforce_uffd_wp)
>  		pte = pte_clear_uffd_wp(pte);
>
> -	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
> -	return 0;
> +	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
> +	return nr;

I don't have any further comments and you have addressed my previous ones so
feel free to add:

Reviewed-by: Alistair Popple <apopple@xxxxxxxxxx>

However whilst I think the above CoW sequence looks correct it would be nice
if someone else could take a look as well.
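To make the intended end state of that CoW sequence easier to double-check,
here is a minimal userspace model of the batch flag logic described in the
big comment above. Everything in it (struct model_pte, child_flags()) is
hypothetical and purely illustrative; the real code of course manipulates
pte_t values via pte_mkclean()/pte_mkdirty()/pte_wrprotect() and also
write-protects the parent's ptes.

/* Hypothetical model of a batch's first pte; not the kernel's pte_t. */
#include <stdbool.h>
#include <stdio.h>

struct model_pte {
	bool write;
	bool dirty;
};

/*
 * Derive the child's flags for a batch of nr contiguous ptes, given how
 * many of the parent's ptes were writable/dirty (nr_writable/nr_dirty).
 */
static struct model_pte child_flags(struct model_pte first, bool shared,
				    int nr, int nr_writable, int nr_dirty)
{
	struct model_pte child = first;

	if (shared) {
		/* Shared: always clean; wrprotect only if permissions are mixed. */
		child.dirty = false;
		if (nr_writable > 0 && nr_writable < nr)
			child.write = false;
	} else {
		/* Private (CoW): dirty if any pte in the batch was dirty... */
		if (nr_dirty)
			child.dirty = true;
		/* ...and write protected if any was writable (parent too, in the real code). */
		if (nr_writable)
			child.write = false;
	}
	return child;
}

int main(void)
{
	/* e.g. a private batch of 4 ptes, 2 of them writable, 1 dirty */
	struct model_pte first = { .write = true, .dirty = false };
	struct model_pte c = child_flags(first, false, 4, 2, 1);

	printf("child: write=%d dirty=%d\n", c.write, c.dirty);
	return 0;
}

In that example the child's batch comes out write-protected and dirty, which
is what the comment argues for: the whole folio is treated as dirty and a
future write takes the CoW fault.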
>  }
>
>  static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
> @@ -1021,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  	int rss[NR_MM_COUNTERS];
>  	swp_entry_t entry = (swp_entry_t){0};
>  	struct folio *prealloc = NULL;
> +	int nr_ptes;
>
>  again:
>  	progress = 0;
> @@ -1051,6 +1146,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  	arch_enter_lazy_mmu_mode();
>
>  	do {
> +		nr_ptes = 1;
> +
>  		/*
>  		 * We are holding two locks at this point - either of them
>  		 * could generate latencies in another task on another CPU.
> @@ -1086,16 +1183,21 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  			 * the now present pte.
>  			 */
>  			WARN_ON_ONCE(ret != -ENOENT);
> +			ret = 0;
>  		}
> -		/* copy_present_pte() will clear `*prealloc' if consumed */
> -		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
> -				       addr, rss, &prealloc);
> +		/* copy_present_ptes() will clear `*prealloc' if consumed */
> +		nr_ptes = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
> +					    addr, end, rss, &prealloc);
> +
>  		/*
>  		 * If we need a pre-allocated page for this pte, drop the
>  		 * locks, allocate, and try again.
>  		 */
> -		if (unlikely(ret == -EAGAIN))
> +		if (unlikely(nr_ptes == -EAGAIN)) {
> +			ret = -EAGAIN;
>  			break;
> +		}
> +
>  		if (unlikely(prealloc)) {
>  			/*
>  			 * pre-alloc page cannot be reused by next time so as
> @@ -1106,8 +1208,9 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>  			folio_put(prealloc);
>  			prealloc = NULL;
>  		}
> -		progress += 8;
> -	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
> +		progress += 8 * nr_ptes;
> +	} while (dst_pte += nr_ptes, src_pte += nr_ptes,
> +		 addr += PAGE_SIZE * nr_ptes, addr != end);
>
>  	arch_leave_lazy_mmu_mode();
>  	pte_unmap_unlock(orig_src_pte, src_ptl);
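One small aside on the copy_pte_range() hunk at the end: the batching only
changes how far each iteration advances, which the following simplified
standalone sketch tries to capture. copy_batch() is a made-up stand-in for
copy_present_ptes() (no kernel types involved), so treat it as illustration
only.

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Made-up stand-in: report how many entries this "batch" covers (>= 1). */
static int copy_batch(unsigned long addr, unsigned long end)
{
	unsigned long remaining = (end - addr) / PAGE_SIZE;

	return remaining < 4 ? (int)remaining : 4;
}

int main(void)
{
	unsigned long addr = 0, end = 10 * PAGE_SIZE;
	int progress = 0, nr_ptes;

	do {
		nr_ptes = copy_batch(addr, end);
		/* Same shape as the patch: progress and the cursors move by nr_ptes. */
		progress += 8 * nr_ptes;
	} while (addr += PAGE_SIZE * nr_ptes, addr != end);

	printf("progress=%d after copying %lu pages\n", progress, end / PAGE_SIZE);
	return 0;
}

The `progress += 8 * nr_ptes` presumably keeps the existing progress-based
resched heuristic counting per pte rather than per batch.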