We have introduced the ability to indicate that a VMA's anon_vma field is
'unfaulted', that is, we want page tables to be propagated on fork, but no
anon_vma has yet been initialised.

Utilise that on guard region installation (via MADV_GUARD_INSTALL) to ensure
that page table propagation on fork occurs, but without occupying one byte of
memory more than is required. Note that this is a no-op if a 'real' anon_vma
is already in place.

This also avoids any issue with THP inferring that it should not immediately
attempt huge page collapse. More importantly, for file-backed mappings, this
avoids otherwise unnecessary kernel memory allocation purely for the purposes
of indicating on-fork page table propagation requirements.

We adjust when we do this so that it happens only after a successful guard
region installation, and only when that installation covers at least one
page. This means we set the flag only once guard regions are definitely
installed.

We are safe from a racing fork here, because we hold the mmap read lock, and
fork requires the write lock.

We also adjust MADV_GUARD_REMOVE to remove this flag if the range specified
spans the entire VMA (and no 'real' anon_vma has been installed yet), meaning
we do not cause unnecessary page table propagation.

This is protected from racing with guard region installation through use of
the mm->page_table_lock, which is being used to prevent races between mmap
read-locked modifiers of vma->anon_vma.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
---
 mm/madvise.c | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 388dc289b5d1..0e2ae32f057b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1119,24 +1119,14 @@ static long madvise_guard_install(struct vm_area_struct *vma,
 				 struct vm_area_struct **prev,
 				 unsigned long start, unsigned long end)
 {
-	long err;
+	long err = 0;
+	unsigned long nr_pages;
 	int i;
 
 	*prev = vma;
 	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
 		return -EINVAL;
 
-	/*
-	 * If we install guard markers, then the range is no longer
-	 * empty from a page table perspective and therefore it's
-	 * appropriate to have an anon_vma.
-	 *
-	 * This ensures that on fork, we copy page tables correctly.
-	 */
-	err = anon_vma_prepare(vma);
-	if (err)
-		return err;
-
 	/*
 	 * Optimistically try to install the guard marker pages first. If any
 	 * non-guard pages are encountered, give up and zap the range before
@@ -1150,19 +1140,20 @@ static long madvise_guard_install(struct vm_area_struct *vma,
 	 * with no zap or looping.
 	 */
 	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
-		unsigned long nr_pages = 0;
+		/* We count existing guard region pages each retry also. */
+		nr_pages = 0;
 
 		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
 		err = walk_page_range_mm(vma->vm_mm, start, end,
 					 &guard_install_walk_ops, &nr_pages);
 		if (err < 0)
-			return err;
+			break;
 
 		if (err == 0) {
 			unsigned long nr_expected_pages = PHYS_PFN(end - start);
 
 			VM_WARN_ON(nr_pages != nr_expected_pages);
-			return 0;
+			break;
 		}
 
 		/*
@@ -1172,12 +1163,19 @@ static long madvise_guard_install(struct vm_area_struct *vma,
 		zap_page_range_single(vma, start, end - start, NULL);
 	}
 
+	/* Ensure that page tables are propagated on fork. */
+	if (nr_pages > 0)
+		vma_set_anon_vma_unfaulted(vma);
+
 	/*
 	 * We were unable to install the guard pages due to being raced by page
 	 * faults. This should not happen ordinarily. We return to userspace and
 	 * immediately retry, relieving lock contention.
 	 */
-	return restart_syscall();
+	if (err > 0)
+		return restart_syscall();
+
+	return err;
 }
 
 static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
@@ -1229,6 +1227,8 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
 				 struct vm_area_struct **prev,
 				 unsigned long start, unsigned long end)
 {
+	long err;
+
 	*prev = vma;
 	/*
 	 * We're ok with removing guards in mlock()'d ranges, as this is a
@@ -1237,8 +1237,21 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
 	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
 		return -EINVAL;
 
-	return walk_page_range(vma->vm_mm, start, end,
-			       &guard_remove_walk_ops, NULL);
+	err = walk_page_range(vma->vm_mm, start, end,
+			      &guard_remove_walk_ops, NULL);
+
+	/*
+	 * If we have successfully cleared the guard flags, and we span the
+	 * whole VMA, clear the unfaulted state so this VMA doesn't
+	 * unnecessarily propagate page tables.
+	 *
+	 * The operation is protected via mm->page_table_lock avoiding races
+	 * with a guard install operation.
+	 */
+	if (!err && start == vma->vm_start && end == vma->vm_end)
+		vma_clear_anon_vma_unfaulted(vma);
+
+	return err;
 }
 
 /*
-- 
2.48.1
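
For context, a minimal userspace sketch (not part of the patch) exercising the
install/remove paths touched above, showing why page tables must be propagated
across fork() even though no anonymous pages have been faulted in. It assumes
uapi headers new enough to define the guard advice values; the numeric
fallbacks are the asm-generic ones and should be checked against your headers.

#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* asm-generic values; verify against local uapi */
#define MADV_GUARD_REMOVE  103
#endif

int main(void)
{
	const long page = sysconf(_SC_PAGESIZE);
	const size_t len = 4 * page;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return EXIT_FAILURE;

	/*
	 * Guard the second page. The marker lives purely in page tables, so a
	 * child can only observe it if fork() copies this VMA's page tables,
	 * which the 'unfaulted' anon_vma state arranges without allocating a
	 * real anon_vma.
	 */
	if (madvise(p + page, page, MADV_GUARD_INSTALL))
		return EXIT_FAILURE;

	if (fork() == 0) {
		/*
		 * Child: touching p[page] here would raise SIGSEGV. Removing
		 * guards across the entire VMA also lets the kernel drop the
		 * unfaulted state, per the change above.
		 */
		madvise(p, len, MADV_GUARD_REMOVE);
		p[page] = 1;	/* ordinary anonymous fault again */
		_exit(0);
	}

	wait(NULL);
	munmap(p, len);
	return 0;
}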