The following patches need to wait for migration and take rmap locks before they work with the pte itself. This feature is a compact change and is therefore extracted into this patch. In order to wait for migration when a page is being migrated, a new flag is added to pagewalk to optionally enable waiting for migration at the walk_pte_range_inner() level. A similar flag is added to take rmap locks at the same level. When waiting for migration, the pte lock and rmap locks must be dropped and taken again after the migration has ended. The same mechanism is used if pte_entry() sets ACTION_AGAIN, which happens in the following patch when a deadlock is encountered because of a different lock order used during the page update. Migration waiting is done only at the PTE level and presumes that no pmd entry is specified. If pmd_entry() is set together with the page migration flag, a warning is logged. PMD migration waiting can be implemented later if anyone needs it. At this time, flags can be specified only by calling walk_page_vma(). If needed, flags can also be added to other pagewalk API calls. 
Signed-off-by: Jakub Matěna <matenajakub@xxxxxxxxx> --- fs/proc/task_mmu.c | 4 +-- include/linux/pagewalk.h | 11 ++++++- include/linux/rmap.h | 2 ++ mm/mremap.c | 17 +--------- mm/pagewalk.c | 71 +++++++++++++++++++++++++++++++++++++--- mm/rmap.c | 16 +++++++++ 6 files changed, 97 insertions(+), 24 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f46060eb91b5..fd72263456e9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -794,7 +794,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, #endif /* mmap_lock is held in m_start */ if (!start) - walk_page_vma(vma, ops, mss); + walk_page_vma(vma, ops, mss, 0); else walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } @@ -1938,7 +1938,7 @@ static int show_numa_map(struct seq_file *m, void *v) seq_puts(m, " huge"); /* mmap_lock is held by m_start */ - walk_page_vma(vma, &show_numa_ops, md); + walk_page_vma(vma, &show_numa_ops, md, 0); if (!md->pages) goto out; diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index ac7b38ad5903..07345df51324 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -70,6 +70,13 @@ enum page_walk_action { ACTION_AGAIN = 2 }; +/* Walk flags */ + +/* Wait for migration before pte entry, not implemented for pmd entries */ +#define WALK_MIGRATION 0x1 +/* Take rmap locks before pte entries */ +#define WALK_LOCK_RMAP 0x2 + /** * struct mm_walk - walk_page_range data * @ops: operation to call during the walk @@ -77,6 +84,7 @@ enum page_walk_action { * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) * @vma: vma currently walked (NULL if walking outside vmas) * @action: next action to perform (see enum page_walk_action) + * @flags: flags performing additional operations (see walk flags) * @no_vma: walk ignoring vmas (vma will always be NULL) * @private: private data for callbacks' usage * @@ -88,6 +96,7 @@ struct mm_walk { pgd_t *pgd; struct vm_area_struct *vma; enum page_walk_action action; + 
unsigned long flags; bool no_vma; void *private; }; @@ -100,7 +109,7 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, pgd_t *pgd, void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, - void *private); + void *private, unsigned long flags); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 17230c458341..d2d5e511dd93 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -138,6 +138,8 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) */ void anon_vma_init(void); /* create anon_vma_cachep */ int __anon_vma_prepare(struct vm_area_struct *); +void take_rmap_locks(struct vm_area_struct *vma); +void drop_rmap_locks(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); diff --git a/mm/mremap.c b/mm/mremap.c index 75cda854ec58..309fab7ed706 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -24,6 +24,7 @@ #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> +#include <linux/rmap.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -101,22 +102,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return pmd; } -static void take_rmap_locks(struct vm_area_struct *vma) -{ - if (vma->vm_file) - i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma->anon_vma) - anon_vma_lock_write(vma->anon_vma); -} - -static void drop_rmap_locks(struct vm_area_struct *vma) -{ - if (vma->anon_vma) - anon_vma_unlock_write(vma->anon_vma); - if (vma->vm_file) - i_mmap_unlock_write(vma->vm_file->f_mapping); -} - static pte_t move_soft_dirty_pte(pte_t pte) { /* diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b3db11a4d1d..0bfb8c9255f3 100644 
--- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,6 +3,9 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/rmap.h> /* * We want to know the real level where a entry is located ignoring any @@ -20,14 +23,62 @@ static int real_depth(int depth) return depth; } +/* + * Relock pte lock and optionally rmap locks to prevent possible deadlock + * @pte: Locked pte + * @addr: Address of the pte + * @walk: Pagewalk structure + * @ptl: Pte spinlock + * @pmd: Pmd to wait for migration * + */ +static void walk_pte_relock(pte_t **pte, unsigned long addr, struct mm_walk *walk, + spinlock_t *ptl, pmd_t *pmd) +{ + if (walk->no_vma) + pte_unmap(*pte); + else + pte_unmap_unlock(*pte, ptl); + + if (walk->flags & WALK_LOCK_RMAP) + drop_rmap_locks(walk->vma); + + if (walk->flags & WALK_MIGRATION) + migration_entry_wait(walk->mm, pmd, addr); + + if (walk->flags & WALK_LOCK_RMAP) + take_rmap_locks(walk->vma); + + if (walk->no_vma) + *pte = pte_offset_map(pmd, addr); + else + *pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); +} + static int walk_pte_range_inner(pte_t *pte, unsigned long addr, - unsigned long end, struct mm_walk *walk) + unsigned long end, struct mm_walk *walk, + spinlock_t *ptl, pmd_t *pmd) { const struct mm_walk_ops *ops = walk->ops; int err = 0; for (;;) { + walk->action = ACTION_SUBTREE; + if ((walk->flags & WALK_MIGRATION) && !pte_present(*pte)) { + swp_entry_t entry; + + if (!pte_none(*pte)) { + entry = pte_to_swp_entry(*pte); + if (is_migration_entry(entry)) { + walk_pte_relock(&pte, addr, walk, ptl, pmd); + continue; /* retry iteration */ + } + } + } err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (walk->action == ACTION_AGAIN) { + walk_pte_relock(&pte, addr, walk, ptl, pmd); + continue; /* retry iteration */ + } if (err) break; if (addr >= end - PAGE_SIZE) @@ -45,16 +96,22 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 
int err = 0; spinlock_t *ptl; + if (walk->flags & WALK_LOCK_RMAP) + take_rmap_locks(walk->vma); + if (walk->no_vma) { pte = pte_offset_map(pmd, addr); - err = walk_pte_range_inner(pte, addr, end, walk); + err = walk_pte_range_inner(pte, addr, end, walk, ptl, pmd); pte_unmap(pte); } else { pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - err = walk_pte_range_inner(pte, addr, end, walk); + err = walk_pte_range_inner(pte, addr, end, walk, ptl, pmd); pte_unmap_unlock(pte, ptl); } + if (walk->flags & WALK_LOCK_RMAP) + drop_rmap_locks(walk->vma); + return err; } @@ -124,8 +181,11 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ - if (ops->pmd_entry) + if (ops->pmd_entry) { + /* Migration waiting is not implemented for pmd entries */ + WARN_ON_ONCE(walk->flags & WALK_MIGRATION); err = ops->pmd_entry(pmd, addr, next, walk); + } if (err) break; @@ -507,13 +567,14 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, - void *private) + void *private, unsigned long flags) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, + .flags = flags }; int err; diff --git a/mm/rmap.c b/mm/rmap.c index fedb82371efe..d4d95ada0946 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2200,6 +2200,22 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, EXPORT_SYMBOL_GPL(make_device_exclusive_range); #endif +void take_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); +} + +void drop_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + if (vma->vm_file) + i_mmap_unlock_write(vma->vm_file->f_mapping); +} + void __put_anon_vma(struct anon_vma *anon_vma) 
{ struct anon_vma *root = anon_vma->root; -- 2.35.1