Since we already have a list of mm_structs in the system, add a module
that scans each mm by walking the VMAs of each mm_struct and scanning
all the pages mapped by them. In the scan path, check for recently
accessed pages (folios) belonging to slow-tier nodes and add those
folios to a migration list.

Signed-off-by: Raghavendra K T <raghavendra.kt@xxxxxxx>
---
 mm/kmmscand.c | 268 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 264 insertions(+), 4 deletions(-)

diff --git a/mm/kmmscand.c b/mm/kmmscand.c
index 957128d4e425..0496359d07f5 100644
--- a/mm/kmmscand.c
+++ b/mm/kmmscand.c
@@ -4,12 +4,19 @@
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/pagewalk.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
+#include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/kmmscand.h>
+#include <linux/memory-tiers.h>
+#include <linux/mempolicy.h>
 #include <linux/string.h>
-#include <linux/delay.h>
 #include <linux/cleanup.h>
 
 #include <asm/pgalloc.h>
@@ -32,6 +39,7 @@ static bool need_wakeup;
 static unsigned long kmmscand_sleep_expire;
 
 static DEFINE_SPINLOCK(kmmscand_mm_lock);
+static DEFINE_SPINLOCK(kmmscand_migrate_lock);
 static DECLARE_WAIT_QUEUE_HEAD(kmmscand_wait);
 
 #define KMMSCAND_SLOT_HASH_BITS 10
@@ -41,6 +49,7 @@ static struct kmem_cache *kmmscand_slot_cache __read_mostly;
 
 struct kmmscand_mm_slot {
 	struct mm_slot slot;
+	unsigned long address;
 };
 
 struct kmmscand_scan {
@@ -52,6 +61,21 @@ struct kmmscand_scan kmmscand_scan = {
 	.mm_head = LIST_HEAD_INIT(kmmscand_scan.mm_head),
 };
 
+struct kmmscand_migrate_list {
+	struct list_head migrate_head;
+};
+
+struct kmmscand_migrate_list kmmscand_migrate_list = {
+	.migrate_head = LIST_HEAD_INIT(kmmscand_migrate_list.migrate_head),
+};
+
+struct kmmscand_migrate_info {
+	struct list_head migrate_node;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct folio *folio;
+	unsigned long address;
+};
 static int kmmscand_has_work(void)
 {
 	return !list_empty(&kmmscand_scan.mm_head);
@@ -84,8 +108,140 @@ static void kmmscand_wait_work(void)
 	}
 }
 
-static void kmmscand_migrate_folio(void)
+static bool kmmscand_eligible_srcnid(int nid)
 {
+	if (!node_is_toptier(nid))
+		return true;
+	return false;
+}
+
+static bool folio_idle_clear_pte_refs_one(struct folio *folio,
+					  struct vm_area_struct *vma,
+					  unsigned long addr,
+					  pte_t *ptep)
+{
+	bool referenced = false;
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t *pmd = pmd_off(mm, addr);
+
+	if (ptep) {
+		if (ptep_clear_young_notify(vma, addr, ptep))
+			referenced = true;
+	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+		if (!pmd_present(*pmd))
+			WARN_ON_ONCE(1);
+		if (pmdp_clear_young_notify(vma, addr, pmd))
+			referenced = true;
+	} else {
+		WARN_ON_ONCE(1);
+	}
+
+	if (referenced) {
+		folio_clear_idle(folio);
+		folio_set_young(folio);
+	}
+	return true;
+}
+
+static void page_idle_clear_pte_refs(struct page *page, pte_t *pte, struct mm_walk *walk)
+{
+	bool need_lock;
+	struct folio *folio = page_folio(page);
+	unsigned long address;
+
+	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
+		return;
+
+	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+	if (need_lock && !folio_trylock(folio))
+		return;
+	address = vma_address(walk->vma, page_pgoff(folio, page), compound_nr(page));
+	VM_BUG_ON_VMA(address == -EFAULT, walk->vma);
+	folio_idle_clear_pte_refs_one(folio, walk->vma, address, pte);
+
+	if (need_lock)
+		folio_unlock(folio);
+}
+
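+/*
+ * Per-PTE callback of the scan walk: sample and clear the accessed state
+ * of the mapped folio via the page_idle helpers above, and queue recently
+ * accessed folios residing on slow-tier (non-toptier) nodes onto the
+ * migration list passed in via walk->private.
+ */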
+static int hot_vma_idle_pte_entry(pte_t *pte,
+				  unsigned long addr,
+				  unsigned long next,
+				  struct mm_walk *walk)
+{
+	struct page *page;
+	struct folio *folio;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct kmmscand_migrate_info *info;
+	struct kmmscand_migrate_list *migrate_list = walk->private;
+	int srcnid;
+
+	pte_t pteval = ptep_get(pte);
+
+	if (!pte_present(pteval))
+		return 0;
+
+	vma = walk->vma;
+	mm = vma->vm_mm;
+	page = pte_page(pteval);
+
+	page_idle_clear_pte_refs(page, pte, walk);
+
+	folio = page_folio(page);
+	folio_get(folio);
+
+	if (!folio || folio_is_zone_device(folio)) {
+		folio_put(folio);
+		return 0;
+	}
+
+	srcnid = folio_nid(folio);
+
+	if (!folio_test_idle(folio) || folio_test_young(folio) ||
+	    mmu_notifier_test_young(mm, addr) ||
+	    folio_test_referenced(folio) || pte_young(pteval)) {
+
+		/* Do not try to promote pages from regular nodes */
+		if (!kmmscand_eligible_srcnid(srcnid))
+			goto end;
+
+		info = kzalloc(sizeof(struct kmmscand_migrate_info), GFP_KERNEL);
+		if (info && migrate_list) {
+			info->mm = mm;
+			info->vma = vma;
+			info->folio = folio;
+			info->address = addr;
+
+			spin_lock(&kmmscand_migrate_lock);
+			list_add_tail(&info->migrate_node, &migrate_list->migrate_head);
+			spin_unlock(&kmmscand_migrate_lock);
+		}
+	}
+end:
+	folio_set_idle(folio);
+	folio_put(folio);
+	return 0;
+}
+
+static const struct mm_walk_ops hot_vma_set_idle_ops = {
+	.pte_entry = hot_vma_idle_pte_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
+static void kmmscand_walk_page_vma(struct vm_area_struct *vma)
+{
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+	    is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		return;
+
+	if (!vma->vm_mm ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		return;
+
+	if (!vma_is_accessible(vma))
+		return;
+
+	walk_page_vma(vma, &hot_vma_set_idle_ops, &kmmscand_migrate_list);
 }
 
 static inline int kmmscand_test_exit(struct mm_struct *mm)
@@ -93,10 +249,113 @@ static inline int kmmscand_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
+static void kmmscand_collect_mm_slot(struct kmmscand_mm_slot *mm_slot)
+{
+	struct mm_slot *slot = &mm_slot->slot;
+	struct mm_struct *mm = slot->mm;
+
+	lockdep_assert_held(&kmmscand_mm_lock);
+
+	if (kmmscand_test_exit(mm)) {
+		/* free mm_slot */
+		hash_del(&slot->hash);
+		list_del(&slot->mm_node);
+
+		mm_slot_free(kmmscand_slot_cache, mm_slot);
+		mmdrop(mm);
+	}
+}
+
+static void kmmscand_migrate_folio(void)
+{
+}
+
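+/*
+ * Scan one mm from the kmmscand list: walk its VMAs starting from the
+ * address saved in the mm_slot, queue hot slow-tier folios for migration,
+ * and record how far the scan got so the next invocation can resume there.
+ * Advance to the next mm once this mm is fully scanned or is exiting.
+ */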
 static unsigned long kmmscand_scan_mm_slot(void)
 {
-	/* placeholder for scanning */
-	msleep(100);
+	bool update_mmslot_info = false;
+
+	unsigned long address;
+
+	struct mm_slot *slot;
+	struct mm_struct *mm;
+	struct vma_iterator vmi;
+	struct vm_area_struct *vma = NULL;
+	struct kmmscand_mm_slot *mm_slot;
+
+	/* Retrieve mm */
+	spin_lock(&kmmscand_mm_lock);
+
+	if (kmmscand_scan.mm_slot) {
+		mm_slot = kmmscand_scan.mm_slot;
+		slot = &mm_slot->slot;
+		address = mm_slot->address;
+	} else {
+		slot = list_entry(kmmscand_scan.mm_head.next,
+				  struct mm_slot, mm_node);
+		mm_slot = mm_slot_entry(slot, struct kmmscand_mm_slot, slot);
+		address = mm_slot->address;
+		kmmscand_scan.mm_slot = mm_slot;
+	}
+
+	mm = slot->mm;
+
+	spin_unlock(&kmmscand_mm_lock);
+
+	if (unlikely(!mmap_read_trylock(mm)))
+		goto outerloop_mmap_lock;
+
+	if (unlikely(kmmscand_test_exit(mm)))
+		goto outerloop;
+
+	vma_iter_init(&vmi, mm, address);
+
+	for_each_vma(vmi, vma) {
+		/* Count the scanned pages here to decide exit */
+		kmmscand_walk_page_vma(vma);
+
+		address = vma->vm_end;
+	}
+
+	if (!vma)
+		address = 0;
+
+	update_mmslot_info = true;
+
+outerloop:
+	/* exit_mmap will destroy ptes after this */
+	mmap_read_unlock(mm);
+
+outerloop_mmap_lock:
+	spin_lock(&kmmscand_mm_lock);
+	VM_BUG_ON(kmmscand_scan.mm_slot != mm_slot);
+
+	if (update_mmslot_info)
+		mm_slot->address = address;
+
+	/*
+	 * Release the current mm_slot if this mm is about to die, or
+	 * if we scanned all vmas of this mm.
+	 */
+	if (unlikely(kmmscand_test_exit(mm)) || !vma) {
+		/*
+		 * Make sure that if mm_users is reaching zero while
+		 * kmmscand runs here, kmmscand_exit will find
+		 * mm_slot not pointing to the exiting mm.
		 */
+		if (slot->mm_node.next != &kmmscand_scan.mm_head) {
+			slot = list_entry(slot->mm_node.next,
+					  struct mm_slot, mm_node);
+			kmmscand_scan.mm_slot =
+				mm_slot_entry(slot, struct kmmscand_mm_slot, slot);
+		} else {
+			kmmscand_scan.mm_slot = NULL;
+		}
+
+		if (kmmscand_test_exit(mm))
+			kmmscand_collect_mm_slot(mm_slot);
+	}
+
+	spin_unlock(&kmmscand_mm_lock);
 	return 0;
 }
 
@@ -159,6 +418,7 @@ void __kmmscand_enter(struct mm_struct *mm)
 	if (!kmmscand_slot)
 		return;
 
+	kmmscand_slot->address = 0;
 	slot = &kmmscand_slot->slot;
 
 	spin_lock(&kmmscand_mm_lock);
-- 
2.39.3