On Android mobile devices I observe that KSM is unlikely to merge new
pages from an area that has already been scanned twice, so it is a
waste of power to keep scanning such areas at a high frequency.

This patch introduces a defer mechanism, borrowed from page
compaction, into KSM.  The defer mechanism automatically lowers the
scan frequency in the case described above.

Signed-off-by: Wenwei Tao <wenweitaowenwei@xxxxxxxxx>
---
Two small user-space sketches illustrating the defer backoff and the
active-slot scheduling follow the diff.

 mm/ksm.c | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 203 insertions(+), 27 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce..54ffcb2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -104,6 +104,7 @@ struct mm_slot {
 	struct list_head mm_list;
 	struct rmap_item *rmap_list;
 	struct mm_struct *mm;
+	unsigned long seqnr;
 };
 
 /**
@@ -117,9 +118,12 @@ struct mm_slot {
  */
 struct ksm_scan {
 	struct mm_slot *mm_slot;
+	struct mm_slot *active_slot;
 	unsigned long address;
 	struct rmap_item **rmap_list;
 	unsigned long seqnr;
+	unsigned long ksm_considered;
+	unsigned int ksm_defer_shift;
 };
 
 /**
@@ -182,6 +186,11 @@ struct rmap_item {
 #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
 #define STABLE_FLAG	0x200	/* is listed from the stable tree */
 
+#define ACTIVE_SLOT_FLAG 0x100
+#define ACTIVE_SLOT_SEQNR 0x200
+#define KSM_MAX_DEFER_SHIFT 6
+
+
 /* The stable and unstable tree heads */
 static struct rb_root one_stable_tree[1] = { RB_ROOT };
 static struct rb_root one_unstable_tree[1] = { RB_ROOT };
@@ -197,14 +206,22 @@ static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 static struct mm_slot ksm_mm_head = {
 	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
 };
+
+static struct mm_slot ksm_mm_active = {
+	.mm_list = LIST_HEAD_INIT(ksm_mm_active.mm_list),
+};
+
 static struct ksm_scan ksm_scan = {
 	.mm_slot = &ksm_mm_head,
+	.active_slot = &ksm_mm_active,
 };
 
 static struct kmem_cache *rmap_item_cache;
 static struct kmem_cache *stable_node_cache;
 static struct kmem_cache *mm_slot_cache;
 
+static bool ksm_merged_or_unstable;
+
 /* The number of nodes in the stable tree */
 static unsigned long ksm_pages_shared;
 
@@ -336,6 +353,23 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
 	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
 }
 
+static void move_to_active_list(struct mm_slot *mm_slot)
+{
+	if (mm_slot && !(mm_slot->seqnr & ACTIVE_SLOT_FLAG)) {
+		if (ksm_run & KSM_RUN_UNMERGE && mm_slot->rmap_list)
+			return;
+		if (mm_slot == ksm_scan.mm_slot) {
+			if (ksm_scan.active_slot == &ksm_mm_active)
+				return;
+			ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+					struct mm_slot, mm_list);
+		}
+		list_move_tail(&mm_slot->mm_list,
+				&ksm_scan.active_slot->mm_list);
+		mm_slot->seqnr |= (ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR);
+	}
+}
+
 /*
  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
  * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -772,6 +806,15 @@ static int unmerge_and_remove_all_rmap_items(void)
 	int err = 0;
 
 	spin_lock(&ksm_mmlist_lock);
+	mm_slot = list_entry(ksm_mm_active.mm_list.next,
+			struct mm_slot, mm_list);
+	while (mm_slot != &ksm_mm_active) {
+		list_move_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+		mm_slot->seqnr &= ~(ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR);
+		mm_slot = list_entry(ksm_mm_active.mm_list.next,
+				struct mm_slot, mm_list);
+	}
+	ksm_scan.active_slot = &ksm_mm_active;
 	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
 						struct mm_slot, mm_list);
 	spin_unlock(&ksm_mmlist_lock);
@@ -790,8 +833,8 @@ static int unmerge_and_remove_all_rmap_items(void)
 			if (err)
 				goto error;
 		}
-
 		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+		mm_slot->seqnr = 0;
 
 		spin_lock(&ksm_mmlist_lock);
 		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -806,6 +849,7 @@ static int unmerge_and_remove_all_rmap_items(void)
 			up_read(&mm->mmap_sem);
 			mmdrop(mm);
 		} else {
+			move_to_active_list(mm_slot);
 			spin_unlock(&ksm_mmlist_lock);
 			up_read(&mm->mmap_sem);
 		}
@@ -1401,6 +1445,9 @@ static void stable_tree_append(struct rmap_item *rmap_item,
 		ksm_pages_sharing++;
 	else
 		ksm_pages_shared++;
+
+	if (!ksm_merged_or_unstable)
+		ksm_merged_or_unstable = true;
 }
 
 /*
@@ -1468,6 +1515,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	checksum = calc_checksum(page);
 	if (rmap_item->oldchecksum != checksum) {
 		rmap_item->oldchecksum = checksum;
+		if ((rmap_item->address & UNSTABLE_FLAG) &&
+				!ksm_merged_or_unstable)
+			ksm_merged_or_unstable = true;
 		return;
 	}
 
@@ -1504,6 +1554,31 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	}
 }
 
+static bool skip_active_slot_vma(struct rmap_item ***rmap_list,
+		struct vm_area_struct *vma, struct mm_slot *mm_slot)
+{
+	unsigned char age;
+	struct rmap_item *rmap_item = **rmap_list;
+
+	age = (unsigned char)(ksm_scan.seqnr - mm_slot->seqnr);
+	if (age > 0)
+		return false;
+	if (!(vma->vm_flags & VM_HUGETLB)) {
+		while (rmap_item && rmap_item->address < vma->vm_end) {
+			if (rmap_item->address < vma->vm_start) {
+				**rmap_list = rmap_item->rmap_list;
+				remove_rmap_item_from_tree(rmap_item);
+				free_rmap_item(rmap_item);
+				rmap_item = **rmap_list;
+			} else {
+				*rmap_list = &rmap_item->rmap_list;
+				rmap_item = rmap_item->rmap_list;
+			}
+		}
+		return true;
+	} else
+		return false;
+}
 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
 					    struct rmap_item **rmap_list,
 					    unsigned long addr)
@@ -1535,15 +1610,18 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
 static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
-	struct mm_slot *slot;
+	struct mm_slot *slot, *next_slot;
 	struct vm_area_struct *vma;
 	struct rmap_item *rmap_item;
 	int nid;
 
-	if (list_empty(&ksm_mm_head.mm_list))
+	if (list_empty(&ksm_mm_head.mm_list) &&
+			list_empty(&ksm_mm_active.mm_list))
 		return NULL;
-
-	slot = ksm_scan.mm_slot;
+	if (ksm_scan.active_slot != &ksm_mm_active)
+		slot = ksm_scan.active_slot;
+	else
+		slot = ksm_scan.mm_slot;
 	if (slot == &ksm_mm_head) {
 		/*
 		 * A number of pages can hang around indefinitely on per-cpu
@@ -1582,8 +1660,16 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 			root_unstable_tree[nid] = RB_ROOT;
 
 		spin_lock(&ksm_mmlist_lock);
-		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
-		ksm_scan.mm_slot = slot;
+		if (unlikely(ksm_scan.seqnr == 0 &&
+				!list_empty(&ksm_mm_active.mm_list))) {
+			slot = list_entry(ksm_mm_active.mm_list.next,
+					struct mm_slot, mm_list);
+			ksm_scan.active_slot = slot;
+		} else {
+			slot = list_entry(slot->mm_list.next,
+					struct mm_slot, mm_list);
+			ksm_scan.mm_slot = slot;
+		}
 		spin_unlock(&ksm_mmlist_lock);
 		/*
 		 * Although we tested list_empty() above, a racing __ksm_exit
@@ -1608,7 +1694,8 @@ next_mm:
 			continue;
 		if (ksm_scan.address < vma->vm_start)
 			ksm_scan.address = vma->vm_start;
-		if (!vma->anon_vma)
+		if (!vma->anon_vma || (ksm_scan.active_slot == slot &&
+				skip_active_slot_vma(&ksm_scan.rmap_list, vma, slot)))
 			ksm_scan.address = vma->vm_end;
 
 		while (ksm_scan.address < vma->vm_end) {
@@ -1639,6 +1726,9 @@ next_mm:
 			ksm_scan.address += PAGE_SIZE;
 			cond_resched();
 		}
+		if ((slot->seqnr & (ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR)) ==
+				ACTIVE_SLOT_FLAG && vma->vm_flags & VM_HUGETLB)
+			vma->vm_flags &= ~VM_HUGETLB;
 	}
 
 	if (ksm_test_exit(mm)) {
@@ -1652,8 +1742,25 @@ next_mm:
 	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
 
 	spin_lock(&ksm_mmlist_lock);
-	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
-						struct mm_slot, mm_list);
+	slot->seqnr &= ~SEQNR_MASK;
+	slot->seqnr |= (ksm_scan.seqnr & SEQNR_MASK);
+	next_slot = list_entry(slot->mm_list.next,
+			struct mm_slot, mm_list);
+	if (slot == ksm_scan.active_slot) {
+		if (slot->seqnr & ACTIVE_SLOT_SEQNR)
+			slot->seqnr &= ~ACTIVE_SLOT_SEQNR;
+		else {
+			slot->seqnr &= ~ACTIVE_SLOT_FLAG;
+			list_move_tail(&slot->mm_list,
+					&ksm_scan.mm_slot->mm_list);
+		}
+		ksm_scan.active_slot = next_slot;
+	} else
+		ksm_scan.mm_slot = next_slot;
+
+	if (ksm_scan.active_slot == &ksm_mm_active)
+		ksm_scan.active_slot = list_entry(ksm_mm_active.mm_list.next,
+				struct mm_slot, mm_list);
 	if (ksm_scan.address == 0) {
 		/*
 		 * We've completed a full scan of all vmas, holding mmap_sem
@@ -1664,6 +1771,9 @@ next_mm:
 		 * or when all VM_MERGEABLE areas have been unmapped (and
 		 * mmap_sem then protects against race with MADV_MERGEABLE).
 		 */
+		if (ksm_scan.active_slot == slot)
+			ksm_scan.active_slot = list_entry(slot->mm_list.next,
+					struct mm_slot, mm_list);
 		hash_del(&slot->link);
 		list_del(&slot->mm_list);
 		spin_unlock(&ksm_mmlist_lock);
@@ -1678,10 +1788,13 @@ next_mm:
 	}
 
 	/* Repeat until we've completed scanning the whole list */
-	slot = ksm_scan.mm_slot;
-	if (slot != &ksm_mm_head)
+	if (ksm_scan.active_slot != &ksm_mm_active) {
+		slot = ksm_scan.active_slot;
 		goto next_mm;
-
+	} else if (ksm_scan.mm_slot != &ksm_mm_head) {
+		slot = ksm_scan.mm_slot;
+		goto next_mm;
+	}
 	ksm_scan.seqnr++;
 	return NULL;
 }
@@ -1705,9 +1818,40 @@ static void ksm_do_scan(unsigned int scan_npages)
 	}
 }
 
+/*This is copyed from page compaction*/
+
+static inline void defer_ksm(void)
+{
+	ksm_scan.ksm_considered = 0;
+	if (++ksm_scan.ksm_defer_shift > KSM_MAX_DEFER_SHIFT)
+		ksm_scan.ksm_defer_shift = KSM_MAX_DEFER_SHIFT;
+
+}
+
+static inline bool ksm_defered(void)
+{
+	unsigned long defer_limit = 1UL << ksm_scan.ksm_defer_shift;
+
+	if (++ksm_scan.ksm_considered > defer_limit)
+		ksm_scan.ksm_considered = defer_limit;
+	return ksm_scan.ksm_considered < defer_limit &&
+		list_empty(&ksm_mm_active.mm_list);
+}
+
+static inline void reset_ksm_defer(void)
+{
+	if (ksm_scan.ksm_defer_shift != 0) {
+		ksm_scan.ksm_considered = 0;
+		ksm_scan.ksm_defer_shift = 0;
+	}
+}
+
+
 static int ksmd_should_run(void)
 {
-	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+	return (ksm_run & KSM_RUN_MERGE) &&
+		!(list_empty(&ksm_mm_head.mm_list) &&
+		list_empty(&ksm_mm_active.mm_list));
 }
 
 static int ksm_scan_thread(void *nothing)
@@ -1718,8 +1862,14 @@ static int ksm_scan_thread(void *nothing)
 	while (!kthread_should_stop()) {
 		mutex_lock(&ksm_thread_mutex);
 		wait_while_offlining();
-		if (ksmd_should_run())
+		if (ksmd_should_run() && !ksm_defered()) {
+			ksm_merged_or_unstable = false;
 			ksm_do_scan(ksm_thread_pages_to_scan);
+			if (ksm_merged_or_unstable)
+				reset_ksm_defer();
+			else
+				defer_ksm();
+		}
 		mutex_unlock(&ksm_thread_mutex);
 
 		try_to_freeze();
@@ -1739,6 +1889,8 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long vma_length =
+		(vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 	int err;
 
 	switch (advice) {
@@ -1761,8 +1913,19 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 			if (err)
 				return err;
 		}
-
-		*vm_flags |= VM_MERGEABLE;
+		/*
+		 * Since hugetlb vma is not supported by ksm
+		 * use VM_HUGETLB to indicate new mergeable vma
+		 */
+		*vm_flags |= VM_MERGEABLE | VM_HUGETLB;
+		if (vma_length > ksm_thread_pages_to_scan) {
+			struct mm_slot *mm_slot;
+
+			spin_lock(&ksm_mmlist_lock);
+			mm_slot = get_mm_slot(mm);
+			move_to_active_list(mm_slot);
+			spin_unlock(&ksm_mmlist_lock);
+		}
 		break;
 
 	case MADV_UNMERGEABLE:
@@ -1775,7 +1938,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 			return err;
 		}
 
-		*vm_flags &= ~VM_MERGEABLE;
+		*vm_flags &= ~(VM_MERGEABLE | VM_HUGETLB);
 		break;
 	}
 
@@ -1792,7 +1955,8 @@ int __ksm_enter(struct mm_struct *mm)
 		return -ENOMEM;
 
 	/* Check ksm_run too? Would need tighter locking */
-	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+	needs_wakeup = list_empty(&ksm_mm_head.mm_list) &&
+		list_empty(&ksm_mm_active.mm_list);
 
 	spin_lock(&ksm_mmlist_lock);
 	insert_to_mm_slots_hash(mm, mm_slot);
@@ -1806,10 +1970,8 @@ int __ksm_enter(struct mm_struct *mm)
 	 * scanning cursor, otherwise KSM pages in newly forked mms will be
 	 * missed: then we might as well insert at the end of the list.
 	 */
-	if (ksm_run & KSM_RUN_UNMERGE)
-		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
-	else
-		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+	list_add_tail(&mm_slot->mm_list, &ksm_scan.active_slot->mm_list);
+	mm_slot->seqnr |= (ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR);
 	spin_unlock(&ksm_mmlist_lock);
 
 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1823,7 +1985,7 @@ int __ksm_enter(struct mm_struct *mm)
 
 void __ksm_exit(struct mm_struct *mm)
 {
-	struct mm_slot *mm_slot;
+	struct mm_slot *mm_slot, *current_slot;
 	int easy_to_free = 0;
 
 	/*
@@ -1837,14 +1999,28 @@ void __ksm_exit(struct mm_struct *mm)
 
 	spin_lock(&ksm_mmlist_lock);
 	mm_slot = get_mm_slot(mm);
-	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+	if (ksm_scan.active_slot != &ksm_mm_active)
+		current_slot = ksm_scan.active_slot;
+	else
+		current_slot = ksm_scan.mm_slot;
+	if (mm_slot && mm_slot != current_slot) {
 		if (!mm_slot->rmap_list) {
 			hash_del(&mm_slot->link);
 			list_del(&mm_slot->mm_list);
 			easy_to_free = 1;
 		} else {
-			list_move(&mm_slot->mm_list,
-				  &ksm_scan.mm_slot->mm_list);
+			if (mm_slot == ksm_scan.mm_slot)
+				ksm_scan.mm_slot =
+					list_entry(mm_slot->mm_list.next,
+						struct mm_slot, mm_list);
+			if (ksm_run & KSM_RUN_UNMERGE)
+				list_move(&mm_slot->mm_list,
+					&ksm_scan.mm_slot->mm_list);
+			else {
+				list_move(&mm_slot->mm_list,
+					&ksm_scan.active_slot->mm_list);
+				mm_slot->seqnr |= ACTIVE_SLOT_FLAG;
+			}
 		}
 	}
 	spin_unlock(&ksm_mmlist_lock);
-- 
1.7.9.5
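
For readers who want to see the backoff on its own, below is a minimal
user-space sketch of the defer heuristic this patch borrows from page
compaction (defer_compaction()/compaction_deferred() in mm/compaction.c).
The names, the wakeup loop and the simulated "nothing merged" outcome are
illustrative stand-ins, not the patch's code; the in-kernel version also
bypasses the defer check whenever the active list is non-empty.

/*
 * Standalone sketch of the defer backoff: each wakeup either skips or
 * performs a scan; a fruitless scan doubles the skip window, a fruitful
 * one resets it.  Illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

#define KSM_MAX_DEFER_SHIFT 6

static unsigned long considered;	/* wakeups seen since the last scan */
static unsigned int defer_shift;	/* current backoff exponent */

/* Called after a scan pass that merged nothing and changed nothing. */
static void defer_scan(void)
{
	considered = 0;
	if (++defer_shift > KSM_MAX_DEFER_SHIFT)
		defer_shift = KSM_MAX_DEFER_SHIFT;
}

/* Called after a scan pass that made progress. */
static void reset_defer(void)
{
	considered = 0;
	defer_shift = 0;
}

/* Should this wakeup skip scanning?  True until 2^defer_shift wakeups pass. */
static bool scan_deferred(void)
{
	unsigned long limit = 1UL << defer_shift;

	if (++considered > limit)
		considered = limit;
	return considered < limit;
}

int main(void)
{
	/* Simulate 40 ksmd wakeups during which nothing new is ever merged. */
	int scans = 0;

	for (int wakeup = 1; wakeup <= 40; wakeup++) {
		if (scan_deferred())
			continue;
		scans++;		/* a real scan pass would run here */
		defer_scan();		/* it found nothing, so back off */
		printf("wakeup %2d: scanned (defer_shift now %u)\n",
		       wakeup, defer_shift);
	}
	printf("%d scans out of 40 wakeups\n", scans);
	(void)reset_defer;	/* would be called on a fruitful scan */
	return 0;
}

Run as an ordinary program, it scans on wakeups 1, 3, 7, 15 and 31 and
stays idle for the rest: the gap between scans keeps doubling, up to
2^KSM_MAX_DEFER_SHIFT wakeups, for as long as scanning finds nothing to
merge.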
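
The other half of the patch is the active-slot scheduling: an mm whose VMA
has just been marked MADV_MERGEABLE (and is larger than one scan batch) is
promoted to a separate "active" list that ksmd walks ahead of the regular
list, and it is demoted back once it has been fully scanned twice.  The
sketch below models only that promote/scan-twice/demote rule with
simplified flags and names; it is not the kernel's mm_slot machinery.

/* Illustrative model of the two-pass active-slot rule. */
#include <stdio.h>

#define ACTIVE		0x1	/* slot currently lives on the active list */
#define ACTIVE_NEW	0x2	/* first active pass not yet completed */

struct slot {
	const char *name;
	unsigned int flags;
};

/* Called once per completed full scan of a slot. */
static void scan_slot(struct slot *s)
{
	printf("scanning %s%s\n", s->name,
	       (s->flags & ACTIVE) ? " (active)" : "");
	if (s->flags & ACTIVE_NEW)
		s->flags &= ~ACTIVE_NEW;	/* first full pass done */
	else if (s->flags & ACTIVE)
		s->flags &= ~ACTIVE;		/* second pass done: demote */
}

int main(void)
{
	struct slot newly_merged = { "newly-mergeable mm", ACTIVE | ACTIVE_NEW };

	for (int pass = 1; pass <= 3; pass++) {
		printf("pass %d: ", pass);
		scan_slot(&newly_merged);
	}
	return 0;
}

In the patch itself the same rule is carried in the ACTIVE_SLOT_FLAG and
ACTIVE_SLOT_SEQNR bits of mm_slot->seqnr: the first completed pass clears
ACTIVE_SLOT_SEQNR, the second clears ACTIVE_SLOT_FLAG and moves the slot
back to the regular list.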