Your suggestions below are all valuable, and I will adjust the code
accordingly. Since the v6 patch series has already been merged into the
-next branch, I think sending follow-up patches is better, to reduce
the maintainers' workload.

>> Cc: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
>> Cc: Xuexin Jiang <jiang.xuexin@xxxxxxxxxx>
>> Reviewed-by: Xiaokai Ran <ran.xiaokai@xxxxxxxxxx>
>> Reviewed-by: Yang Yang <yang.yang29@xxxxxxxxxx>
>> ---
>>  mm/ksm.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
>>  1 file changed, 111 insertions(+), 30 deletions(-)
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 905a79d213da..ab04b44679c8 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -214,6 +214,7 @@ struct ksm_rmap_item {
>>  #define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
>>  #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
>>  #define STABLE_FLAG	0x200	/* is listed from the stable tree */
>> +#define ZERO_PAGE_FLAG	0x400	/* is zero page placed by KSM */
>>
>>  /* The stable and unstable tree heads */
>>  static struct rb_root one_stable_tree[1] = { RB_ROOT };
>
>> @@ -420,6 +421,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
>>  	return atomic_read(&mm->mm_users) == 0;
>>  }
>>
>> +enum break_ksm_pmd_entry_return_flag {
>
> what about 0 ? I think it would look cleaner if every possible value
> was explicitly listed here

You're right. I'll list the 0 case explicitly in a new patch.

>> +	HAVE_KSM_PAGE = 1,
>> +	HAVE_ZERO_PAGE
>> +};
>> +
>>  static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
>>  			struct mm_walk *walk)
>>  {
>> @@ -427,6 +433,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>>  	spinlock_t *ptl;
>>  	pte_t *pte;
>>  	int ret;
>> +	bool is_zero_page = false;
>
> this ^ should probably be moved further up in the list of variables
> (i.e. reverse christmas tree)

Good point. I'll fix it in a new patch.

>>
>>  	if (pmd_leaf(*pmd) || !pmd_present(*pmd))
>>  		return 0;
>> @@ -434,6 +441,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>>  	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
>>  	if (pte_present(*pte)) {
>>  		page = vm_normal_page(walk->vma, addr, *pte);
>> +		if (!page)
>> +			is_zero_page = is_zero_pfn(pte_pfn(*pte));
>>  	} else if (!pte_none(*pte)) {
>>  		swp_entry_t entry = pte_to_swp_entry(*pte);
>>
>> @@ -444,7 +453,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>>  		if (is_migration_entry(entry))
>>  			page = pfn_swap_entry_to_page(entry);
>>  	}
>> -	ret = page && PageKsm(page);
>> +
>> +	if (page && PageKsm(page))
>> +		ret = HAVE_KSM_PAGE;
>> +	else if (is_zero_page)
>> +		ret = HAVE_ZERO_PAGE;
>> +	else
>> +		ret = 0;
>
> and here it would be a great place to use the enum instead of
> hardcoding 0

Good point. I'll fix it in a new patch.

>> +
>>  	pte_unmap_unlock(pte, ptl);
>>  	return ret;
>>  }
>> @@ -466,19 +482,22 @@ static const struct mm_walk_ops break_ksm_ops = {
>>   * of the process that owns 'vma'. We also do not want to enforce
>>   * protection keys here anyway.
>
> please add a (short) explanation of when and why the new
> unshare_zero_page flag should be used.
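Will do in a new patch. As a first draft (wording certainly open to
improvement), I would document it roughly like this above break_ksm():

/*
 * @unshare_zero_page: if true, an empty zero page placed by KSM at
 * @addr is also broken out via an unsharing fault; if false, only
 * real KSM pages trigger the fault and KSM-placed zero pages are
 * left in place. Only teardown paths (e.g. freeing an rmap_item)
 * should pass true, since the zero page's rmap_item is about to go
 * away there.
 */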
>
>>   */
>> -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>> +static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
>> +		bool unshare_zero_page)
>>  {
>>  	vm_fault_t ret = 0;
>>
>>  	do {
>> -		int ksm_page;
>> +		int walk_result;
>>
>>  		cond_resched();
>> -		ksm_page = walk_page_range_vma(vma, addr, addr + 1,
>> +		walk_result = walk_page_range_vma(vma, addr, addr + 1,
>>  			&break_ksm_ops, NULL);
>> -		if (WARN_ON_ONCE(ksm_page < 0))
>> -			return ksm_page;
>> -		if (!ksm_page)
>
> if (walk_result == ...)

Fine, I'll change it in the new patch.

>> +		if (WARN_ON_ONCE(walk_result < 0))
>> +			return walk_result;
>> +		if (!walk_result)
>> +			return 0;
>> +		if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page)
>>  			return 0;
>>  		ret = handle_mm_fault(vma, addr,
>>  			FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
>> @@ -539,7 +558,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
>>  	mmap_read_lock(mm);
>>  	vma = find_mergeable_vma(mm, addr);
>>  	if (vma)
>> -		break_ksm(vma, addr);
>> +		break_ksm(vma, addr, false);
>>  	mmap_read_unlock(mm);
>>  }
>>
>> @@ -764,6 +783,30 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
>>  	return NULL;
>>  }
>>
>> +/*
>> + * Cleaning the rmap_item's ZERO_PAGE_FLAG
>
> this is not what you are doing, though. in case new flags are added,
> this function will cause problems.
>
>> + * This function will be called when unshare or writing on zero pages.
>> + */
>> +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item)
>> +{
>> +	if (rmap_item->address & ZERO_PAGE_FLAG)
>> +		rmap_item->address &= PAGE_MASK;
>
>  ... &= ~ZERO_PAGE_FLAG;
>
> this way you only clear the flag, and you won't interfere with
> potential new flags that might be introduced in the future. (I
> really hope we won't need new flags in the future because the code is
> already complex enough as it is, but you never know)

How about 'rmap_item->address &= (PAGE_MASK | ~ZERO_PAGE_FLAG);'?
Since the only bit that is clear in both PAGE_MASK and ~ZERO_PAGE_FLAG
is the ZERO_PAGE_FLAG bit itself, the OR evaluates to exactly
~ZERO_PAGE_FLAG, so either spelling clears only that flag (a small
check program follows further below in this mail).

>
>> +}
>> +
>> +/* Only called when rmap_item is going to be freed */
>> +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item)
>> +{
>> +	struct vm_area_struct *vma;
>> +
>> +	if (rmap_item->address & ZERO_PAGE_FLAG) {
>> +		vma = vma_lookup(rmap_item->mm, rmap_item->address);
>> +		if (vma && !ksm_test_exit(rmap_item->mm))
>> +			break_ksm(vma, rmap_item->address, true);
>> +	}
>> +	/* Put at last. */
>> +	clean_rmap_item_zero_flag(rmap_item);
>> +}
>> +
>>  /*
>>   * Removing rmap_item from stable or unstable tree.
>>   * This function will clean the information from the stable/unstable tree.
>> @@ -824,6 +867,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
>>  		struct ksm_rmap_item *rmap_item = *rmap_list;
>>  		*rmap_list = rmap_item->rmap_list;
>>  		remove_rmap_item_from_tree(rmap_item);
>> +		unshare_zero_pages(rmap_item);
>>  		free_rmap_item(rmap_item);
>>  	}
>>  }
>> @@ -853,7 +897,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
>>  		if (signal_pending(current))
>>  			err = -ERESTARTSYS;
>>  		else
>> -			err = break_ksm(vma, addr);
>> +			err = break_ksm(vma, addr, false);
>>  	}
>>  	return err;
>>  }
>> @@ -2044,6 +2088,39 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
>>  	rmap_item->mm->ksm_merging_pages++;
>>  }
>>
>> +static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item,
>> +		struct page *page)
>
> this line is less than 100 columns, please don't break it ^

Fine, I'll fix it in a new patch.
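Coming back to the clean_rmap_item_zero_flag() mask question above,
here is a quick standalone userspace check (a hypothetical test
program, not kernel code; it assumes 4K pages and copies the flag
value from this patch):

#include <assert.h>
#include <stdio.h>

#define PAGE_MASK	(~0xFFFUL)	/* assumes 4K pages */
#define ZERO_PAGE_FLAG	0x400UL		/* bit 10, as in this patch */

int main(void)
{
	/* (PAGE_MASK | ~ZERO_PAGE_FLAG) sets every bit except bit 10 */
	assert((PAGE_MASK | ~ZERO_PAGE_FLAG) == ~ZERO_PAGE_FLAG);
	printf("both forms clear only ZERO_PAGE_FLAG\n");
	return 0;
}

So functionally the two spellings are identical; it's only a question
of which one documents the intent better.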
>> +{
>> +	struct mm_struct *mm = rmap_item->mm;
>> +	int err = 0;
>> +
>> +	/*
>> +	 * It should not take ZERO_PAGE_FLAG because on one hand,
>> +	 * get_next_rmap_item don't return zero pages' rmap_item.
>> +	 * On the other hand, even if zero page was writen as
>> +	 * anonymous page, rmap_item has been cleaned after
>> +	 * stable_tree_search
>> +	 */
>> +	if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) {
>> +		struct vm_area_struct *vma;
>> +
>> +		mmap_read_lock(mm);
>> +		vma = find_mergeable_vma(mm, rmap_item->address);
>> +		if (vma) {
>> +			err = try_to_merge_one_page(vma, page,
>> +					ZERO_PAGE(rmap_item->address));
>
> this line is also less than 100 columns, please don't break it ^
>
>> +			if (!err)
>> +				rmap_item->address |= ZERO_PAGE_FLAG;
>> +		} else {
>> +			/* If the vma is out of date, we do not need to continue. */
>> +			err = 0;
>> +		}
>> +		mmap_read_unlock(mm);
>> +	}
>> +
>> +	return err;
>> +}
>> +
>>  /*
>>   * cmp_and_merge_page - first see if page can be merged into the stable tree;
>>   * if not, compare checksum to previous and if it's the same, see if page can
>> @@ -2055,7 +2132,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
>>   */
>>  static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
>>  {
>> -	struct mm_struct *mm = rmap_item->mm;
>>  	struct ksm_rmap_item *tree_rmap_item;
>>  	struct page *tree_page = NULL;
>>  	struct ksm_stable_node *stable_node;
>> @@ -2092,6 +2168,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite
>>  	}
>>
>>  	remove_rmap_item_from_tree(rmap_item);
>> +	clean_rmap_item_zero_flag(rmap_item);
>>
>>  	if (kpage) {
>>  		if (PTR_ERR(kpage) == -EBUSY)
>> @@ -2128,29 +2205,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite
>>  	 * Same checksum as an empty page. We attempt to merge it with the
>>  	 * appropriate zero page if the user enabled this via sysfs.
>>  	 */
>> -	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
>
> (like here, see comment below)
>
>> -		struct vm_area_struct *vma;
>> -
>> -		mmap_read_lock(mm);
>> -		vma = find_mergeable_vma(mm, rmap_item->address);
>> -		if (vma) {
>> -			err = try_to_merge_one_page(vma, page,
>> -					ZERO_PAGE(rmap_item->address));
>> -		} else {
>> +	if (ksm_use_zero_pages) {
>> +		if (checksum == zero_checksum)
>
> if I see correctly, you end up with three ifs nested? why not just one
> if with all the conditions?

Yes, I thought three separate 'if's were structurally clearer. But
let's not change this here for now: I am going to submit a patch that
replaces ksm_use_zero_pages with a static_key, and that will
restructure this code anyway (rough sketch at the end of this mail).

>>  			/*
>> -			 * If the vma is out of date, we do not need to
>> -			 * continue.
>> +			 * In case of failure, the page was not really empty, so we
>> +			 * need to continue. Otherwise we're done.
>>  			 */
>> -			err = 0;
>> -		}
>> -		mmap_read_unlock(mm);
>> -		/*
>> -		 * In case of failure, the page was not really empty, so we
>> -		 * need to continue. Otherwise we're done.
>> -		 */
>> -		if (!err)
>> -			return;
>> +			if (!try_to_merge_with_kernel_zero_page(rmap_item, page))
>> +				return;
>>  	}
>> +
>>  	tree_rmap_item =
>>  		unstable_tree_search_insert(rmap_item, page, &tree_page);
>>  	if (tree_rmap_item) {
>> @@ -2230,6 +2294,7 @@ static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr,
>>  	 * is ineligible or discarded, e.g. MADV_DONTNEED.
>>  	 */
>>  	remove_rmap_item_from_tree(rmap_item);
>> +	unshare_zero_pages(rmap_item);
>>  	free_rmap_item(rmap_item);
>>  }
>>
>> @@ -2352,6 +2417,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>>  	}
>>  	if (is_zone_device_page(*page))
>>  		goto next_page;
>> +	if (is_zero_pfn(page_to_pfn(*page))) {
>> +		/*
>> +		 * To monitor ksm zero pages which becomes non-anonymous,
>> +		 * we have to save each rmap_item of zero pages by
>> +		 * try_to_get_old_rmap_item() walking on
>> +		 * ksm_scan.rmap_list, otherwise their rmap_items will be
>> +		 * freed by the next turn of get_next_rmap_item(). The
>> +		 * function get_next_rmap_item() will free all "skipped"
>> +		 * rmap_items because it thinks its areas as UNMERGEABLE.
>> +		 */
>> +		rmap_item = try_to_get_old_rmap_item(ksm_scan.address,
>> +				ksm_scan.rmap_list);
>> +		if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG))
>> +			ksm_scan.rmap_list = &rmap_item->rmap_list;
>> +		goto next_page;
>> +	}
>>  	if (PageAnon(*page)) {
>>  		flush_anon_page(vma, *page, ksm_scan.address);
>>  		flush_dcache_page(*page);
>
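As for the static_key change mentioned above, here is a rough and
untested sketch of what I have in mind; the key name and exact
placement are provisional:

#include <linux/jump_label.h>

/* would replace the ksm_use_zero_pages bool; name is a placeholder */
static DEFINE_STATIC_KEY_FALSE(ksm_use_zero_pages_key);

/* in cmp_and_merge_page(), the nested ifs collapse into one test */
	if (static_branch_unlikely(&ksm_use_zero_pages_key) &&
	    checksum == zero_checksum) {
		/*
		 * In case of failure, the page was not really empty, so
		 * we need to continue. Otherwise we're done.
		 */
		if (!try_to_merge_with_kernel_zero_page(rmap_item, page))
			return;
	}

/* and the sysfs store handler flips the key instead of the bool */
	if (value)
		static_branch_enable(&ksm_use_zero_pages_key);
	else
		static_branch_disable(&ksm_use_zero_pages_key);

This would also address the nested-if question: with the key folded
into the condition, only one 'if' is left on the fast path.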