Your suggestions below are all valuable, and I will adjust the code
accordingly. Since the v6 patch series has already been merged into the
-next branch, I think sending follow-up patches is better, to reduce
the maintainers' workload.

>> Cc: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
>> Cc: Xuexin Jiang <jiang.xuexin@xxxxxxxxxx>
>> Reviewed-by: Xiaokai Ran <ran.xiaokai@xxxxxxxxxx>
>> Reviewed-by: Yang Yang <yang.yang29@xxxxxxxxxx>
>> ---
>>  mm/ksm.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
>>  1 file changed, 111 insertions(+), 30 deletions(-)
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 905a79d213da..ab04b44679c8 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -214,6 +214,7 @@ struct ksm_rmap_item {
>>  #define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
>>  #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
>>  #define STABLE_FLAG	0x200	/* is listed from the stable tree */
>> +#define ZERO_PAGE_FLAG	0x400	/* is zero page placed by KSM */
>>
>>  /* The stable and unstable tree heads */
>>  static struct rb_root one_stable_tree[1] = { RB_ROOT };
>
>> @@ -420,6 +421,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
>>  	return atomic_read(&mm->mm_users) == 0;
>>  }
>>
>> +enum break_ksm_pmd_entry_return_flag {
>
> what about 0 ? I think it would look cleaner if every possible value
> was explicitly listed here

You're right. I'll list the 0 case explicitly in a new patch.

>> +	HAVE_KSM_PAGE = 1,
>> +	HAVE_ZERO_PAGE
>> +};
>> +
>>  static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
>>  			struct mm_walk *walk)
>>  {
>> @@ -427,6 +433,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>>  	spinlock_t *ptl;
>>  	pte_t *pte;
>>  	int ret;
>> +	bool is_zero_page = false;
>
> this ^ should probably be moved further up in the list of variables
> (i.e. reverse christmas tree)

Good point. I'll fix it in a new patch.

>>
>>  	if (pmd_leaf(*pmd) || !pmd_present(*pmd))
>>  		return 0;
>> @@ -434,6 +441,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>>  	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
>>  	if (pte_present(*pte)) {
>>  		page = vm_normal_page(walk->vma, addr, *pte);
>> +		if (!page)
>> +			is_zero_page = is_zero_pfn(pte_pfn(*pte));
>>  	} else if (!pte_none(*pte)) {
>>  		swp_entry_t entry = pte_to_swp_entry(*pte);
>>
>> @@ -444,7 +453,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>>  		if (is_migration_entry(entry))
>>  			page = pfn_swap_entry_to_page(entry);
>>  	}
>> -	ret = page && PageKsm(page);
>> +
>> +	if (page && PageKsm(page))
>> +		ret = HAVE_KSM_PAGE;
>> +	else if (is_zero_page)
>> +		ret = HAVE_ZERO_PAGE;
>> +	else
>> +		ret = 0;
>
> and here it would be a great place to use the enum instead of
> hardcoding 0

Good point. I'll fix it in a new patch.

>> +
>>  	pte_unmap_unlock(pte, ptl);
>>  	return ret;
>>  }
>> @@ -466,19 +482,22 @@ static const struct mm_walk_ops break_ksm_ops = {
>>   * of the process that owns 'vma'. We also do not want to enforce
>>   * protection keys here anyway.
>
> please add a (short) explanation of when and why the new
> unshare_zero_page flag should be used.
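Will do in a new patch. As a first draft (wording certainly open to
improvement), I would document it roughly like this above break_ksm():

/*
 * @unshare_zero_page: if true, an empty zero page placed by KSM at
 * @addr is also broken out via an unsharing fault; if false, only
 * real KSM pages trigger the fault and KSM-placed zero pages are
 * left in place. Only teardown paths (e.g. freeing an rmap_item)
 * should pass true, since the zero page's rmap_item is about to go
 * away there.
 */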
>
>>   */
>> -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>> +static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
>> +		bool unshare_zero_page)
>>  {
>>  	vm_fault_t ret = 0;
>>
>>  	do {
>> -		int ksm_page;
>> +		int walk_result;
>>
>>  		cond_resched();
>> -		ksm_page = walk_page_range_vma(vma, addr, addr + 1,
>> +		walk_result = walk_page_range_vma(vma, addr, addr + 1,
>>  			&break_ksm_ops, NULL);
>> -		if (WARN_ON_ONCE(ksm_page < 0))
>> -			return ksm_page;
>> -		if (!ksm_page)
>
> if (walk_result == ...)

Fine, I'll change it in the new patch.

>> +		if (WARN_ON_ONCE(walk_result < 0))
>> +			return walk_result;
>> +		if (!walk_result)
>> +			return 0;
>> +		if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page)
>>  			return 0;
>>  		ret = handle_mm_fault(vma, addr,
>>  			FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
>> @@ -539,7 +558,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
>>  	mmap_read_lock(mm);
>>  	vma = find_mergeable_vma(mm, addr);
>>  	if (vma)
>> -		break_ksm(vma, addr);
>> +		break_ksm(vma, addr, false);
>>  	mmap_read_unlock(mm);
>>  }
>>
>> @@ -764,6 +783,30 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
>>  	return NULL;
>>  }
>>
>> +/*
>> + * Cleaning the rmap_item's ZERO_PAGE_FLAG
>
> this is not what you are doing, though. in case new flags are added,
> this function will cause problems.
>
>> + * This function will be called when unshare or writing on zero pages.
>> + */
>> +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item)
>> +{
>> +	if (rmap_item->address & ZERO_PAGE_FLAG)
>> +		rmap_item->address &= PAGE_MASK;
>
>  ... &= ~ZERO_PAGE_FLAG;
>
> this way you only clear the flag, and you won't interfere with
> potential new flags that might be introduced in the future. (I
> really hope we won't need new flags in the future because the code is
> already complex enough as it is, but you never know)

How about 'rmap_item->address &= (PAGE_MASK | ~ZERO_PAGE_FLAG);'?
Since the only bit that is clear in both PAGE_MASK and ~ZERO_PAGE_FLAG
is the ZERO_PAGE_FLAG bit itself, the OR evaluates to exactly
~ZERO_PAGE_FLAG, so either spelling clears only that flag (a small
check program follows further below in this mail).

>
>> +}
>> +
>> +/* Only called when rmap_item is going to be freed */
>> +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item)
>> +{
>> +	struct vm_area_struct *vma;
>> +
>> +	if (rmap_item->address & ZERO_PAGE_FLAG) {
>> +		vma = vma_lookup(rmap_item->mm, rmap_item->address);
>> +		if (vma && !ksm_test_exit(rmap_item->mm))
>> +			break_ksm(vma, rmap_item->address, true);
>> +	}
>> +	/* Put at last. */
>> +	clean_rmap_item_zero_flag(rmap_item);
>> +}
>> +
>>  /*
>>   * Removing rmap_item from stable or unstable tree.
>>   * This function will clean the information from the stable/unstable tree.
>> @@ -824,6 +867,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
>>  		struct ksm_rmap_item *rmap_item = *rmap_list;
>>  		*rmap_list = rmap_item->rmap_list;
>>  		remove_rmap_item_from_tree(rmap_item);
>> +		unshare_zero_pages(rmap_item);
>>  		free_rmap_item(rmap_item);
>>  	}
>>  }
>> @@ -853,7 +897,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
>>  		if (signal_pending(current))
>>  			err = -ERESTARTSYS;
>>  		else
>> -			err = break_ksm(vma, addr);
>> +			err = break_ksm(vma, addr, false);
>>  	}
>>  	return err;
>>  }
>> @@ -2044,6 +2088,39 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
>>  	rmap_item->mm->ksm_merging_pages++;
>>  }
>>
>> +static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item,
>> +		struct page *page)
>
> this line is less than 100 columns, please don't break it ^

Fine, I'll fix it in a new patch.
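Coming back to the clean_rmap_item_zero_flag() mask question above,
here is a quick standalone userspace check (a hypothetical test
program, not kernel code; it assumes 4K pages and copies the flag
value from this patch):

#include <assert.h>
#include <stdio.h>

#define PAGE_MASK	(~0xFFFUL)	/* assumes 4K pages */
#define ZERO_PAGE_FLAG	0x400UL		/* bit 10, as in this patch */

int main(void)
{
	/* (PAGE_MASK | ~ZERO_PAGE_FLAG) sets every bit except bit 10 */
	assert((PAGE_MASK | ~ZERO_PAGE_FLAG) == ~ZERO_PAGE_FLAG);
	printf("both forms clear only ZERO_PAGE_FLAG\n");
	return 0;
}

So functionally the two spellings are identical; it's only a question
of which one documents the intent better.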
>> +{
>> +	struct mm_struct *mm = rmap_item->mm;
>> +	int err = 0;
>> +
>> +	/*
>> +	 * It should not take ZERO_PAGE_FLAG because on one hand,
>> +	 * get_next_rmap_item don't return zero pages' rmap_item.
>> +	 * On the other hand, even if zero page was writen as
>> +	 * anonymous page, rmap_item has been cleaned after
>> +	 * stable_tree_search
>> +	 */
>> +	if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) {
>> +		struct vm_area_struct *vma;
>> +
>> +		mmap_read_lock(mm);
>> +		vma = find_mergeable_vma(mm, rmap_item->address);
>> +		if (vma) {
>> +			err = try_to_merge_one_page(vma, page,
>> +					ZERO_PAGE(rmap_item->address));
>
> this line is also less than 100 columns, please don't break it ^
>
>> +			if (!err)
>> +				rmap_item->address |= ZERO_PAGE_FLAG;
>> +		} else {
>> +			/* If the vma is out of date, we do not need to continue. */
>> +			err = 0;
>> +		}
>> +		mmap_read_unlock(mm);
>> +	}
>> +
>> +	return err;
>> +}
>> +
>>  /*
>>   * cmp_and_merge_page - first see if page can be merged into the stable tree;
>>   * if not, compare checksum to previous and if it's the same, see if page can
>> @@ -2055,7 +2132,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
>>   */
>>  static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
>>  {
>> -	struct mm_struct *mm = rmap_item->mm;
>>  	struct ksm_rmap_item *tree_rmap_item;
>>  	struct page *tree_page = NULL;
>>  	struct ksm_stable_node *stable_node;
>> @@ -2092,6 +2168,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite
>>  	}
>>
>>  	remove_rmap_item_from_tree(rmap_item);
>> +	clean_rmap_item_zero_flag(rmap_item);
>>
>>  	if (kpage) {
>>  		if (PTR_ERR(kpage) == -EBUSY)
>> @@ -2128,29 +2205,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite
>>  	 * Same checksum as an empty page. We attempt to merge it with the
>>  	 * appropriate zero page if the user enabled this via sysfs.
>>  	 */
>> -	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
>
> (like here, see comment below)
>
>> -		struct vm_area_struct *vma;
>> -
>> -		mmap_read_lock(mm);
>> -		vma = find_mergeable_vma(mm, rmap_item->address);
>> -		if (vma) {
>> -			err = try_to_merge_one_page(vma, page,
>> -					ZERO_PAGE(rmap_item->address));
>> -		} else {
>> +	if (ksm_use_zero_pages) {
>> +		if (checksum == zero_checksum)
>
> if I see correctly, you end up with three ifs nested? why not just one
> if with all the conditions?

Yes, I thought three separate 'if's were structurally clearer. But
let's not change this here for now: I am going to submit a patch that
replaces ksm_use_zero_pages with a static_key, and that will
restructure this code anyway (rough sketch at the end of this mail).

>>  			/*
>> -			 * If the vma is out of date, we do not need to
>> -			 * continue.
>> +			 * In case of failure, the page was not really empty, so we
>> +			 * need to continue. Otherwise we're done.
>>  			 */
>> -			err = 0;
>> -		}
>> -		mmap_read_unlock(mm);
>> -		/*
>> -		 * In case of failure, the page was not really empty, so we
>> -		 * need to continue. Otherwise we're done.
>> -		 */
>> -		if (!err)
>> -			return;
>> +			if (!try_to_merge_with_kernel_zero_page(rmap_item, page))
>> +				return;
>>  	}
>> +
>>  	tree_rmap_item =
>>  		unstable_tree_search_insert(rmap_item, page, &tree_page);
>>  	if (tree_rmap_item) {
>> @@ -2230,6 +2294,7 @@ static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr,
>>  	 * is ineligible or discarded, e.g. MADV_DONTNEED.
>>  	 */
>>  	remove_rmap_item_from_tree(rmap_item);
>> +	unshare_zero_pages(rmap_item);
>>  	free_rmap_item(rmap_item);
>>  }
>>
>> @@ -2352,6 +2417,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>>  	}
>>  	if (is_zone_device_page(*page))
>>  		goto next_page;
>> +	if (is_zero_pfn(page_to_pfn(*page))) {
>> +		/*
>> +		 * To monitor ksm zero pages which becomes non-anonymous,
>> +		 * we have to save each rmap_item of zero pages by
>> +		 * try_to_get_old_rmap_item() walking on
>> +		 * ksm_scan.rmap_list, otherwise their rmap_items will be
>> +		 * freed by the next turn of get_next_rmap_item(). The
>> +		 * function get_next_rmap_item() will free all "skipped"
>> +		 * rmap_items because it thinks its areas as UNMERGEABLE.
>> +		 */
>> +		rmap_item = try_to_get_old_rmap_item(ksm_scan.address,
>> +				ksm_scan.rmap_list);
>> +		if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG))
>> +			ksm_scan.rmap_list = &rmap_item->rmap_list;
>> +		goto next_page;
>> +	}
>>  	if (PageAnon(*page)) {
>>  		flush_anon_page(vma, *page, ksm_scan.address);
>>  		flush_dcache_page(*page);
>
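As for the static_key change mentioned above, here is a rough and
untested sketch of what I have in mind; the key name and exact
placement are provisional:

#include <linux/jump_label.h>

/* would replace the ksm_use_zero_pages bool; name is a placeholder */
static DEFINE_STATIC_KEY_FALSE(ksm_use_zero_pages_key);

/* in cmp_and_merge_page(), the nested ifs collapse into one test */
	if (static_branch_unlikely(&ksm_use_zero_pages_key) &&
	    checksum == zero_checksum) {
		/*
		 * In case of failure, the page was not really empty, so
		 * we need to continue. Otherwise we're done.
		 */
		if (!try_to_merge_with_kernel_zero_page(rmap_item, page))
			return;
	}

/* and the sysfs store handler flips the key instead of the bool */
	if (value)
		static_branch_enable(&ksm_use_zero_pages_key);
	else
		static_branch_disable(&ksm_use_zero_pages_key);

This would also address the nested-if question: with the key folded
into the condition, only one 'if' is left on the fast path.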