Re: [PATCH] mm: always consider THP when adjusting min_free_kbytes

On Tue, 4 Feb 2020, Mike Kravetz wrote:

> At system initialization time, min_free_kbytes is calculated based
> on the amount of memory in the system.  If THP is enabled, then
> khugepaged is started and min_free_kbytes may be adjusted in an
> attempt to reserve some pageblocks for THP allocations.
> 
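For anyone not familiar with this path: khugepaged's recommendation
scales with the number of populated zones, which is why hotplug events
move it.  Below is a rough userspace sketch of the heuristic in
set_recommended_min_free_kbytes() -- the helper name and the 4K page /
2MB pageblock constants are assumptions, and the real kernel code also
clamps the result to nr_free_buffer_pages() / 20:

	#include <stdio.h>

	#define PAGE_SHIFT		12	/* assuming 4K pages */
	#define PAGEBLOCK_PAGES		512	/* assuming 2MB pageblocks */
	#define MIGRATE_PCPTYPES	3

	/* hypothetical helper, not the kernel function itself */
	static unsigned long thp_recommended_min_kbytes(int nr_populated_zones)
	{
		/* keep 2 free pageblocks per zone for fragmentation avoidance */
		unsigned long pages = PAGEBLOCK_PAGES * nr_populated_zones * 2;

		/* plus MIGRATE_PCPTYPES^2 worth of pageblocks per zone */
		pages += PAGEBLOCK_PAGES * nr_populated_zones *
			 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

		return pages << (PAGE_SHIFT - 10);	/* pages -> kbytes */
	}

	int main(void)
	{
		printf("%lu\n", thp_recommended_min_kbytes(4));	/* 90112 */
		return 0;
	}

With 4 populated zones this reproduces the 90112 in the example below.
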
> When memory is offlined or onlined, min_free_kbytes is recalculated
> and adjusted based on the amount of memory.  However, the adjustment
> for THP is not considered.  Here is an example from a 2 node system
> with 8GB of memory.
> 
>  # cat /proc/sys/vm/min_free_kbytes
>  90112
>  # echo 0 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11243
>  # echo 1 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11412
> 

Ah, that doesn't look good.

> One would expect that min_free_kbytes would return to its original
> value after the offline/online operations.
> 
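Incidentally, the ~11MB numbers above line up with
init_per_zone_wmark_min()'s base heuristic once the THP bump is dropped:

	min_free_kbytes = int_sqrt(lowmem_kbytes * 16)
	                ≈ int_sqrt(8388608 * 16) = 11585	(for a full 8GB)

a bit above the observed 11243/11412 because lowmem_kbytes is derived
from nr_free_buffer_pages(), which is somewhat less than total memory.
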
> Create a simple interface for THP/khugepaged based adjustment and
> call this whenever min_free_kbytes is adjusted.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> ---
>  include/linux/khugepaged.h |  5 +++++
>  mm/khugepaged.c            | 35 ++++++++++++++++++++++++++++++-----
>  mm/page_alloc.c            |  4 +++-
>  3 files changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index bc45ea1efbf7..8f02d3575829 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm);
>  extern void __khugepaged_exit(struct mm_struct *mm);
>  extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>  				      unsigned long vm_flags);
> +extern bool khugepaged_adjust_min_free_kbytes(void);
>  #ifdef CONFIG_SHMEM
>  extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
>  #else
> @@ -81,6 +82,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
>  {
>  	return 0;
>  }
> +static inline bool khugepaged_adjust_min_free_kbytes(void)
> +{
> +	return false;
> +}
>  static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
>  					   unsigned long addr)
>  {
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b679908743cb..d8040cf19e98 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2138,7 +2138,7 @@ static int khugepaged(void *none)
>  	return 0;
>  }
>  
> -static void set_recommended_min_free_kbytes(void)
> +bool __khugepaged_adjust_min_free_kbytes(void)
>  {
>  	struct zone *zone;
>  	int nr_zones = 0;
> @@ -2174,17 +2174,26 @@ static void set_recommended_min_free_kbytes(void)
>  
>  	if (recommended_min > min_free_kbytes) {
>  		if (user_min_free_kbytes >= 0)
> -			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
> +			pr_info_once("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
>  				min_free_kbytes, recommended_min);
>  
>  		min_free_kbytes = recommended_min;
> +		return true;
>  	}
> -	setup_per_zone_wmarks();
> +
> +	return false;
> +}
> +
> +static void set_recommended_min_free_kbytes(void)
> +{
> +	if (__khugepaged_adjust_min_free_kbytes())
> +		setup_per_zone_wmarks();
>  }
>  
> -int start_stop_khugepaged(void)
> +static struct task_struct *khugepaged_thread __read_mostly;
> +
> +int __ref start_stop_khugepaged(void)
>  {
> -	static struct task_struct *khugepaged_thread __read_mostly;
>  	static DEFINE_MUTEX(khugepaged_mutex);
>  	int err = 0;
>  
> @@ -2207,8 +2216,24 @@ int start_stop_khugepaged(void)
>  	} else if (khugepaged_thread) {
>  		kthread_stop(khugepaged_thread);
>  		khugepaged_thread = NULL;
> +		init_per_zone_wmark_min();
>  	}
>  fail:
>  	mutex_unlock(&khugepaged_mutex);
>  	return err;
>  }
> +
> +bool khugepaged_adjust_min_free_kbytes(void)
> +{
> +	bool ret = false;
> +
> +	/*
> +	 * This is a bit racy, and we could miss transitions.  However,
> +	 * start/stop code above will make additional adjustments at the
> +	 * end of transitions.
> +	 */
> +	if (khugepaged_enabled() && khugepaged_thread)
> +		ret = __khugepaged_adjust_min_free_kbytes();
> +
> +	return ret;
> +}
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d047bf7d8fd4..a7b3a6663ba6 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -68,6 +68,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
>  #include <linux/psi.h>
> +#include <linux/khugepaged.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -7827,9 +7828,10 @@ int __meminit init_per_zone_wmark_min(void)
>  		if (min_free_kbytes > 65536)
>  			min_free_kbytes = 65536;
>  	} else {
> -		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
> +		pr_warn_once("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
>  				new_min_free_kbytes, user_min_free_kbytes);
>  	}
> +	(void)khugepaged_adjust_min_free_kbytes();
>  	setup_per_zone_wmarks();
>  	refresh_zone_stat_thresholds();
>  	setup_per_zone_lowmem_reserve();

Hmm, if khugepaged_adjust_min_free_kbytes() increases min_free_kbytes for 
thp, then the user has no way to override this increase via 
vm.min_free_kbytes?
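
For example (hypothetical values on the 8GB box above), if I read the
call path right:

 # sysctl -w vm.min_free_kbytes=45056
 # echo 0 > /sys/devices/system/node/node1/memory56/online
 # echo 1 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 90112

because init_per_zone_wmark_min() now calls
khugepaged_adjust_min_free_kbytes() after the user_min_free_kbytes
check, and that raises min_free_kbytes whenever the THP recommendation
exceeds it.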

IIUC, with this change, memory hotplug events properly increase 
min_free_kbytes for thp optimization, but they also no longer respect a 
previously user-defined value?

So it looks like this fixes an obvious correctness issue, but it also now 
requires users to re-set the sysctl after a hotplug event if they want to 
decrease the min watermark.



