At system initialization time, min_free_kbytes is calculated based on the amount of memory in the system. If THP is enabled, then khugepaged is started and min_free_kbytes may be adjusted in an attempt to reserve some pageblocks for THP allocations. When memory is offlined or onlined, min_free_kbytes is recalculated and adjusted based on the amount of memory. However, the adjustment for THP is not considered. Here is an example from a 2-node system with 8GB of memory. # cat /proc/sys/vm/min_free_kbytes 90112 # echo 0 > /sys/devices/system/node/node1/memory56/online # cat /proc/sys/vm/min_free_kbytes 11243 # echo 1 > /sys/devices/system/node/node1/memory56/online # cat /proc/sys/vm/min_free_kbytes 11412 One would expect that min_free_kbytes would return to its original value after the offline/online operations. Create a simple interface for THP/khugepaged based adjustment and call this whenever min_free_kbytes is adjusted. Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx> --- include/linux/khugepaged.h | 5 +++++ mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++----- mm/page_alloc.c | 4 +++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index bc45ea1efbf7..8f02d3575829 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +extern bool khugepaged_adjust_min_free_kbytes(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); #else @@ -81,6 +82,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, { return 0; } +static bool khugepaged_adjust_min_free_kbytes(void) +{ + return false; +} static inline void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { diff --git 
a/mm/khugepaged.c b/mm/khugepaged.c index b679908743cb..d8040cf19e98 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2138,7 +2138,7 @@ static int khugepaged(void *none) return 0; } -static void set_recommended_min_free_kbytes(void) +bool __khugepaged_adjust_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; @@ -2174,17 +2174,26 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", + pr_info_once("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; + return true; } - setup_per_zone_wmarks(); + + return false; +} + +static void set_recommended_min_free_kbytes(void) +{ + if (__khugepaged_adjust_min_free_kbytes()) + setup_per_zone_wmarks(); } -int start_stop_khugepaged(void) +static struct task_struct *khugepaged_thread __read_mostly; + +int __ref start_stop_khugepaged(void) { - static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); int err = 0; @@ -2207,8 +2216,24 @@ int start_stop_khugepaged(void) } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; + init_per_zone_wmark_min(); } fail: mutex_unlock(&khugepaged_mutex); return err; } + +bool khugepaged_adjust_min_free_kbytes(void) +{ + bool ret = false; + + /* + * This is a bit racy, and we could miss transitions. However, + * start/stop code above will make additional adjustments at the + * end of transitions. 
+ */ + if (khugepaged_enabled() && khugepaged_thread) + ret = __khugepaged_adjust_min_free_kbytes(); + + return ret; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d047bf7d8fd4..a7b3a6663ba6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include <linux/lockdep.h> #include <linux/nmi.h> #include <linux/psi.h> +#include <linux/khugepaged.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -7827,9 +7828,10 @@ int __meminit init_per_zone_wmark_min(void) if (min_free_kbytes > 65536) min_free_kbytes = 65536; } else { - pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + pr_warn_once("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); } + (void)khugepaged_adjust_min_free_kbytes(); setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); -- 2.24.1