Add a debug interface to control the range of the speculative numa fault,
which can be used to tune performance or even close the speculative numa
fault window entirely for some workloads.

Signed-off-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
---
 mm/memory.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 2c9ed63e4e23..a0f4a2a008cc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4052,7 +4052,29 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 static unsigned long fault_around_bytes __read_mostly =
 	rounddown_pow_of_two(65536);
 
+static unsigned long numa_around_bytes __read_mostly;
+
 #ifdef CONFIG_DEBUG_FS
+static int numa_around_bytes_get(void *data, u64 *val)
+{
+	*val = numa_around_bytes;
+	return 0;
+}
+
+static int numa_around_bytes_set(void *data, u64 val)
+{
+	if (val / PAGE_SIZE > PTRS_PER_PTE)
+		return -EINVAL;
+	if (val > PAGE_SIZE)
+		numa_around_bytes = rounddown_pow_of_two(val);
+	else
+		numa_around_bytes = 0; /* rounddown_pow_of_two(0) is undefined */
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(numa_around_bytes_fops,
+			 numa_around_bytes_get,
+			 numa_around_bytes_set, "%llu\n");
+
 static int fault_around_bytes_get(void *data, u64 *val)
 {
 	*val = fault_around_bytes;
@@ -4080,6 +4102,8 @@ static int __init fault_around_debugfs(void)
 {
 	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
 				   &fault_around_bytes_fops);
+	debugfs_create_file_unsafe("numa_around_bytes", 0644, NULL, NULL,
+				   &numa_around_bytes_fops);
 	return 0;
 }
 late_initcall(fault_around_debugfs);
@@ -4348,10 +4372,13 @@ static bool try_next_numa_page(struct vm_fault *vmf, unsigned int win_pages,
 	 ((win) & NUMA_FAULT_WINDOW_SIZE_MASK))
 
 static inline unsigned int numa_fault_max_pages(struct vm_area_struct *vma,
-						unsigned long fault_address)
+						unsigned long fault_address,
+						unsigned long numa_around_size)
 {
+	unsigned long numa_around_addr =
+		(fault_address + numa_around_size) & PAGE_MASK;
 	unsigned long pmd_end_addr = (fault_address & PMD_MASK) + PMD_SIZE;
-	unsigned long max_fault_addr = min_t(unsigned long, pmd_end_addr,
+	unsigned long max_fault_addr = min3(numa_around_addr, pmd_end_addr,
 					     vma->vm_end);
 
 	return (max_fault_addr - fault_address - 1) >> PAGE_SHIFT;
@@ -4360,12 +4387,24 @@ static inline unsigned int numa_fault_max_pages(struct vm_area_struct *vma,
 static unsigned int adjust_numa_fault_window(struct vm_area_struct *vma,
 					     unsigned long fault_address)
 {
+	unsigned long numa_around_size = READ_ONCE(numa_around_bytes);
 	unsigned long numafault_ahead = GET_NUMA_FAULT_INFO(vma);
 	unsigned long prev_start = NUMA_FAULT_WINDOW_START(numafault_ahead);
 	unsigned int prev_pages = NUMA_FAULT_WINDOW_SIZE(numafault_ahead);
 	unsigned long win_start;
 	unsigned int win_pages, max_fault_pages;
 
+	/*
+	 * Shut down the proactive numa fault window if numa_around_bytes
+	 * is set to 0.
+	 */
+	if (!numa_around_size) {
+		if (numafault_ahead)
+			atomic_long_set(&vma->numafault_ahead_info,
+					NUMA_FAULT_INFO(0, 0));
+		return 0;
+	}
+
 	win_start = fault_address + PAGE_SIZE;
 
 	/*
@@ -4437,7 +4476,8 @@ static unsigned int adjust_numa_fault_window(struct vm_area_struct *vma,
 	 * Make sure the size of ahead numa fault address is less than the
 	 * size of current VMA or PMD.
 	 */
-	max_fault_pages = numa_fault_max_pages(vma, fault_address);
+	max_fault_pages = numa_fault_max_pages(vma, fault_address,
+					       numa_around_size);
 	if (win_pages > max_fault_pages)
 		win_pages = max_fault_pages;
 
-- 
2.27.0
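
A usage sketch for the new knob (not part of the patch; it assumes debugfs
is mounted at /sys/kernel/debug, and the file is created at the debugfs
root since a NULL parent is passed to debugfs_create_file_unsafe(), as is
done for fault_around_bytes):

    # Read the current speculative numa fault range (in bytes).
    cat /sys/kernel/debug/numa_around_bytes

    # Limit the speculative window to 64KB. Values larger than PAGE_SIZE
    # are rounded down to a power of two; values larger than
    # PTRS_PER_PTE pages are rejected with -EINVAL.
    echo 65536 > /sys/kernel/debug/numa_around_bytes

    # Close the speculative numa fault window entirely.
    echo 0 > /sys/kernel/debug/numa_around_bytes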