On Mon, Nov 22, 2021 at 2:22 AM Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx> wrote:
>
> Some workloads that access a set of data entities follow data locality,
> also known as locality of reference, which means there is a high
> probability of accessing some data soon after nearby data has been
> accessed.
>
> Some systems with different memory types rely on NUMA balancing to
> promote hot memory from the slow memory type to the fast memory type
> to improve performance. For such workloads we can promote several
> sequential pages on slow memory at one time, according to the data
> locality, to further improve the performance.

Fault around for NUMA fault definitely could reduce the overhead for
NUMA balancing by having fewer faults. I think this could be extended
to regular NUMA balancing too. But I'm not sure whether false
positives are worth worrying about or not. I recall Mel proposed fault
around too (not in a patch, but he shared some ideas). Added Mel to
this thread.

>
> Testing with mysql shows about a 5% performance improvement, as below.
>
> Machine: 16 CPUs, 64G DRAM, 256G AEP
>
> sysbench /usr/share/sysbench/tests/include/oltp_legacy/oltp.lua
> --mysql-user=root --mysql-password=root --oltp-test-mode=complex
> --oltp-tables-count=65 --oltp-table-size=5000000 --threads=20 --time=600
> --report-interval=10
>
> No proactive promotion:
> transactions
> 2259245 (3765.37 per sec.)
> 2312605 (3854.31 per sec.)
> 2325907 (3876.47 per sec.)
>
> Proactive promotion bytes=16384:
> transactions
> 2419023 (4031.66 per sec.)
> 2451903 (4086.47 per sec.)
> 2441941 (4068.68 per sec.)
>
> Suggested-by: Xunlei Pang <xlpang@xxxxxxxxxxxxxxxxx>
> Signed-off-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
> ---
> Note: This patch is based on "NUMA balancing: optimize memory placement
> for memory tiering system" [1] from Huang Ying.
>
> [1] https://lore.kernel.org/lkml/87bl2gsnrd.fsf@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/T/
> ---
>  mm/memory.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 89 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 5d9ed74c66f9..626523cbd60f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4058,7 +4058,28 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  static unsigned long fault_around_bytes __read_mostly =
>  	rounddown_pow_of_two(65536);
>
> +static unsigned long numa_around_bytes __read_mostly;
> +
>  #ifdef CONFIG_DEBUG_FS
> +static int numa_around_bytes_get(void *data, u64 *val)
> +{
> +	*val = numa_around_bytes;
> +	return 0;
> +}
> +
> +static int numa_around_bytes_set(void *data, u64 val)
> +{
> +	if (val / PAGE_SIZE > PTRS_PER_PTE)
> +		return -EINVAL;
> +	if (val > PAGE_SIZE)
> +		numa_around_bytes = rounddown_pow_of_two(val);
> +	else
> +		numa_around_bytes = 0; /* rounddown_pow_of_two(0) is undefined */
> +	return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(numa_around_bytes_fops,
> +			 numa_around_bytes_get, numa_around_bytes_set, "%llu\n");
> +
>  static int fault_around_bytes_get(void *data, u64 *val)
>  {
>  	*val = fault_around_bytes;
> @@ -4086,6 +4107,9 @@ static int __init fault_around_debugfs(void)
>  {
>  	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
>  				   &fault_around_bytes_fops);
> +
> +	debugfs_create_file_unsafe("numa_around_bytes", 0644, NULL, NULL,
> +				   &numa_around_bytes_fops);
>  	return 0;
>  }
>  late_initcall(fault_around_debugfs);
> @@ -4323,16 +4347,55 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
>  	return mpol_misplaced(page, vma, addr);
>  }
>
> +static bool can_next_numa_page(struct vm_fault *vmf, unsigned long *fault_addr,
> +			       unsigned long max_numa_addr)
> +{
> +	unsigned long next_fault_addr = vmf->address + PAGE_SIZE;
> +
> +	if (next_fault_addr >= max_numa_addr)
> +		return false;
> +
> +	*fault_addr = next_fault_addr;
> +	vmf->pte = pte_offset_map(vmf->pmd, next_fault_addr);
> +	vmf->orig_pte = *vmf->pte;
> +	if (pte_protnone(vmf->orig_pte))
> +		return true;
> +
> +	return false;
> +}
> +
>  static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page = NULL;
> -	int page_nid = NUMA_NO_NODE;
> +	int page_nid;
>  	int last_cpupid;
>  	int target_nid;
>  	pte_t pte, old_pte;
> -	bool was_writable = pte_savedwrite(vmf->orig_pte);
> -	int flags = 0;
> +	bool was_writable;
> +	int flags;
> +	unsigned long max_numa_addr = 0;
> +	unsigned long numa_around_size = READ_ONCE(numa_around_bytes);
> +	unsigned long fault_address = vmf->address;
> +
> +	/*
> +	 * Make sure the range of proactive numa fault addresses does not
> +	 * exceed the current VMA or PMD.
> +	 */
> +	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
> +	    numa_around_size > 0) {
> +		unsigned long around_addr =
> +			(fault_address + numa_around_size) & PAGE_MASK;
> +		unsigned long next_pmd = (fault_address & PMD_MASK) + PMD_SIZE;
> +
> +		max_numa_addr = min3(around_addr, next_pmd,
> +				     vm_end_gap(vmf->vma));
> +	}
> +
> +try_next:
> +	was_writable = pte_savedwrite(vmf->orig_pte);
> +	flags = 0;
> +	page_nid = NUMA_NO_NODE;
>
>  	/*
>  	 * The "pte" at this point cannot be used safely without
> @@ -4350,7 +4413,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	old_pte = ptep_get(vmf->pte);
>  	pte = pte_modify(old_pte, vma->vm_page_prot);
>
> -	page = vm_normal_page(vma, vmf->address, pte);
> +	page = vm_normal_page(vma, fault_address, pte);
>  	if (!page)
>  		goto out_map;
>
> @@ -4369,6 +4432,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	if (!was_writable)
>  		flags |= TNF_NO_GROUP;
>
> +	/*
> +	 * According to the data locality of some workloads, there is a high
> +	 * probability of accessing some data soon after nearby data has been
> +	 * accessed. So for tiered memory systems, update the age of the
> +	 * sequential pages located on the slow memory type, to try to promote
> +	 * them to fast memory in advance and improve the performance.
> +	 */
> +	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
> +	    vmf->address != fault_address)
> +		xchg_page_access_time(page, jiffies_to_msecs(jiffies));
> +
>  	/*
>  	 * Flag if the page is shared between multiple address spaces. This
>  	 * is later used when determining whether to group tasks together
> @@ -4386,7 +4460,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  		last_cpupid = (-1 & LAST_CPUPID_MASK);
>  	else
>  		last_cpupid = page_cpupid_last(page);
> -	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
> +	target_nid = numa_migrate_prep(page, vma, fault_address, page_nid,
>  				       &flags);
>  	if (target_nid == NUMA_NO_NODE) {
>  		put_page(page);
> @@ -4400,7 +4474,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  		flags |= TNF_MIGRATED;
>  	} else {
>  		flags |= TNF_MIGRATE_FAIL;
> -		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
> +		vmf->pte = pte_offset_map(vmf->pmd, fault_address);
>  		spin_lock(vmf->ptl);
>  		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
>  			pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -4412,19 +4486,25 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  out:
>  	if (page_nid != NUMA_NO_NODE)
>  		task_numa_fault(last_cpupid, page_nid, 1, flags);
> +
> +	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
> +	    max_numa_addr > 0 && page_nid != NUMA_NO_NODE &&
> +	    can_next_numa_page(vmf, &fault_address, max_numa_addr))
> +		goto try_next;
> +
>  	return 0;
>  out_map:
>  	/*
>  	 * Make it present again, depending on how arch implements
>  	 * non-accessible ptes, some can allow access by kernel mode.
>  	 */
> -	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
> +	old_pte = ptep_modify_prot_start(vma, fault_address, vmf->pte);
>  	pte = pte_modify(old_pte, vma->vm_page_prot);
>  	pte = pte_mkyoung(pte);
>  	if (was_writable)
>  		pte = pte_mkwrite(pte);
> -	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
> -	update_mmu_cache(vma, vmf->address, vmf->pte);
> +	ptep_modify_prot_commit(vma, fault_address, vmf->pte, old_pte, pte);
> +	update_mmu_cache(vma, fault_address, vmf->pte);
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  	goto out;
>  }
> --
> 2.27.0
>
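For anyone who wants to give this a quick spin, below is a minimal userspace
sketch (illustrative only, not part of the patch). It assumes debugfs is
mounted at /sys/kernel/debug and that writing 2 to
/proc/sys/kernel/numa_balancing selects the memory tiering mode introduced by
the series in [1]; the 16384 value matches the "Proactive promotion
bytes=16384" configuration used in the mysql test above.

/* enable_proactive_promotion.c - hypothetical test helper, run as root */
#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	if (fputs(val, f) == EOF || fclose(f) == EOF) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}

int main(void)
{
	/* Select NUMA balancing memory tiering mode (assumed sysctl value 2). */
	write_str("/proc/sys/kernel/numa_balancing", "2");
	/* Promote up to 16KB of sequential slow-memory pages per hint fault. */
	write_str("/sys/kernel/debug/numa_around_bytes", "16384");
	return 0;
}

Writing 0 to numa_around_bytes leaves numa_around_size at zero in
do_numa_page(), so the proactive promotion path stays disabled, which is also
the default.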