From: Barry Song <v-songbaohua@xxxxxxxx>

Quoting Ying's comment:

  A user space interface can be implemented to select different swap-in
  order policies, similar to the mTHP allocation order policy. We need a
  distinct policy because the performance characteristics of memory
  allocation differ significantly from those of swap-in. For example,
  SSD read speeds can be much slower than memory allocation. With policy
  selection, I believe we can implement mTHP swap-in for
  non-SWAP_SYNCHRONOUS scenarios as well. However, users need to
  understand the implications of their choices. I think it's better to
  start with at least "always" and "never". I believe we will add "auto"
  in the future for automatic tuning, which can eventually become the
  default.

Suggested-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
 Documentation/admin-guide/mm/transhuge.rst |  6 +++
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 44 ++++++++++++++++++++++
 mm/memory.c                                |  3 +-
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 058485daf186..2e94e956ee12 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -144,6 +144,12 @@ hugepage sizes have enabled="never". If enabling multiple hugepage
 sizes, the kernel will select the most appropriate enabled size for a
 given allocation.
 
+Transparent Hugepage Swap-in for anonymous memory can be enabled or
+disabled per supported THP size with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/swapin_enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/swapin_enabled
+
 It's also possible to limit defrag efforts in the VM to generate
 anonymous hugepages in case they're not immediately free to madvise
 regions or to never try to defrag memory and simply fallback to regular
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..25174305b17f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -92,6 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
 #define TVA_IN_PF		(1 << 1)	/* Page fault handler */
 #define TVA_ENFORCE_SYSFS	(1 << 2)	/* Obey sysfs configuration */
+#define TVA_IN_SWAPIN		(1 << 3)	/* Do swap-in */
 
 #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
 	(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0167dc27e365..41460847988c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -80,6 +80,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
+unsigned long huge_anon_orders_swapin_always __read_mostly;
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
@@ -88,6 +89,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 {
 	bool smaps = tva_flags & TVA_SMAPS;
 	bool in_pf = tva_flags & TVA_IN_PF;
+	bool in_swapin = tva_flags & TVA_IN_SWAPIN;
 	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
 	unsigned long supported_orders;
 
@@ -100,6 +102,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
 
 	orders &= supported_orders;
+	if (in_swapin)
+		orders &= READ_ONCE(huge_anon_orders_swapin_always);
 	if (!orders)
 		return 0;
 
@@ -523,8 +527,48 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 static struct kobj_attribute thpsize_enabled_attr =
 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
 
+static DEFINE_SPINLOCK(huge_anon_orders_swapin_lock);
+
+static ssize_t thpsize_swapin_enabled_show(struct kobject *kobj,
+					   struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+	const char *output;
+
+	if (test_bit(order, &huge_anon_orders_swapin_always))
+		output = "[always] never";
+	else
+		output = "always [never]";
+
+	return sysfs_emit(buf, "%s\n", output);
+}
+
+static ssize_t thpsize_swapin_enabled_store(struct kobject *kobj,
+					    struct kobj_attribute *attr,
+					    const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	ssize_t ret = count;
+
+	if (sysfs_streq(buf, "always")) {
+		spin_lock(&huge_anon_orders_swapin_lock);
+		set_bit(order, &huge_anon_orders_swapin_always);
+		spin_unlock(&huge_anon_orders_swapin_lock);
+	} else if (sysfs_streq(buf, "never")) {
+		spin_lock(&huge_anon_orders_swapin_lock);
+		clear_bit(order, &huge_anon_orders_swapin_always);
+		spin_unlock(&huge_anon_orders_swapin_lock);
+	} else
+		ret = -EINVAL;
+
+	return ret;
+}
+static struct kobj_attribute thpsize_swapin_enabled_attr =
+	__ATTR(swapin_enabled, 0644, thpsize_swapin_enabled_show, thpsize_swapin_enabled_store);
+
 static struct attribute *thpsize_attrs[] = {
 	&thpsize_enabled_attr.attr,
+	&thpsize_swapin_enabled_attr.attr,
 #ifdef CONFIG_SHMEM
 	&thpsize_shmem_enabled_attr.attr,
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 14048e9285d4..27c77f739a2c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4091,7 +4091,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	 * and suitable for swapping THP.
 	 */
 	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
-			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+			TVA_IN_PF | TVA_IN_SWAPIN | TVA_ENFORCE_SYSFS,
+			BIT(PMD_ORDER) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 	orders = thp_swap_suitable_orders(swp_offset(entry),
 					  vmf->address, orders);
-- 
2.34.1
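
P.S. For anyone who wants to try this out, a usage sketch (the
hugepages-64kB directory is only an example; which hugepages-<size>kB
directories exist depends on the architecture and base page size):

	# allow 64kB mTHP to be allocated at page-fault time
	echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
	# additionally opt this size in to mTHP swap-in
	echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/swapin_enabled
	# the selected policy is shown in brackets
	cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/swapin_enabled
	[always] never

Note that swapin_enabled acts as an additional filter rather than an
override: alloc_swap_folio() passes TVA_IN_SWAPIN together with
TVA_ENFORCE_SYSFS, so __thp_vma_allowable_orders() masks the candidate
orders with huge_anon_orders_swapin_always on top of the existing
"enabled" policy. A size set to "never" in enabled therefore remains
unusable for swap-in regardless of its swapin_enabled setting.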