Storing high-order pages on the PCP lists is not always a win and can
even hurt some workloads, so it is disabled by default for high orders
other than PMD_ORDER.  Since there are already per-supported-THP-size
interfaces to configure mTHP behaviour, add a new control, pcp_enabled,
under those interfaces to let the user enable or disable storing pages
of the specified high order on the PCP lists.  This does not change the
existing behaviour for order == PMD_ORDER and
order <= PAGE_ALLOC_COSTLY_ORDER: those are always enabled and cannot
be disabled.  When pcp_enabled disables one of the other high orders,
the pcplists are drained.

Signed-off-by: Kefeng Wang <wangkefeng.wang@xxxxxxxxxx>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 16 ++++++++
 5 files changed, 76 insertions(+)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 04eb45a2f940..3cb91336f81a 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -189,6 +189,17 @@ madvise
 never
 	should be self-explanatory.
 
+
+There's also a sysfs knob to control whether hugepages of a given high
+order (greater than PAGE_ALLOC_COSTLY_ORDER) are stored on PCP lists,
+which can reduce zone lock contention when high-order pages are
+allocated frequently.  Please note that the PCP behaviour of low-order
+and PMD-order pages cannot be changed; the other high orders can be
+enabled by writing 1 or disabled again by writing 0::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+	echo 1 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+
 By default kernel tries to use huge, PMD-mappable zero page on read page
 fault to anonymous mapping. It's possible to disable huge zero page by
 writing 0 or enable it back by writing 1::
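[Not part of the patch: a minimal userspace sketch of driving the knob
documented above from C rather than from the shell.  The hugepages-64kB
path is an assumption for illustration; substitute the mTHP size of
interest, and note the file only exists once this patch is applied.]

/* pcp_toggle.c - sketch: flip pcp_enabled for one mTHP size. */
#include <stdio.h>

int main(void)
{
	/* Assumed path; pick the hugepages-<size>kB directory you need. */
	const char *knob =
		"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled";
	FILE *f = fopen(knob, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("0\n", f);	/* "0" disables and drains; "1" re-enables */
	return fclose(f) ? 1 : 0;
}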
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 450c2cbcf04b..2ae1157abd6e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -365,6 +365,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init_cpuhp(void);
 int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b67294d5814f..86306becfd52 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -108,6 +108,7 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
 
 static inline bool hugepage_global_enabled(void)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9a1b57ef9c60..9b8a8aa36526 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -512,8 +512,49 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 static struct kobj_attribute thpsize_enabled_attr =
 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
 
+unsigned long huge_pcp_allow_orders __read_mostly;
+static ssize_t thpsize_pcp_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%d\n",
+			  !!test_bit(order, &huge_pcp_allow_orders));
+}
+
+static ssize_t thpsize_pcp_enabled_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	unsigned long value;
+	int ret;
+
+	if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+	if (value > 1)
+		return -EINVAL;
+
+	if (value) {
+		set_bit(order, &huge_pcp_allow_orders);
+	} else {
+		if (test_and_clear_bit(order, &huge_pcp_allow_orders))
+			drain_all_zone_pages();
+	}
+
+	return count;
+}
+
+static struct kobj_attribute thpsize_pcp_enabled_attr = __ATTR(pcp_enabled,
+	0644, thpsize_pcp_enabled_show, thpsize_pcp_enabled_store);
+
 static struct attribute *thpsize_attrs[] = {
 	&thpsize_enabled_attr.attr,
+	&thpsize_pcp_enabled_attr.attr,
 	NULL,
 };
 
@@ -624,6 +665,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 */
 	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
+	huge_pcp_allow_orders = BIT(PMD_ORDER);
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
@@ -658,6 +701,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 			err = PTR_ERR(thpsize);
 			goto remove_all;
 		}
+
+		if (order <= PAGE_ALLOC_COSTLY_ORDER)
+			huge_pcp_allow_orders |= BIT(order);
+
 		list_add(&thpsize->node, &thpsize_list);
 		order = next_order(&orders, order);
 	}
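[Not part of the patch: a self-contained userspace model of the order
bitmask policy that thpsize_pcp_enabled_store() implements above.
PAGE_ALLOC_COSTLY_ORDER == 3 and PMD_ORDER == 9 are typical
x86_64/4K-page values, assumed here purely for illustration.]

/* pcp_orders_model.c - models the huge_pcp_allow_orders bitmask. */
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER	3	/* typical value */
#define PMD_ORDER		9	/* x86_64 with 4K base pages */

/* PMD_ORDER is always allowed, matching hugepage_init_sysfs(). */
static unsigned long pcp_allow_orders = 1UL << PMD_ORDER;

/* Same rule as the store handler: orders at or below
 * PAGE_ALLOC_COSTLY_ORDER and PMD_ORDER itself cannot be toggled. */
static int set_pcp_enabled(int order, int enable)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
		return -1;			/* -EINVAL in the kernel */
	if (enable)
		pcp_allow_orders |= 1UL << order;
	else
		pcp_allow_orders &= ~(1UL << order);	/* kernel also drains */
	return 0;
}

int main(void)
{
	printf("enable order 4: %d\n", set_pcp_enabled(4, 1));	/* 0: ok */
	printf("enable order 2: %d\n", set_pcp_enabled(2, 1));	/* -1: fixed */
	printf("mask: %#lx\n", pcp_allow_orders);		/* 0x210 */
	return 0;
}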
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2248afc7b73a..25fd3fe30cb0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -537,6 +537,8 @@ static inline bool pcp_allowed_order(unsigned int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order == PCP_MAX_ORDER)
 		return true;
+	if (BIT(order) & huge_pcp_allow_orders)
+		return true;
 #endif
 	return false;
 }
@@ -6705,6 +6707,20 @@ void zone_pcp_reset(struct zone *zone)
 	}
 }
 
+void drain_all_zone_pages(void)
+{
+	struct zone *zone;
+
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+	__drain_all_pages(NULL, true);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+			zone->pageset_high_max, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be in a single zone, must not contain holes,
-- 
2.27.0
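[Not part of the patch: a userspace model of the clamp/drain/restore
sequence that drain_all_zone_pages() performs above.  struct zone and
the helpers below are illustrative stand-ins for the kernel's, with
made-up tunable values.]

/* drain_model.c - clamp pcp high/batch, drain, then restore. */
#include <stdio.h>

struct zone {
	const char *name;
	int high_min, high_max, batch;	/* saved pcp tunables */
	int pcp_count;			/* pages cached on pcplists */
};

static void set_high_and_batch(struct zone *z, int min, int max, int batch)
{
	printf("%s: high=[%d,%d] batch=%d\n", z->name, min, max, batch);
}

static void drain(struct zone *z)
{
	printf("%s: %d pages back to buddy\n", z->name, z->pcp_count);
	z->pcp_count = 0;
}

int main(void)
{
	struct zone zones[] = {
		{ "DMA32",  4, 128, 63, 17 },
		{ "Normal", 4, 512, 63, 42 },
	};
	int i, n = (int)(sizeof(zones) / sizeof(zones[0]));

	/* mutex_lock(&pcp_batch_high_lock) in the kernel */
	for (i = 0; i < n; i++)		/* stop per-cpu caching */
		set_high_and_batch(&zones[i], 0, 0, 1);
	for (i = 0; i < n; i++)		/* __drain_all_pages() */
		drain(&zones[i]);
	for (i = 0; i < n; i++)		/* restore saved tunables */
		set_high_and_batch(&zones[i], zones[i].high_min,
				   zones[i].high_max, zones[i].batch);
	/* mutex_unlock(&pcp_batch_high_lock) */
	return 0;
}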