[PATCH rfc 2/3] mm: add control to allow specified high-order pages stored on PCP list

Storing high-order pages on the PCP list is not always a win, and may even
hurt some workloads, so it is disabled by default for high orders other
than PMD_ORDER. Since there are already per-supported-THP-size interfaces
to configure mTHP behaviour, add a new pcp_enabled control under those
interfaces to let the user enable or disable storing the specified
high-order pages on the PCP list. It cannot change the existing behaviour
for order == PMD_ORDER or order <= PAGE_ALLOC_COSTLY_ORDER: those orders
are always enabled and cannot be disabled. When pcp_enabled disables one
of the other high orders, the pcplists are drained.

Signed-off-by: Kefeng Wang <wangkefeng.wang@xxxxxxxxxx>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 16 ++++++++
 5 files changed, 76 insertions(+)
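
For reference, a quick usage sketch of the new knob (the 1024kB size below
is only an example; the available hugepages-<size>kB directories depend on
the base page size and the orders the architecture supports, and writes for
order <= PAGE_ALLOC_COSTLY_ORDER or order == PMD_ORDER return -EINVAL):

    # query current state (1 = stored on PCP lists, 0 = not)
    cat /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/pcp_enabled
    # allow this order to be stored on PCP lists
    echo 1 > /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/pcp_enabled
    # disallow it again; the pcplists are drained on this transition
    echo 0 > /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/pcp_enabled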

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 04eb45a2f940..3cb91336f81a 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -189,6 +189,17 @@ madvise
 never
 	should be self-explanatory.
 
+
+There's also a sysfs knob to control whether hugepages of a high order
+(greater than PAGE_ALLOC_COSTLY_ORDER) are stored on PCP lists, which can
+reduce zone lock contention when high-order pages are allocated frequently.
+Note that the PCP behaviour of low-order and PMD-order pages cannot be
+changed; for the other high orders it is possible to enable storing pages
+on PCP lists by writing 1, or disable it again by writing 0::
+
+        echo 0 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+        echo 1 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+
 By default kernel tries to use huge, PMD-mappable zero page on read
 page fault to anonymous mapping. It's possible to disable huge zero
 page by writing 0 or enable it back by writing 1::
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 450c2cbcf04b..2ae1157abd6e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -365,6 +365,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init_cpuhp(void);
 int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b67294d5814f..86306becfd52 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -108,6 +108,7 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
 
 static inline bool hugepage_global_enabled(void)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9a1b57ef9c60..9b8a8aa36526 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -512,8 +512,49 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 static struct kobj_attribute thpsize_enabled_attr =
 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
 
+unsigned long huge_pcp_allow_orders __read_mostly;
+static ssize_t thpsize_pcp_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%d\n",
+			  !!test_bit(order, &huge_pcp_allow_orders));
+}
+
+static ssize_t thpsize_pcp_enabled_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	unsigned long value;
+	int ret;
+
+	if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+	if (value > 1)
+		return -EINVAL;
+
+	if (value) {
+		set_bit(order, &huge_pcp_allow_orders);
+	} else {
+		if (test_and_clear_bit(order, &huge_pcp_allow_orders))
+			drain_all_zone_pages();
+	}
+
+	return count;
+}
+
+static struct kobj_attribute thpsize_pcp_enabled_attr = __ATTR(pcp_enabled,
+		0644, thpsize_pcp_enabled_show, thpsize_pcp_enabled_store);
+
 static struct attribute *thpsize_attrs[] = {
 	&thpsize_enabled_attr.attr,
+	&thpsize_pcp_enabled_attr.attr,
 	NULL,
 };
 
@@ -624,6 +665,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 */
 	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
+	huge_pcp_allow_orders = BIT(PMD_ORDER);
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
@@ -658,6 +701,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 			err = PTR_ERR(thpsize);
 			goto remove_all;
 		}
+
+		if (order <= PAGE_ALLOC_COSTLY_ORDER)
+			huge_pcp_allow_orders |= BIT(order);
+
 		list_add(&thpsize->node, &thpsize_list);
 		order = next_order(&orders, order);
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2248afc7b73a..25fd3fe30cb0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -537,6 +537,8 @@ static inline bool pcp_allowed_order(unsigned int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order == PCP_MAX_ORDER)
 		return true;
+	if (BIT(order) & huge_pcp_allow_orders)
+		return true;
 #endif
 	return false;
 }
@@ -6705,6 +6707,20 @@ void zone_pcp_reset(struct zone *zone)
 	}
 }
 
+void drain_all_zone_pages(void)
+{
+	struct zone *zone;
+
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+	__drain_all_pages(NULL, true);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+				zone->pageset_high_max, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be in a single zone, must not contain holes,
-- 
2.27.0