[RFC] mm: Proactive compaction

Nitin Gupta <nigupta@xxxxxxxxxx> · Fri, 16 Aug 2019 14:43:30 -0700

For some applications we need to allocate almost all memory as
hugepages. However, on a running system, higher order allocations can
fail if the memory is fragmented. The Linux kernel currently does
on-demand compaction as we request more hugepages, but this style of
compaction incurs very high latency. Experiments with one-time full
memory compaction (followed by hugepage allocations) show that the kernel
is able to restore a highly fragmented memory state to a fairly
compacted memory state within <1 sec for a 32G system. Such data
suggests that a more proactive compaction can help us allocate a large
fraction of memory as hugepages while keeping allocation latencies low.

For a more proactive compaction, the approach taken here is to define
per page-order external fragmentation thresholds and let kcompactd
threads act on these thresholds.

The low and high thresholds are defined per page-order and exposed
through sysfs:

  /sys/kernel/mm/compaction/order-[1..MAX_ORDER]/extfrag_{low,high}

The per-node kcompactd thread is woken up every few seconds to check if
any zone on its node has extfrag above the extfrag_high threshold for
any order, in which case the thread starts compaction in the background
till all zones are below the extfrag_low level for all orders. By default
both these thresholds are set to 100 for all orders, which essentially
disables kcompactd.

To avoid wasting CPU cycles when compaction cannot help, such as when
memory is full, we check both extfrag > extfrag_high and
compaction_suitable(zone). This allows the kcompactd thread to stay
inactive in that case even if extfrag exceeds the extfrag_high threshold.

This patch is largely based on ideas from Michal Hocko posted here:
https://lore.kernel.org/linux-mm/20161230131412.GI13301@xxxxxxxxxxxxxx/

Testing done (on x86):
 - Set /sys/kernel/mm/compaction/order-9/extfrag_{low,high} = {25, 30}
 respectively.
 - Use a test program to fragment memory: the program allocates all memory
 and then for each 2M aligned section, frees 3/4 of base pages using
 munmap.
 - kcompactd0 detects fragmentation for order-9 > extfrag_high and starts
 compaction till extfrag < extfrag_low for order-9.

The patch has plenty of rough edges but posting it early to see if I'm
going in the right direction and to get some early feedback.

Signed-off-by: Nitin Gupta <nigupta@xxxxxxxxxx>
---
 include/linux/compaction.h |  12 ++
 mm/compaction.c            | 250 ++++++++++++++++++++++++++++++-------
 mm/vmstat.c                |  12 ++
 3 files changed, 228 insertions(+), 46 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 9569e7c786d3..26bfedbbc64b 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -60,6 +60,17 @@ enum compact_result {
 
 struct alloc_context; /* in mm/internal.h */
 
+/* Buffer size for the sysfs directory name, formatted as "order-%d" */
+#define COMPACTION_ORDER_STATE_NAME_LEN 16
+/* Per-order compaction state */
+struct compaction_order_state {
+	unsigned int order;		/* page order this entry describes */
+	unsigned int extfrag_low;	/* lower threshold, 0..100 via sysfs */
+	unsigned int extfrag_high;	/* upper threshold, 0..100 via sysfs */
+	unsigned int extfrag_curr;	/* NOTE(review): not used yet */
+	char name[COMPACTION_ORDER_STATE_NAME_LEN]; /* sysfs dir name */
+};
+
 /*
  * Number of free order-0 pages that should be available above given watermark
  * to make sure compaction has reasonable chance of not running out of free
@@ -90,6 +101,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 extern int sysctl_extfrag_threshold;
 extern int sysctl_compact_unevictable_allowed;
 
+extern int extfrag_for_order(struct zone *zone, unsigned int order);
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		unsigned int order, unsigned int alloc_flags,
diff --git a/mm/compaction.c b/mm/compaction.c
index 952dc2fb24e5..21866b1ad249 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -25,6 +25,10 @@
 #include <linux/psi.h>
 #include "internal.h"
 
+#ifdef CONFIG_COMPACTION	/* thresholds, indexed by page order */
+struct compaction_order_state compaction_order_states[MAX_ORDER+1];
+#endif /* CONFIG_COMPACTION */
+
 #ifdef CONFIG_COMPACTION
 static inline void count_compact_event(enum vm_event_item item)
 {
@@ -1846,6 +1850,49 @@ static inline bool is_via_compact_memory(int order)
 	return order == -1;
 }
 
+/* Lowest order (1..MAX_ORDER) with extfrag above extfrag_high, else 0. */
+static int extfrag_wmark_high(struct zone *zone)
+{
+	int o;
+
+	for (o = 1; o <= MAX_ORDER; o++) {
+		int limit = compaction_order_states[o].extfrag_high;
+
+		if (extfrag_for_order(zone, o) > limit)
+			return o;
+	}
+	return 0;
+}
+
+/* True if one of @pgdat's own zones is above extfrag_high and compactable. */
+static bool node_should_compact(pg_data_t *pgdat)
+{
+	struct zone *zone = pgdat->node_zones;
+
+	for (; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+		int order = populated_zone(zone) ? extfrag_wmark_high(zone) : 0;
+
+		if (order && compaction_suitable(zone, order,
+				0, zone_idx(zone)) == COMPACT_CONTINUE)
+			return true;
+	}
+	return false;
+}
+
+/* Lowest order (1..MAX_ORDER) with extfrag above extfrag_low, else 0. */
+static int extfrag_wmark_low(struct zone *zone)
+{
+	int o;
+
+	for (o = 1; o <= MAX_ORDER; o++) {
+		int limit = compaction_order_states[o].extfrag_low;
+
+		if (extfrag_for_order(zone, o) > limit)
+			return o;
+	}
+	return 0;
+}
+
 static enum compact_result __compact_finished(struct compact_control *cc)
 {
 	unsigned int order;
@@ -1872,7 +1919,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 			return COMPACT_PARTIAL_SKIPPED;
 	}
 
-	if (is_via_compact_memory(cc->order))
+	if (extfrag_wmark_low(cc->zone))
 		return COMPACT_CONTINUE;
 
 	/*
@@ -1962,18 +2009,6 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 {
 	unsigned long watermark;
 
-	if (is_via_compact_memory(order))
-		return COMPACT_CONTINUE;
-
-	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
-	/*
-	 * If watermarks for high-order allocation are already met, there
-	 * should be no need for compaction at all.
-	 */
-	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
-								alloc_flags))
-		return COMPACT_SUCCESS;
-
 	/*
 	 * Watermarks for order-0 must be met for compaction to be able to
 	 * isolate free pages for migration targets. This means that the
@@ -2003,31 +2038,9 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 					int classzone_idx)
 {
 	enum compact_result ret;
-	int fragindex;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
 				    zone_page_state(zone, NR_FREE_PAGES));
-	/*
-	 * fragmentation index determines if allocation failures are due to
-	 * low memory or external fragmentation
-	 *
-	 * index of -1000 would imply allocations might succeed depending on
-	 * watermarks, but we already failed the high-order watermark check
-	 * index towards 0 implies failure is due to lack of memory
-	 * index towards 1000 implies failure is due to fragmentation
-	 *
-	 * Only compact if a failure would be due to fragmentation. Also
-	 * ignore fragindex for non-costly orders where the alternative to
-	 * a successful reclaim/compaction is OOM. Fragindex and the
-	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
-	 * excessive compaction for costly orders, but it should not be at the
-	 * expense of system stability.
-	 */
-	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
-		fragindex = fragmentation_index(zone, order);
-		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-			ret = COMPACT_NOT_SUITABLE_ZONE;
-	}
 
 	trace_mm_compaction_suitable(zone, order, ret);
 	if (ret == COMPACT_NOT_SUITABLE_ZONE)
@@ -2416,7 +2429,6 @@ static void compact_node(int nid)
 		.gfp_mask = GFP_KERNEL,
 	};
 
-
 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 
 		zone = &pgdat->node_zones[zoneid];
@@ -2493,9 +2505,149 @@ void compaction_unregister_node(struct node *node)
 }
 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
 
+#ifdef CONFIG_SYSFS
+
+#define COMPACTION_ATTR_RO(_name) /* read-only attribute _name##_attr */ \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define COMPACTION_ATTR(_name) /* 0644 attr via _name##_show/_store */ \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct kobject *compaction_kobj;	/* "compaction" under mm_kobj */
+static struct kobject *compaction_order_kobjs[MAX_ORDER + 1]; /* by order */
+
+/* Map an order-N sysfs kobject back to its compaction_order_state. */
+static struct compaction_order_state *kobj_to_compaction_order_state(
+						struct kobject *kobj)
+{
+	int order = 1;
+
+	while (order <= MAX_ORDER && compaction_order_kobjs[order] != kobj)
+		order++;
+
+	/* Ran off the end: @kobj is not one of the per-order kobjects */
+	return order <= MAX_ORDER ? &compaction_order_states[order] : NULL;
+}
+
+/* Parse a 0..100 value from @buf into @kobj's extfrag_low or extfrag_high. */
+static ssize_t extfrag_store_common(bool is_low, struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct compaction_order_state *c = kobj_to_compaction_order_state(kobj);
+	unsigned long input;
+	int err = kstrtoul(buf, 10, &input);
+
+	if (err)
+		return err;
+	if (!c || input > 100)	/* unknown kobject or not a percentage */
+		return -EINVAL;
+
+	if (is_low)
+		c->extfrag_low = input;
+	else
+		c->extfrag_high = input;
+
+	return count;
+}
+
+/* sysfs show for order-N/extfrag_low: print the threshold and a newline. */
+static ssize_t extfrag_low_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n",
+			kobj_to_compaction_order_state(kobj)->extfrag_low);
+}
+
+static ssize_t extfrag_low_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	return extfrag_store_common(true, kobj, attr, buf, count); /* is_low */
+}
+COMPACTION_ATTR(extfrag_low);	/* defines extfrag_low_attr, mode 0644 */
+
+/* sysfs show for order-N/extfrag_high: print the threshold and a newline. */
+static ssize_t extfrag_high_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n",
+			kobj_to_compaction_order_state(kobj)->extfrag_high);
+}
+
+static ssize_t extfrag_high_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	return extfrag_store_common(false, kobj, attr, buf, count); /* high */
+}
+COMPACTION_ATTR(extfrag_high);	/* defines extfrag_high_attr, mode 0644 */
+
+static struct attribute *compaction_order_attrs[] = {
+	&extfrag_low_attr.attr,		/* order-N/extfrag_low */
+	&extfrag_high_attr.attr,	/* order-N/extfrag_high */
+	NULL,				/* terminates the list */
+};
+
+static const struct attribute_group compaction_order_attr_group = {
+	.attrs = compaction_order_attrs, /* added to every order-N kobject */
+};
+
+/* Create @c's order-N sysfs dir; the kobj slot is left NULL on failure. */
+static int compaction_sysfs_add_order(struct compaction_order_state *c,
+	struct kobject *parent, struct kobject **compaction_order_kobjs,
+	const struct attribute_group *compaction_order_attr_group)
+{
+	struct kobject *kobj = kobject_create_and_add(c->name, parent);
+	int retval = -ENOMEM;
+
+	if (kobj)
+		retval = sysfs_create_group(kobj, compaction_order_attr_group);
+	if (kobj && retval) {
+		kobject_put(kobj);
+		kobj = NULL;	/* do not publish a released kobject */
+	}
+	compaction_order_kobjs[c->order] = kobj;
+
+	return retval;
+}
+
+/* Create the "compaction" sysfs dir and one order-N dir per page order. */
+static void __init compaction_sysfs_init(void)
+{
+	struct compaction_order_state *c;
+	int i;
+
+	compaction_kobj = kobject_create_and_add("compaction", mm_kobj);
+	if (!compaction_kobj)
+		return;
+
+	for (i = 1; i <= MAX_ORDER; i++) {
+		c = &compaction_order_states[i];
+		if (compaction_sysfs_add_order(c, compaction_kobj,
+					       compaction_order_kobjs,
+					       &compaction_order_attr_group))
+			pr_err("compaction: Unable to add state %s\n", c->name);
+	}
+}
+
+/* Set every order to 100/100 (kcompactd idle). NOTE(review): built only
+ * under CONFIG_SYSFS yet called unconditionally from kcompactd_init(). */
+static void __init compaction_init_order_states(void)
+{
+	struct compaction_order_state *c = compaction_order_states;
+	int i;
+
+	for (i = 0; i <= MAX_ORDER; i++, c++) {
+		snprintf(c->name, sizeof(c->name), "order-%d", i);
+		c->order = i;
+		c->extfrag_high = 100;
+		c->extfrag_low = 100;
+	}
+}
+#endif
+
 static inline bool kcompactd_work_requested(pg_data_t *pgdat)
 {
-	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+	return kthread_should_stop() || node_should_compact(pgdat);
 }
 
 static bool kcompactd_node_suitable(pg_data_t *pgdat)
@@ -2527,15 +2679,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 	int zoneid;
 	struct zone *zone;
 	struct compact_control cc = {
-		.order = pgdat->kcompactd_max_order,
-		.search_order = pgdat->kcompactd_max_order,
+		.order = -1,
 		.total_migrate_scanned = 0,
 		.total_free_scanned = 0,
-		.classzone_idx = pgdat->kcompactd_classzone_idx,
-		.mode = MIGRATE_SYNC_LIGHT,
-		.ignore_skip_hint = false,
+		.mode = MIGRATE_SYNC,
+		.ignore_skip_hint = true,
+		.whole_zone = false,
 		.gfp_mask = GFP_KERNEL,
+		.classzone_idx = MAX_NR_ZONES - 1,
 	};
+
 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
 							cc.classzone_idx);
 	count_compact_event(KCOMPACTD_WAKE);
@@ -2565,7 +2718,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		if (kthread_should_stop())
 			return;
 		status = compact_zone(&cc, NULL);
-
 		if (status == COMPACT_SUCCESS) {
 			compaction_defer_reset(zone, cc.order, false);
 		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
@@ -2650,11 +2802,14 @@ static int kcompactd(void *p)
 	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
 
 	while (!kthread_should_stop()) {
-		unsigned long pflags;
+		unsigned long ret, pflags;
 
 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
-		wait_event_freezable(pgdat->kcompactd_wait,
-				kcompactd_work_requested(pgdat));
+		ret = wait_event_freezable_timeout(pgdat->kcompactd_wait,
+				kcompactd_work_requested(pgdat),
+				msecs_to_jiffies(5000));
+		if (!ret)
+			continue;
 
 		psi_memstall_enter(&pflags);
 		kcompactd_do_work(pgdat);
@@ -2735,6 +2890,9 @@ static int __init kcompactd_init(void)
 		return ret;
 	}
 
+	compaction_init_order_states();
+	compaction_sysfs_init();
+
 	for_each_node_state(nid, N_MEMORY)
 		kcompactd_run(nid);
 	return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fd7e16ca6996..e9090a5595d1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1074,6 +1074,18 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
 	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
 }
 
+/* External fragmentation of @zone at @order, expressed as a percentage. */
+int extfrag_for_order(struct zone *zone, unsigned int order)
+{
+	struct contig_page_info info;
+
+	fill_contig_page_info(zone, order, &info);
+	if (!info.free_pages)
+		return 0;	/* avoid dividing by zero below */
+	return 100 * (info.free_pages - (info.free_blocks_suitable << order))
+		/ info.free_pages;
+}
+
 /* Same as __fragmentation index but allocs contig_page_info on stack */
 int fragmentation_index(struct zone *zone, unsigned int order)
 {
-- 
2.20.1