[RFC PATCH] mm: Add reclaim type to memory.reclaim

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



In our container environment, we've observed that certain containers may
accumulate more than 40GB of slabs, predominantly negative dentries. These
negative dentries remain unreclaimed unless there is memory pressure. Even
after the containers exit, these negative dentries persist. To manage disk
storage efficiently, we employ an agent that identifies container images
eligible for destruction once all instances of that image exit.

However, during destruction, dealing with directories containing numerous
negative dentries can significantly impact performance. To mitigate this
issue, we aim to proactively reclaim these dentries using a user agent.
Extending the memory.reclaim functionality to specifically target slabs
aligns with our requirements.

We propose adding a "type=" parameter to memory.reclaim to allow
reclamation of pagecache pages only, slabs only, or both:

  - type=1: Reclaim pagecache pages only
  - type=2: Reclaim slabs only
  - type=3: Reclaim both pagecache pages and slabs

For instance:

  echo "1M type=1" > /sys/fs/cgroup/test/memory.reclaim

will perform the reclaim on the 'test' memcg to reclaim pagecache pages
only. Please note that due to the deferred freeing of slabs, the amount of
reclaimed slabs may be higher than 1M during this process.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
 Documentation/admin-guide/cgroup-v2.rst | 12 ++++++++++++
 include/linux/swap.h                    |  2 ++
 mm/memcontrol.c                         | 22 +++++++++++++++++++++-
 mm/vmscan.c                             | 13 ++++++++++---
 4 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0270517ade47..6807d0fa197d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1322,6 +1322,18 @@ The following nested keys are defined.
 	same semantics as vm.swappiness applied to memcg reclaim with
 	all the existing limitations and potential future extensions.
 
+	  ====                  ==============================
+	  type                  Type of memory to reclaim with
+	  ====                  ==============================
+
+        Specifying a memory type value instructs the kernel to reclaim
+        only that type of memory. The currently supported
+        values are:
+
+          1 - Reclaim pagecache pages only
+          2 - Reclaim slabs only
+          3 - Reclaim both pagecache pages and slabs
+
   memory.peak
 	A read-only single value file which exists on non-root
 	cgroups.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 41e4b484bc34..27c432101032 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
 #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+#define MEMCG_RECLAIM_PAGECACHE_ONLY (1 << 3)
+#define MEMCG_RECLAIM_SLAB_ONLY (1 << 4)
 #define MIN_SWAPPINESS 0
 #define MAX_SWAPPINESS 200
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4070ba84b508..3dfdbf5782c8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -54,6 +54,7 @@
 #include <linux/seq_file.h>
 #include <linux/parser.h>
 #include <linux/vmpressure.h>
+#include <linux/parser.h>
 #include <linux/memremap.h>
 #include <linux/mm_inline.h>
 #include <linux/swap_cgroup.h>
@@ -6930,11 +6931,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 
 enum {
 	MEMORY_RECLAIM_SWAPPINESS = 0,
+	MEMORY_RECLAIM_TYPE = 1,
 	MEMORY_RECLAIM_NULL,
 };
 
 static const match_table_t tokens = {
 	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+	{ MEMORY_RECLAIM_TYPE, "type=%d"},
 	{ MEMORY_RECLAIM_NULL, NULL },
 };
 
@@ -6944,7 +6947,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
-	int swappiness = -1;
+	int swappiness = -1, type = 0;
 	unsigned int reclaim_options;
 	char *old_buf, *start;
 	substring_t args[MAX_OPT_ARGS];
@@ -6968,12 +6971,29 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
 				return -EINVAL;
 			break;
+		case MEMORY_RECLAIM_TYPE:
+			if (match_int(&args[0], &type))
+				return -EINVAL;
+			if (type > 3 || type <= 0)
+				return -EINVAL;
+			break;
 		default:
 			return -EINVAL;
 		}
 	}
 
 	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+	switch (type) {
+	case 1:
+		reclaim_options |= MEMCG_RECLAIM_PAGECACHE_ONLY;
+		break;
+	case 2:
+		reclaim_options |= MEMCG_RECLAIM_SLAB_ONLY;
+		break;
+	default:
+		break;
+	}
+
 	while (nr_reclaimed < nr_to_reclaim) {
 		/* Will converge on zero, but reclaim enforces a minimum */
 		unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b1a609755bb..53cea01a1742 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -141,6 +141,9 @@ struct scan_control {
 	/* Always discard instead of demoting to lower tier memory */
 	unsigned int no_demotion:1;
 
+	unsigned int pagecache_only:1;
+	unsigned int slab_only:1;
+
 	/* Allocation order */
 	s8 order;
 
@@ -5881,10 +5884,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		reclaimed = sc->nr_reclaimed;
 		scanned = sc->nr_scanned;
 
-		shrink_lruvec(lruvec, sc);
+		if (!sc->slab_only)
+			shrink_lruvec(lruvec, sc);
 
-		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
-			    sc->priority);
+		if (!sc->pagecache_only)
+			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+				    sc->priority);
 
 		/* Record the group's reclaim efficiency */
 		if (!sc->proactive)
@@ -6522,6 +6527,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+		.pagecache_only = !!(reclaim_options & MEMCG_RECLAIM_PAGECACHE_ONLY),
+		.slab_only = !!(reclaim_options & MEMCG_RECLAIM_SLAB_ONLY),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
-- 
2.39.1





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux