In our container environment, we've observed that certain containers may accumulate more than 40GB of slabs, predominantly negative dentries. These negative dentries remain unreclaimed unless there is memory pressure. Even after the containers exit, these negative dentries persist. To manage disk storage efficiently, we employ an agent that identifies container images eligible for destruction once all instances of that image exit. However, during destruction, dealing with directories containing numerous negative dentries can significantly impact performance. To mitigate this issue, we aim to proactively reclaim these dentries using a user agent. Extending the memory.reclaim functionality to specifically target slabs aligns with our requirements. We propose adding a "type=" parameter to memory.reclaim to allow reclamation of pagecache pages only, slabs only, or both: - type=1: Reclaim pagecache pages only - type=2: Reclaim slabs only - type=3: Reclaim both pagecache pages and slabs For instance: echo "1M type=1" > /sys/fs/cgroup/test/memory.reclaim will perform the reclaim on the 'test' memcg to reclaim pagecache pages only. Please note that due to the deferred freeing of slabs, the amount of reclaimed slabs may be higher than 1M during this process. Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx> --- Documentation/admin-guide/cgroup-v2.rst | 12 ++++++++++++ include/linux/swap.h | 2 ++ mm/memcontrol.c | 22 +++++++++++++++++++++- mm/vmscan.c | 13 ++++++++++--- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0270517ade47..6807d0fa197d 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1322,6 +1322,18 @@ The following nested keys are defined. same semantics as vm.swappiness applied to memcg reclaim with all the existing limitations and potential future extensions. 
+ ==== ============================== + type Type of memory to reclaim with + ==== ============================== + + Specifying a memory type value instructs the kernel to perform + the reclaim with that memory type. The current supported + values are: + + 1 - Reclaim pagecache pages only + 2 - Reclaim slabs only + 3 - Reclaim both pagecache pages and slabs + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index 41e4b484bc34..27c432101032 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) +#define MEMCG_RECLAIM_PAGECACHE_ONLY (1 << 3) +#define MEMCG_RECLAIM_SLAB_ONLY (1 << 4) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4070ba84b508..3dfdbf5782c8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -54,6 +54,7 @@ #include <linux/seq_file.h> #include <linux/parser.h> #include <linux/vmpressure.h> +#include <linux/parser.h> #include <linux/memremap.h> #include <linux/mm_inline.h> #include <linux/swap_cgroup.h> @@ -6930,11 +6931,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, enum { MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_TYPE = 1, MEMORY_RECLAIM_NULL, }; static const match_table_t tokens = { { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_TYPE, "type=%d"}, { MEMORY_RECLAIM_NULL, NULL }, }; @@ -6944,7 +6947,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - int swappiness = -1; + int swappiness = -1, type = 0; unsigned int 
reclaim_options; char *old_buf, *start; substring_t args[MAX_OPT_ARGS]; @@ -6968,12 +6971,29 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) return -EINVAL; break; + case MEMORY_RECLAIM_TYPE: + if (match_int(&args[0], &type)) + return -EINVAL; + if (type > 3 || type <= 0) + return -EINVAL; + break; default: return -EINVAL; } } reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + switch (type) { + case 1: + reclaim_options |= MEMCG_RECLAIM_PAGECACHE_ONLY; + break; + case 2: + reclaim_options |= MEMCG_RECLAIM_SLAB_ONLY; + break; + default: + break; + } + while (nr_reclaimed < nr_to_reclaim) { /* Will converge on zero, but reclaim enforces a minimum */ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4b1a609755bb..53cea01a1742 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -141,6 +141,9 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; + unsigned int pagecache_only:1; + unsigned int slab_only:1; + /* Allocation order */ s8 order; @@ -5881,10 +5884,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_lruvec(lruvec, sc); + if (!sc->slab_only) + shrink_lruvec(lruvec, sc); - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + if (!sc->pagecache_only) + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); /* Record the group's reclaim efficiency */ if (!sc->proactive) @@ -6522,6 +6527,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + .pagecache_only = !!(reclaim_options & MEMCG_RECLAIM_PAGECACHE_ONLY), + .slab_only = !!(reclaim_options & MEMCG_RECLAIM_SLAB_ONLY), }; /* * 
Traverse the ZONELIST_FALLBACK zonelist of the current node to put -- 2.39.1