pmc - per memcg cache

This patch adds a feature, pmc (per-memcg cache), to every memcg except
the root memcg. Users can enable pmc in a target memcg so that all tasks
in this memcg share a cache pool; order-0 page allocation and freeing is
served from this cache pool with higher priority than the public
per-CPU pagesets.

Signed-off-by: Huan Yang <link@xxxxxxxx>
---
 include/linux/memcontrol.h |  41 +++++++
 include/linux/mmzone.h     |  25 ++++
 include/linux/swap.h       |   1 +
 mm/memcontrol.c            | 237 +++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c            | 146 +++++++++++++++++++++++
 5 files changed, 450 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f332b4ae84c..5ec4c64bc515 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -130,6 +130,7 @@ struct mem_cgroup_per_node {
 	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
+	struct mem_cgroup_per_node_cache *cachep;
 };
 
 struct mem_cgroup_threshold {
@@ -336,6 +337,8 @@ struct mem_cgroup {
 	struct lru_gen_mm_list mm_list;
 #endif
 
+	bool cache_enabled;
+
 	struct mem_cgroup_per_node *nodeinfo[];
 };
 
@@ -557,6 +560,8 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
 	return memcg;
 }
 
+DECLARE_STATIC_KEY_TRUE(pmc_key);
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
@@ -1185,6 +1190,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+static inline bool pmc_disabled(void)
+{
+	return static_branch_likely(&pmc_key);
+}
+
+static inline bool mem_cgroup_cache_disabled(struct mem_cgroup *memcg)
+{
+	return !READ_ONCE(memcg->cache_enabled);
+}
+
+static inline struct mem_cgroup_per_node_cache *
+mem_cgroup_get_node_cachep(struct mem_cgroup *memcg, int nid)
+{
+	struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+
+	return nodeinfo->cachep;
+}
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1648,6 +1672,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
 	return 0;
 }
+
+static inline bool pmc_disabled(void)
+{
+	return true;
+}
+
+static inline bool mem_cgroup_cache_disabled(struct mem_cgroup *memcg)
+{
+	return true;
+}
+
+static inline struct mem_cgroup_per_node_cache *
+mem_cgroup_get_node_cachep(struct mem_cgroup *memcg, int nid)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG */
 
 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..773b89e214c9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -603,6 +603,31 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
 
 #endif /* CONFIG_LRU_GEN */
 
+struct mem_cgroup_zone_cache {
+	/* cached pages; currently only order-0 pages are held */
+	struct list_head pages;
+	spinlock_t pages_lock;
+	atomic_t nr_pages;
+	atomic_t nr_alloced;
+};
+
+struct mem_cgroup_per_node_cache {
+	/* per-zone caches */
+	struct mem_cgroup_zone_cache zone_cachep[MAX_NR_ZONES];
+	struct mem_cgroup *memcg;
+
+	/* max number of pages to hold, in pages, default 100MB */
+#define DEFAULT_PMC_HOLD_LIMIT ((100 << 20) >> PAGE_SHIFT)
+	unsigned int hold_limit;
+
+#define DEFAULT_PMC_GAP_WATERMARK ((50 << 20) >> PAGE_SHIFT)
+	/*
+	 * Pages are cached only while the zone's free pages stay above
+	 * the high watermark plus this gap, in pages, default 50MB.
+	 */
+	unsigned int allow_watermark;
+};
+
 struct lruvec {
 	struct list_head lists[NR_LRU_LISTS];
 	/* per lruvec lru_lock for memcg */
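A standalone sketch (not part of the patch) of what the two defaults above
work out to, assuming 4KiB pages (PAGE_SHIFT == 12); the macro names are
taken from the patch, everything else is illustrative:

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* assumption: 4KiB pages */
	#define DEFAULT_PMC_HOLD_LIMIT ((100 << 20) >> PAGE_SHIFT)
	#define DEFAULT_PMC_GAP_WATERMARK ((50 << 20) >> PAGE_SHIFT)

	int main(void)
	{
		/* hold_limit: cap on pages one node cache may hold */
		printf("hold_limit      = %d pages (%d MB)\n",
		       DEFAULT_PMC_HOLD_LIMIT,
		       (DEFAULT_PMC_HOLD_LIMIT << PAGE_SHIFT) >> 20);
		/* allow_watermark: gap required above the zone high watermark */
		printf("allow_watermark = %d pages (%d MB)\n",
		       DEFAULT_PMC_GAP_WATERMARK,
		       (DEFAULT_PMC_GAP_WATERMARK << PAGE_SHIFT) >> 20);
		return 0;
	}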
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 11c53692f65f..d7b5e0a8317c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -420,6 +420,7 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 long remove_mapping(struct address_space *mapping, struct folio *folio);
+extern int mem_cgroup_release_cache(struct mem_cgroup_per_node_cache *fc);
 
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b3c3394a2ba..404fcb96bf68 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,15 @@ static bool cgroup_memory_nokmem __ro_after_init;
 /* BPF memory accounting disabled? */
 static bool cgroup_memory_nobpf __ro_after_init;
 
+/*
+ * Number of memcgs that have the cache enabled. While it is zero, the
+ * static branch stays enabled so that no task's alloc/free path enters
+ * the PMC code. Once any memcg holds a cache, the static branch is
+ * disabled.
+ */
+static atomic_t pmc_nr_enabled;
+DEFINE_STATIC_KEY_TRUE(pmc_key);
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
@@ -5738,6 +5747,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 	lru_gen_release_memcg(memcg);
 }
 
+static int __disable_mem_cgroup_cache(struct mem_cgroup *memcg);
+
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5762,6 +5773,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	cancel_work_sync(&memcg->high_work);
 	mem_cgroup_remove_from_trees(memcg);
 	free_shrinker_info(memcg);
+	if (READ_ONCE(memcg->cache_enabled))
+		__disable_mem_cgroup_cache(memcg);
 	mem_cgroup_free(memcg);
 }
 
@@ -7088,6 +7101,223 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }
 
+static int __enable_mem_cgroup_cache(struct mem_cgroup *memcg)
+{
+	int nid, idx;
+
+	if (!mem_cgroup_cache_disabled(memcg))
+		return -EINVAL;
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p = kvzalloc_node(
+			sizeof(struct mem_cgroup_per_node_cache),
+			GFP_KERNEL, nid);
+
+		if (unlikely(!p))
+			goto fail;
+
+		nodeinfo->cachep = p;
+	}
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		pg_data_t *pgdat = NODE_DATA(nid);
+		struct mem_cgroup_per_node_cache *p = nodeinfo->cachep;
+
+		for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+			struct zone *z = &pgdat->node_zones[idx];
+			struct mem_cgroup_zone_cache *zc;
+
+			if (!populated_zone(z))
+				continue;
+
+			zc = &p->zone_cachep[idx];
+
+			INIT_LIST_HEAD(&zc->pages);
+			spin_lock_init(&zc->pages_lock);
+		}
+
+		p->memcg = memcg;
+		p->hold_limit = DEFAULT_PMC_HOLD_LIMIT;
+		p->allow_watermark = DEFAULT_PMC_GAP_WATERMARK;
+	}
+
+	if (static_branch_likely(&pmc_key))
+		static_branch_disable(&pmc_key);
+
+	/* online */
+	smp_wmb();
+	WRITE_ONCE(memcg->cache_enabled, true);
+	atomic_inc(&pmc_nr_enabled);
+
+	return 0;
+
+fail:
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+
+		if (nodeinfo->cachep) {
+			kvfree(nodeinfo->cachep);
+			nodeinfo->cachep = NULL;
+		}
+	}
+
+	return -ENOMEM;
+}
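An aside for review (not part of the patch): the enable path above and the
disable path below form a refcount-driven global switch, one count per
cache-enabled memcg, with the pmc_key static branch flipped when the first
user arrives and restored when the last one leaves. A minimal user-space
model of that pattern, all names illustrative:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int nr_enabled;		/* models pmc_nr_enabled */
	static atomic_bool pmc_off = true;	/* models pmc_key (true: PMC path off) */

	static void cache_enable(void)
	{
		if (atomic_fetch_add(&nr_enabled, 1) == 0)
			atomic_store(&pmc_off, false);	/* first user flips the switch */
	}

	static void cache_disable(void)
	{
		if (atomic_fetch_sub(&nr_enabled, 1) == 1)
			atomic_store(&pmc_off, true);	/* last user restores it */
	}

	int main(void)
	{
		cache_enable();
		printf("PMC path: %s\n", atomic_load(&pmc_off) ? "off" : "on");
		cache_disable();
		printf("PMC path: %s\n", atomic_load(&pmc_off) ? "off" : "on");
		return 0;
	}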
+
+static int __disable_mem_cgroup_cache(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	if (unlikely(mem_cgroup_cache_disabled(memcg)))
+		return -EINVAL;
+
+	/* offline */
+	WRITE_ONCE(memcg->cache_enabled, false);
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p;
+
+		p = nodeinfo->cachep;
+
+		mem_cgroup_release_cache(p);
+
+		nodeinfo->cachep = NULL;
+		kvfree(p);
+	}
+
+	if (atomic_dec_and_test(&pmc_nr_enabled))
+		static_branch_enable(&pmc_key);
+
+	return 0;
+}
+
+static int mem_cgroup_cache_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg;
+	int nid;
+
+	if (static_branch_likely(&pmc_key))
+		return -EINVAL;
+
+	memcg = mem_cgroup_from_seq(m);
+	if (!READ_ONCE(memcg->cache_enabled))
+		return -EINVAL;
+
+	seq_printf(m, "%4s %16s %16s\n", "NODE", "WATERMARK", "HOLD_LIMIT");
+	for_each_online_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p;
+
+		p = nodeinfo->cachep;
+		if (!p)
+			continue;
+
+		seq_printf(m, "%4d %14uKB %14uKB\n", nid,
+			   (READ_ONCE(p->allow_watermark) << (PAGE_SHIFT - 10)),
+			   (READ_ONCE(p->hold_limit) << (PAGE_SHIFT - 10)));
+	}
+
+	seq_puts(m, "===========\n");
+	seq_printf(m, "%4s %16s %16s %16s\n", "NODE", "ZONE", "CACHE", "HIT");
+
+	for_each_online_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p;
+		pg_data_t *pgdat = NODE_DATA(nid);
+		int idx;
+
+		p = nodeinfo->cachep;
+		if (!p)
+			continue;
+
+		for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+			struct mem_cgroup_zone_cache *zc;
+			struct zone *z = &pgdat->node_zones[idx];
+
+			if (!populated_zone(z))
+				continue;
+
+			zc = &p->zone_cachep[idx];
+			seq_printf(m, "%4d %16s %14dKB %14dKB\n", nid, z->name,
+				   (atomic_read(&zc->nr_pages)
+				    << (PAGE_SHIFT - 10)),
+				   (atomic_read(&zc->nr_alloced)
+				    << (PAGE_SHIFT - 10)));
+		}
+	}
+
+	return 0;
+}
+
+enum {
+	OPT_CTRL_ENABLE,
+	OPT_CTRL_ERR,
+	OPT_CTRL_NR = OPT_CTRL_ERR,
+};
+
+static const match_table_t ctrl_tokens = {
+	{ OPT_CTRL_ENABLE, "enable=%s" },
+	{ OPT_CTRL_ERR, NULL }
+};
+
+/*
+ * Control the target memcg's cache, including enabling/disabling it and
+ * setting its keys. The cache is enabled or disabled with
+ * `echo enable=[y|n] > memory.cache` in the target memcg.
+ */
+static ssize_t mem_cgroup_cache_control(struct kernfs_open_file *of, char *buf,
+					size_t nbytes, loff_t off)
+{
+	bool enable;
+	bool opt_enable_set = false;
+	int err = 0;
+	char *sub;
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	buf = strstrip(buf);
+	if (!strlen(buf))
+		return -EINVAL;
+
+	while ((sub = strsep(&buf, " ")) != NULL) {
+		int token;
+		substring_t args[MAX_OPT_ARGS];
+		char tbuf[256];
+
+		sub = strstrip(sub);
+
+		token = match_token(sub, ctrl_tokens, args);
+		switch (token) {
+		case OPT_CTRL_ENABLE:
+			if (match_strlcpy(tbuf, &args[0], sizeof(tbuf)) >=
+			    sizeof(tbuf))
+				return -EINVAL;
+
+			err = kstrtobool(tbuf, &enable);
+			if (err)
+				return -EINVAL;
+			opt_enable_set = true;
+			break;
+		case OPT_CTRL_ERR:
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (opt_enable_set) {
+		if (enable)
+			err = __enable_mem_cgroup_cache(memcg);
+		else
+			err = __disable_mem_cgroup_cache(memcg);
+	}
+
+	return err ? err : nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -7156,6 +7386,13 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+	/* per-memcg cache control */
+	{
+		.name = "cache",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = mem_cgroup_cache_control,
+		.seq_show = mem_cgroup_cache_show,
+	},
 	{ }	/* terminate */
 };
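A user-space usage sketch (not part of the patch): it writes the
`enable=y` command to memory.cache and reads back the statistics table
produced by mem_cgroup_cache_show(). The cgroup path is an assumption for
a typical cgroup v2 mount with a child group named "test":

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#define CACHE_FILE "/sys/fs/cgroup/test/memory.cache" /* assumed path */

	int main(void)
	{
		char out[4096];
		ssize_t n;
		int fd = open(CACHE_FILE, O_WRONLY);

		/* enable pmc for every task in this memcg */
		if (fd < 0 || write(fd, "enable=y", 8) != 8) {
			perror("enable pmc");
			return 1;
		}
		close(fd);

		/* dump per-node settings plus per-zone CACHE/HIT counters */
		fd = open(CACHE_FILE, O_RDONLY);
		while (fd >= 0 && (n = read(fd, out, sizeof(out))) > 0)
			fwrite(out, 1, n, stdout);
		close(fd);
		return 0;
	}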
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1beb56f75319..54c4d00c2506 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -530,6 +530,14 @@ static inline int pindex_to_order(unsigned int pindex)
 	return order;
 }
 
+/*
+ * The per-memcg cache currently only allows order-0 pages.
+ */
+static inline bool pmc_allow_order(unsigned int order)
+{
+	return !order;
+}
+
 static inline bool pcp_allowed_order(unsigned int order)
 {
 	if (order <= PAGE_ALLOC_COSTLY_ORDER)
@@ -1271,6 +1279,43 @@ void __free_pages_core(struct page *page, unsigned int order)
 	__free_pages_ok(page, order, FPI_TO_TAIL);
 }
 
+int mem_cgroup_release_cache(struct mem_cgroup_per_node_cache *nodep)
+{
+	LIST_HEAD(temp_list);
+	int zid, num = 0;
+
+	for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+		struct mem_cgroup_zone_cache *zc = &nodep->zone_cachep[zid];
+		int i = 0;
+
+		if (!atomic_read(&zc->nr_pages))
+			continue;
+
+		spin_lock(&zc->pages_lock);
+		list_splice_init(&zc->pages, &temp_list);
+		spin_unlock(&zc->pages_lock);
+
+		while (!list_empty(&temp_list)) {
+			struct page *page =
+				list_first_entry(&temp_list, struct page, lru);
+			struct zone *zone = page_zone(page);
+			unsigned long pfn = page_to_pfn(page);
+
+			list_del(&page->lru);
+
+			/* would it be better to put these into the pcp? */
+			free_one_page(zone, page, pfn, 0, FPI_NONE);
+			++i;
+		}
+
+		num += i;
+		atomic_sub(i, &zc->nr_pages);
+	}
+
+	return num;
+}
+
 /*
  * Check that the whole (or subset of) a pageblock given by the interval of
  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
@@ -2603,6 +2648,41 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	}
 }
 
+static bool free_unref_page_to_pmc(struct page *page, struct zone *zone,
+				   int order)
+{
+	struct mem_cgroup *memcg;
+	struct mem_cgroup_per_node_cache *cachep;
+	struct mem_cgroup_zone_cache *zc;
+	unsigned long flags;
+	bool ret = false;
+
+	if (pmc_disabled())
+		return false;
+
+	memcg = get_mem_cgroup_from_current();
+	if (!memcg || mem_cgroup_is_root(memcg) ||
+	    mem_cgroup_cache_disabled(memcg))
+		goto out;
+
+	cachep = mem_cgroup_get_node_cachep(memcg, page_to_nid(page));
+	zc = &cachep->zone_cachep[page_zonenum(page)];
+
+	/* only hold pages while the zone stays above high + allow_watermark */
+	if (high_wmark_pages(zone) + READ_ONCE(cachep->allow_watermark) >=
+	    zone_page_state(zone, NR_FREE_PAGES))
+		goto out;
+
+	spin_lock_irqsave(&zc->pages_lock, flags);
+	list_add(&page->lru, &zc->pages);
+	spin_unlock_irqrestore(&zc->pages_lock, flags);
+	atomic_inc(&zc->nr_pages);
+
+	ret = true;
+out:
+	mem_cgroup_put(memcg);
+	return ret;
+}
+
 /*
  * Free a pcp page
  */
@@ -2634,6 +2714,17 @@ void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	zone = page_zone(page);
+
+	/*
+	 * Cache the released page before it goes to the pcp if the current
+	 * memcg has the cache feature enabled.
+	 * Unlike the PCP, the PMC is private: only processes in the owning
+	 * memcg can take pages from it. So, when the conditions are met,
+	 * release the page to the PMC in preference to the public per-CPU
+	 * cache.
+	 */
+	if (pmc_allow_order(order) && free_unref_page_to_pmc(page, zone, order))
+		return;
+
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
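A standalone model (not part of the patch) of the ordering that the hunk
above and the rmqueue() hook below implement: a freed order-0 page is
offered to the private per-memcg list first and only then to the shared
pool, and allocation drains the private list before falling back. All
names are illustrative:

	#include <stdio.h>

	#define POOL 8

	struct pool {
		int private[POOL], shared[POOL];
		int np, ns;
	};

	/* free path: private cache first, models free_unref_page_to_pmc() */
	static void page_free(struct pool *p, int page, int cache_enabled)
	{
		if (cache_enabled && p->np < POOL) {
			p->private[p->np++] = page;
			return;
		}
		p->shared[p->ns++] = page;	/* fall back to the shared pool */
	}

	/* alloc path: private cache first, models rmqueue_mem_cgroup_cache() */
	static int page_alloc(struct pool *p)
	{
		if (p->np)
			return p->private[--p->np];	/* private cache hit */
		if (p->ns)
			return p->shared[--p->ns];	/* shared fallback */
		return -1;			/* would go to the buddy allocator */
	}

	int main(void)
	{
		struct pool p = { 0 };

		page_free(&p, 42, 1);		/* cached privately */
		printf("got page %d from private cache\n", page_alloc(&p));
		return 0;
	}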
@@ -3012,6 +3103,49 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	return page;
 }
 
+static struct page *rmqueue_mem_cgroup_cache(struct zone *preferred_zone,
+					     struct zone *zone,
+					     unsigned int order,
+					     int migratetype)
+{
+	struct mem_cgroup *memcg;
+	struct mem_cgroup_per_node_cache *cachep;
+	struct mem_cgroup_zone_cache *zc;
+	unsigned long flags;
+	int nid = zone->zone_pgdat->node_id;
+	struct page *page = NULL;
+
+	if (pmc_disabled())
+		return NULL;
+
+	memcg = get_mem_cgroup_from_current();
+	if (!memcg || mem_cgroup_is_root(memcg) ||
+	    mem_cgroup_cache_disabled(memcg))
+		goto out;
+
+	cachep = mem_cgroup_get_node_cachep(memcg, nid);
+
+	zc = &cachep->zone_cachep[zone_idx(zone)];
+	if (!atomic_read(&zc->nr_pages))
+		goto out;
+
+	spin_lock_irqsave(&zc->pages_lock, flags);
+	if (list_empty(&zc->pages)) {
+		spin_unlock_irqrestore(&zc->pages_lock, flags);
+		goto out;
+	}
+	page = list_first_entry(&zc->pages, struct page, lru);
+	list_del(&page->lru);
+	spin_unlock_irqrestore(&zc->pages_lock, flags);
+
+	atomic_dec(&zc->nr_pages);
+	atomic_inc(&zc->nr_alloced);
+
+out:
+	mem_cgroup_put(memcg);
+	return page;
+}
+
 /*
  * Allocate a page from the given zone.
  * Use pcplists for THP or "cheap" high-order allocations.
@@ -3038,6 +3172,18 @@ struct page *rmqueue(struct zone *preferred_zone,
 	 */
 	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 
+	/*
+	 * Before disturbing the public pcp or buddy lists, the current task
+	 * may be in a memcg that has the cache feature enabled.
+	 * If so, taking the page from the private pool first can speed up
+	 * the allocation.
+	 */
+	if (pmc_allow_order(order)) {
+		page = rmqueue_mem_cgroup_cache(preferred_zone, zone, order,
+						migratetype);
+		if (page)
+			goto out;
+	}
+
 	if (likely(pcp_allowed_order(order))) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				       migratetype, alloc_flags);
-- 
2.45.2