pmc - per memcg cache

This patch adds a feature, pmc (per-memcg cache), to every memcg except
the root memcg. Users can enable pmc in a target memcg so that all tasks
in this memcg share a cache pool; order-0 page allocation and freeing is
served from this cache pool with higher priority than the public
per-CPU pagesets.

Signed-off-by: Huan Yang <link@xxxxxxxx>
---
 include/linux/memcontrol.h |  41 +++++++
 include/linux/mmzone.h     |  25 ++++
 include/linux/swap.h       |   1 +
 mm/memcontrol.c            | 237 +++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c            | 146 +++++++++++++++++++++++
 5 files changed, 450 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f332b4ae84c..5ec4c64bc515 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -130,6 +130,7 @@ struct mem_cgroup_per_node {
 	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
+	struct mem_cgroup_per_node_cache *cachep;
 };
 
 struct mem_cgroup_threshold {
@@ -336,6 +337,8 @@ struct mem_cgroup {
 	struct lru_gen_mm_list mm_list;
 #endif
 
+	bool cache_enabled;
+
 	struct mem_cgroup_per_node *nodeinfo[];
 };
 
@@ -557,6 +560,8 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
 	return memcg;
 }
 
+DECLARE_STATIC_KEY_TRUE(pmc_key);
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
@@ -1185,6 +1190,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+static inline bool pmc_disabled(void)
+{
+	return static_branch_likely(&pmc_key);
+}
+
+static inline bool mem_cgroup_cache_disabled(struct mem_cgroup *memcg)
+{
+	return !READ_ONCE(memcg->cache_enabled);
+}
+
+static inline struct mem_cgroup_per_node_cache *
+mem_cgroup_get_node_cachep(struct mem_cgroup *memcg, int nid)
+{
+	struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+
+	return nodeinfo->cachep;
+}
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1648,6 +1672,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
 	return 0;
 }
+
+static inline bool pmc_disabled(void)
+{
+	return true;
+}
+
+static inline bool mem_cgroup_cache_disabled(struct mem_cgroup *memcg)
+{
+	return true;
+}
+
+static inline struct mem_cgroup_per_node_cache *
+mem_cgroup_get_node_cachep(struct mem_cgroup *memcg, int nid)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG */
 
 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..773b89e214c9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -603,6 +603,31 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
 
 #endif /* CONFIG_LRU_GEN */
 
+struct mem_cgroup_zone_cache {
+	/* cached pages; currently only order-0 pages are held */
+	struct list_head pages;
+	spinlock_t pages_lock;
+	atomic_t nr_pages;
+	atomic_t nr_alloced;
+};
+
+struct mem_cgroup_per_node_cache {
+	/* per-zone caches */
+	struct mem_cgroup_zone_cache zone_cachep[MAX_NR_ZONES];
+	struct mem_cgroup *memcg;
+
+	/* max number of pages to hold, in pages, default 100MB */
+#define DEFAULT_PMC_HOLD_LIMIT ((100 << 20) >> PAGE_SHIFT)
+	unsigned int hold_limit;
+
+#define DEFAULT_PMC_GAP_WATERMARK ((50 << 20) >> PAGE_SHIFT)
+	/*
+	 * Pages are cached only while the zone's free pages stay above
+	 * the high watermark plus this gap, in pages, default 50MB.
+	 */
+	unsigned int allow_watermark;
+};
+
 struct lruvec {
 	struct list_head lists[NR_LRU_LISTS];
 	/* per lruvec lru_lock for memcg */
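A standalone sketch (not part of the patch) of what the two defaults above
work out to, assuming 4KiB pages (PAGE_SHIFT == 12); the macro names are
taken from the patch, everything else is illustrative:

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* assumption: 4KiB pages */
	#define DEFAULT_PMC_HOLD_LIMIT ((100 << 20) >> PAGE_SHIFT)
	#define DEFAULT_PMC_GAP_WATERMARK ((50 << 20) >> PAGE_SHIFT)

	int main(void)
	{
		/* hold_limit: cap on pages one node cache may hold */
		printf("hold_limit      = %d pages (%d MB)\n",
		       DEFAULT_PMC_HOLD_LIMIT,
		       (DEFAULT_PMC_HOLD_LIMIT << PAGE_SHIFT) >> 20);
		/* allow_watermark: gap required above the zone high watermark */
		printf("allow_watermark = %d pages (%d MB)\n",
		       DEFAULT_PMC_GAP_WATERMARK,
		       (DEFAULT_PMC_GAP_WATERMARK << PAGE_SHIFT) >> 20);
		return 0;
	}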
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 11c53692f65f..d7b5e0a8317c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -420,6 +420,7 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 long remove_mapping(struct address_space *mapping, struct folio *folio);
+extern int mem_cgroup_release_cache(struct mem_cgroup_per_node_cache *fc);
 
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b3c3394a2ba..404fcb96bf68 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,15 @@ static bool cgroup_memory_nokmem __ro_after_init;
 /* BPF memory accounting disabled? */
 static bool cgroup_memory_nobpf __ro_after_init;
 
+/*
+ * Number of memcgs that have the cache enabled. While it is zero, the
+ * static branch stays enabled so that no task's alloc/free path enters
+ * the PMC code. Once any memcg holds a cache, the static branch is
+ * disabled.
+ */
+static atomic_t pmc_nr_enabled;
+DEFINE_STATIC_KEY_TRUE(pmc_key);
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
@@ -5738,6 +5747,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 	lru_gen_release_memcg(memcg);
 }
 
+static int __disable_mem_cgroup_cache(struct mem_cgroup *memcg);
+
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5762,6 +5773,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	cancel_work_sync(&memcg->high_work);
 	mem_cgroup_remove_from_trees(memcg);
 	free_shrinker_info(memcg);
+	if (READ_ONCE(memcg->cache_enabled))
+		__disable_mem_cgroup_cache(memcg);
 	mem_cgroup_free(memcg);
 }
 
@@ -7088,6 +7101,223 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }
 
+static int __enable_mem_cgroup_cache(struct mem_cgroup *memcg)
+{
+	int nid, idx;
+
+	if (!mem_cgroup_cache_disabled(memcg))
+		return -EINVAL;
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p = kvzalloc_node(
+			sizeof(struct mem_cgroup_per_node_cache),
+			GFP_KERNEL, nid);
+
+		if (unlikely(!p))
+			goto fail;
+
+		nodeinfo->cachep = p;
+	}
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		pg_data_t *pgdat = NODE_DATA(nid);
+		struct mem_cgroup_per_node_cache *p = nodeinfo->cachep;
+
+		for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+			struct zone *z = &pgdat->node_zones[idx];
+			struct mem_cgroup_zone_cache *zc;
+
+			if (!populated_zone(z))
+				continue;
+
+			zc = &p->zone_cachep[idx];
+
+			INIT_LIST_HEAD(&zc->pages);
+			spin_lock_init(&zc->pages_lock);
+		}
+
+		p->memcg = memcg;
+		p->hold_limit = DEFAULT_PMC_HOLD_LIMIT;
+		p->allow_watermark = DEFAULT_PMC_GAP_WATERMARK;
+	}
+
+	if (static_branch_likely(&pmc_key))
+		static_branch_disable(&pmc_key);
+
+	/* online */
+	smp_wmb();
+	WRITE_ONCE(memcg->cache_enabled, true);
+	atomic_inc(&pmc_nr_enabled);
+
+	return 0;
+
+fail:
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+
+		if (nodeinfo->cachep) {
+			kvfree(nodeinfo->cachep);
+			nodeinfo->cachep = NULL;
+		}
+	}
+
+	return -ENOMEM;
+}
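An aside for review (not part of the patch): the enable path above and the
disable path below form a refcount-driven global switch, one count per
cache-enabled memcg, with the pmc_key static branch flipped when the first
user arrives and restored when the last one leaves. A minimal user-space
model of that pattern, all names illustrative:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int nr_enabled;		/* models pmc_nr_enabled */
	static atomic_bool pmc_off = true;	/* models pmc_key (true: PMC path off) */

	static void cache_enable(void)
	{
		if (atomic_fetch_add(&nr_enabled, 1) == 0)
			atomic_store(&pmc_off, false);	/* first user flips the switch */
	}

	static void cache_disable(void)
	{
		if (atomic_fetch_sub(&nr_enabled, 1) == 1)
			atomic_store(&pmc_off, true);	/* last user restores it */
	}

	int main(void)
	{
		cache_enable();
		printf("PMC path: %s\n", atomic_load(&pmc_off) ? "off" : "on");
		cache_disable();
		printf("PMC path: %s\n", atomic_load(&pmc_off) ? "off" : "on");
		return 0;
	}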
+
+static int __disable_mem_cgroup_cache(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	if (unlikely(mem_cgroup_cache_disabled(memcg)))
+		return -EINVAL;
+
+	/* offline */
+	WRITE_ONCE(memcg->cache_enabled, false);
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p;
+
+		p = nodeinfo->cachep;
+
+		mem_cgroup_release_cache(p);
+
+		nodeinfo->cachep = NULL;
+		kvfree(p);
+	}
+
+	if (atomic_dec_and_test(&pmc_nr_enabled))
+		static_branch_enable(&pmc_key);
+
+	return 0;
+}
+
+static int mem_cgroup_cache_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg;
+	int nid;
+
+	if (static_branch_likely(&pmc_key))
+		return -EINVAL;
+
+	memcg = mem_cgroup_from_seq(m);
+	if (!READ_ONCE(memcg->cache_enabled))
+		return -EINVAL;
+
+	seq_printf(m, "%4s %16s %16s\n", "NODE", "WATERMARK", "HOLD_LIMIT");
+	for_each_online_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p;
+
+		p = nodeinfo->cachep;
+		if (!p)
+			continue;
+
+		seq_printf(m, "%4d %14uKB %14uKB\n", nid,
+			   (READ_ONCE(p->allow_watermark) << (PAGE_SHIFT - 10)),
+			   (READ_ONCE(p->hold_limit) << (PAGE_SHIFT - 10)));
+	}
+
+	seq_puts(m, "===========\n");
+	seq_printf(m, "%4s %16s %16s %16s\n", "NODE", "ZONE", "CACHE", "HIT");
+
+	for_each_online_node(nid) {
+		struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+		struct mem_cgroup_per_node_cache *p;
+		pg_data_t *pgdat = NODE_DATA(nid);
+		int idx;
+
+		p = nodeinfo->cachep;
+		if (!p)
+			continue;
+
+		for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+			struct mem_cgroup_zone_cache *zc;
+			struct zone *z = &pgdat->node_zones[idx];
+
+			if (!populated_zone(z))
+				continue;
+
+			zc = &p->zone_cachep[idx];
+			seq_printf(m, "%4d %16s %14dKB %14dKB\n", nid, z->name,
+				   (atomic_read(&zc->nr_pages)
+				    << (PAGE_SHIFT - 10)),
+				   (atomic_read(&zc->nr_alloced)
+				    << (PAGE_SHIFT - 10)));
+		}
+	}
+
+	return 0;
+}
+
+enum {
+	OPT_CTRL_ENABLE,
+	OPT_CTRL_ERR,
+	OPT_CTRL_NR = OPT_CTRL_ERR,
+};
+
+static const match_table_t ctrl_tokens = {
+	{ OPT_CTRL_ENABLE, "enable=%s" },
+	{ OPT_CTRL_ERR, NULL }
+};
+
+/*
+ * Control the target memcg's cache, including enabling/disabling it and
+ * setting its keys. The cache is enabled or disabled with
+ * `echo enable=[y|n] > memory.cache` in the target memcg.
+ */
+static ssize_t mem_cgroup_cache_control(struct kernfs_open_file *of, char *buf,
+					size_t nbytes, loff_t off)
+{
+	bool enable;
+	bool opt_enable_set = false;
+	int err = 0;
+	char *sub;
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	buf = strstrip(buf);
+	if (!strlen(buf))
+		return -EINVAL;
+
+	while ((sub = strsep(&buf, " ")) != NULL) {
+		int token;
+		substring_t args[MAX_OPT_ARGS];
+		char tbuf[256];
+
+		sub = strstrip(sub);
+
+		token = match_token(sub, ctrl_tokens, args);
+		switch (token) {
+		case OPT_CTRL_ENABLE:
+			if (match_strlcpy(tbuf, &args[0], sizeof(tbuf)) >=
+			    sizeof(tbuf))
+				return -EINVAL;
+
+			err = kstrtobool(tbuf, &enable);
+			if (err)
+				return -EINVAL;
+			opt_enable_set = true;
+			break;
+		case OPT_CTRL_ERR:
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (opt_enable_set) {
+		if (enable)
+			err = __enable_mem_cgroup_cache(memcg);
+		else
+			err = __disable_mem_cgroup_cache(memcg);
+	}
+
+	return err ? err : nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -7156,6 +7386,13 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+	/* per-memcg cache control */
+	{
+		.name = "cache",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = mem_cgroup_cache_control,
+		.seq_show = mem_cgroup_cache_show,
+	},
 	{ }	/* terminate */
 };
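A user-space usage sketch (not part of the patch): it writes the
`enable=y` command to memory.cache and reads back the statistics table
produced by mem_cgroup_cache_show(). The cgroup path is an assumption for
a typical cgroup v2 mount with a child group named "test":

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#define CACHE_FILE "/sys/fs/cgroup/test/memory.cache" /* assumed path */

	int main(void)
	{
		char out[4096];
		ssize_t n;
		int fd = open(CACHE_FILE, O_WRONLY);

		/* enable pmc for every task in this memcg */
		if (fd < 0 || write(fd, "enable=y", 8) != 8) {
			perror("enable pmc");
			return 1;
		}
		close(fd);

		/* dump per-node settings plus per-zone CACHE/HIT counters */
		fd = open(CACHE_FILE, O_RDONLY);
		while (fd >= 0 && (n = read(fd, out, sizeof(out))) > 0)
			fwrite(out, 1, n, stdout);
		close(fd);
		return 0;
	}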
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1beb56f75319..54c4d00c2506 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -530,6 +530,14 @@ static inline int pindex_to_order(unsigned int pindex)
 	return order;
 }
 
+/*
+ * The per-memcg cache currently only allows order-0 pages.
+ */
+static inline bool pmc_allow_order(unsigned int order)
+{
+	return !order;
+}
+
 static inline bool pcp_allowed_order(unsigned int order)
 {
 	if (order <= PAGE_ALLOC_COSTLY_ORDER)
@@ -1271,6 +1279,43 @@ void __free_pages_core(struct page *page, unsigned int order)
 	__free_pages_ok(page, order, FPI_TO_TAIL);
 }
 
+int mem_cgroup_release_cache(struct mem_cgroup_per_node_cache *nodep)
+{
+	LIST_HEAD(temp_list);
+	int zid, num = 0;
+
+	for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+		struct mem_cgroup_zone_cache *zc = &nodep->zone_cachep[zid];
+		int i = 0;
+
+		if (!atomic_read(&zc->nr_pages))
+			continue;
+
+		spin_lock(&zc->pages_lock);
+		list_splice_init(&zc->pages, &temp_list);
+		spin_unlock(&zc->pages_lock);
+
+		while (!list_empty(&temp_list)) {
+			struct page *page =
+				list_first_entry(&temp_list, struct page, lru);
+			struct zone *zone = page_zone(page);
+			unsigned long pfn = page_to_pfn(page);
+
+			list_del(&page->lru);
+
+			/* would it be better to put these into the pcp? */
+			free_one_page(zone, page, pfn, 0, FPI_NONE);
+			++i;
+		}
+
+		num += i;
+		atomic_sub(i, &zc->nr_pages);
+	}
+
+	return num;
+}
+
 /*
  * Check that the whole (or subset of) a pageblock given by the interval of
  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
@@ -2603,6 +2648,41 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	}
 }
 
+static bool free_unref_page_to_pmc(struct page *page, struct zone *zone,
+				   int order)
+{
+	struct mem_cgroup *memcg;
+	struct mem_cgroup_per_node_cache *cachep;
+	struct mem_cgroup_zone_cache *zc;
+	unsigned long flags;
+	bool ret = false;
+
+	if (pmc_disabled())
+		return false;
+
+	memcg = get_mem_cgroup_from_current();
+	if (!memcg || mem_cgroup_is_root(memcg) ||
+	    mem_cgroup_cache_disabled(memcg))
+		goto out;
+
+	cachep = mem_cgroup_get_node_cachep(memcg, page_to_nid(page));
+	zc = &cachep->zone_cachep[page_zonenum(page)];
+
+	/* only hold pages while the zone stays above high + allow_watermark */
+	if (high_wmark_pages(zone) + READ_ONCE(cachep->allow_watermark) >=
+	    zone_page_state(zone, NR_FREE_PAGES))
+		goto out;
+
+	spin_lock_irqsave(&zc->pages_lock, flags);
+	list_add(&page->lru, &zc->pages);
+	spin_unlock_irqrestore(&zc->pages_lock, flags);
+	atomic_inc(&zc->nr_pages);
+
+	ret = true;
+out:
+	mem_cgroup_put(memcg);
+	return ret;
+}
+
 /*
  * Free a pcp page
  */
@@ -2634,6 +2714,17 @@ void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	zone = page_zone(page);
+
+	/*
+	 * Cache the released page before it goes to the pcp if the current
+	 * memcg has the cache feature enabled.
+	 * Unlike the PCP, the PMC is private: only processes in the owning
+	 * memcg can take pages from it. So, when the conditions are met,
+	 * release the page to the PMC in preference to the public per-CPU
+	 * cache.
+	 */
+	if (pmc_allow_order(order) && free_unref_page_to_pmc(page, zone, order))
+		return;
+
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
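A standalone model (not part of the patch) of the ordering that the hunk
above and the rmqueue() hook below implement: a freed order-0 page is
offered to the private per-memcg list first and only then to the shared
pool, and allocation drains the private list before falling back. All
names are illustrative:

	#include <stdio.h>

	#define POOL 8

	struct pool {
		int private[POOL], shared[POOL];
		int np, ns;
	};

	/* free path: private cache first, models free_unref_page_to_pmc() */
	static void page_free(struct pool *p, int page, int cache_enabled)
	{
		if (cache_enabled && p->np < POOL) {
			p->private[p->np++] = page;
			return;
		}
		p->shared[p->ns++] = page;	/* fall back to the shared pool */
	}

	/* alloc path: private cache first, models rmqueue_mem_cgroup_cache() */
	static int page_alloc(struct pool *p)
	{
		if (p->np)
			return p->private[--p->np];	/* private cache hit */
		if (p->ns)
			return p->shared[--p->ns];	/* shared fallback */
		return -1;			/* would go to the buddy allocator */
	}

	int main(void)
	{
		struct pool p = { 0 };

		page_free(&p, 42, 1);		/* cached privately */
		printf("got page %d from private cache\n", page_alloc(&p));
		return 0;
	}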
@@ -3012,6 +3103,49 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	return page;
 }
 
+static struct page *rmqueue_mem_cgroup_cache(struct zone *preferred_zone,
+					     struct zone *zone,
+					     unsigned int order,
+					     int migratetype)
+{
+	struct mem_cgroup *memcg;
+	struct mem_cgroup_per_node_cache *cachep;
+	struct mem_cgroup_zone_cache *zc;
+	unsigned long flags;
+	int nid = zone->zone_pgdat->node_id;
+	struct page *page = NULL;
+
+	if (pmc_disabled())
+		return NULL;
+
+	memcg = get_mem_cgroup_from_current();
+	if (!memcg || mem_cgroup_is_root(memcg) ||
+	    mem_cgroup_cache_disabled(memcg))
+		goto out;
+
+	cachep = mem_cgroup_get_node_cachep(memcg, nid);
+
+	zc = &cachep->zone_cachep[zone_idx(zone)];
+	if (!atomic_read(&zc->nr_pages))
+		goto out;
+
+	spin_lock_irqsave(&zc->pages_lock, flags);
+	if (list_empty(&zc->pages)) {
+		spin_unlock_irqrestore(&zc->pages_lock, flags);
+		goto out;
+	}
+	page = list_first_entry(&zc->pages, struct page, lru);
+	list_del(&page->lru);
+	spin_unlock_irqrestore(&zc->pages_lock, flags);
+
+	atomic_dec(&zc->nr_pages);
+	atomic_inc(&zc->nr_alloced);
+
+out:
+	mem_cgroup_put(memcg);
+	return page;
+}
+
 /*
  * Allocate a page from the given zone.
  * Use pcplists for THP or "cheap" high-order allocations.
@@ -3038,6 +3172,18 @@ struct page *rmqueue(struct zone *preferred_zone,
 	 */
 	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 
+	/*
+	 * Before disturbing the public pcp or buddy lists, the current task
+	 * may be in a memcg that has the cache feature enabled.
+	 * If so, taking the page from the private pool first can speed up
+	 * the allocation.
+	 */
+	if (pmc_allow_order(order)) {
+		page = rmqueue_mem_cgroup_cache(preferred_zone, zone, order,
+						migratetype);
+		if (page)
+			goto out;
+	}
+
 	if (likely(pcp_allowed_order(order))) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				       migratetype, alloc_flags);
-- 
2.45.2