It is possible for a reclaimer to demote pages from an lruvec belonging
to a cgroup whose cpuset.mems excludes the demotion target node.
Attempt to apply this limitation based on the lruvec's memcg: look up
the memcg's effective cpuset and skip demotion when the target node is
not in its effective_mems.

Notably, this may still allow demotion of shared libraries or any
memory first instantiated in another cgroup.  This means cpusets still
cannot guarantee complete isolation when demotion is enabled, and the
docs have been updated to reflect this.

Note: This is a fairly hacked up method that probably overlooks some
cgroup/cpuset controls or designs.  RFCing now for some discussion at
LSFMM '25.
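For illustration, consider a hypothetical two-tier machine where node 0
is DRAM and node 1 is a slower CXL expander (node numbers and the
cgroup name below are made up, and the cpuset controller is assumed to
be enabled for the cgroup):

    # Enable demotion during reclaim
    echo 1 > /sys/kernel/mm/numa/demotion_enabled

    # Constrain the workload's memory to the fast tier
    mkdir /sys/fs/cgroup/workload
    echo 0 > /sys/fs/cgroup/workload/cpuset.mems

Previously, reclaim on node 0 could still demote the workload's pages
to node 1.  With this patch, can_demote() checks the demotion target
against the memcg's effective_mems, finds node 1 excluded, and reclaim
falls back to swapping those pages instead.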
Signed-off-by: Gregory Price <gourry@xxxxxxxxxx>
---
 .../ABI/testing/sysfs-kernel-mm-numa | 14 +++++---
 include/linux/cpuset.h               |  8 +++++
 kernel/cgroup/cpuset.c               | 17 ++++++++++
 mm/vmscan.c                          | 32 ++++++++++++-------
 4 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
index 77e559d4ed80..27cdcab901f7 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -16,9 +16,13 @@ Description:	Enable/disable demoting pages during reclaim
 		Allowing page migration during reclaim enables these
 		systems to migrate pages from fast tiers to slow tiers
 		when the fast tier is under pressure.  This migration
-		is performed before swap.  It may move data to a NUMA
-		node that does not fall into the cpuset of the
-		allocating process which might be construed to violate
-		the guarantees of cpusets.  This should not be enabled
-		on systems which need strict cpuset location
+		is performed before swap if an eligible NUMA node is
+		present in cpuset.mems for the cgroup.  If cpuset.mems
+		changes at runtime, it may move data to a NUMA node
+		that is no longer in the cpuset, which might be
+		construed to violate the guarantees of cpusets.  Shared
+		memory, such as libraries, owned by another cgroup may
+		still be demoted, resulting in memory use on a node not
+		present in cpuset.mems.  This should not be enabled on
+		systems which need strict cpuset location
 		guarantees.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 835e7b793f6a..d4169f1b1719 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -171,6 +171,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
+
+/* Without cpusets there is no mems restriction to enforce */
+static inline bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return true;
+}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..bb9669cc105d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4296,3 +4296,20 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
 		   nodemask_pr_args(&task->mems_allowed));
 }
+
+bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
+	bool allowed;
+
+	css = cgroup_get_e_css(memcg->css.cgroup, &cpuset_cgrp_subsys);
+	if (!css)
+		return true;
+
+	cs = container_of(css, struct cpuset, css);
+	allowed = node_isset(nid, cs->effective_mems);
+	/* drop the reference taken by cgroup_get_e_css() */
+	css_put(css);
+	return allowed;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2b2ab386cab5..04152ea1c03d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
 	}
 }
 
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+		       struct mem_cgroup *memcg)
 {
+	int demotion_nid;
+
 	if (!numa_demotion_enabled)
 		return false;
 	if (sc && sc->no_demotion)
 		return false;
-	if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+	demotion_nid = next_demotion_node(nid);
+	if (demotion_nid == NUMA_NO_NODE)
 		return false;
 
-	return true;
+	/* If demotion node isn't in mems_allowed, fall back */
+	return memcg ? memcg_mems_allowed(memcg, demotion_nid) : true;
 }
 
 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	 *
 	 * Can it be reclaimed from this node via demotion?
 	 */
-	return can_demote(nid, sc);
+	return can_demote(nid, sc, NULL);
 }
 
 /*
@@ -1096,7 +1102,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
  */
 static unsigned int shrink_folio_list(struct list_head *folio_list,
 		struct pglist_data *pgdat, struct scan_control *sc,
-		struct reclaim_stat *stat, bool ignore_references)
+		struct reclaim_stat *stat, bool ignore_references,
+		struct mem_cgroup *memcg)
 {
 	struct folio_batch free_folios;
 	LIST_HEAD(ret_folios);
@@ -1109,7 +1116,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	folio_batch_init(&free_folios);
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
-	do_demote_pass = can_demote(pgdat->node_id, sc);
+	do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
 
 retry:
 	while (!list_empty(folio_list)) {
@@ -1658,7 +1665,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
-					&stat, true);
+					&stat, true, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	list_splice(&clean_folios, folio_list);
@@ -2031,7 +2038,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+					 lruvec_memcg(lruvec));
 
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
@@ -2214,7 +2222,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2654,7 +2662,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 		return true;
 
 	/* Also valuable if anon pages can be demoted: */
-	return can_demote(pgdat->node_id, sc);
+	return can_demote(pgdat->node_id, sc, NULL);
 }
 
 #ifdef CONFIG_LRU_GEN
@@ -2732,7 +2740,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	if (!sc->may_swap)
 		return 0;
 
-	if (!can_demote(pgdat->node_id, sc) &&
+	if (!can_demote(pgdat->node_id, sc, NULL) &&
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
@@ -4695,7 +4703,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	if (list_empty(&list))
 		return scanned;
 retry:
-	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, NULL);
 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
 	sc->nr_reclaimed += reclaimed;
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
-- 
2.48.1