On Sun, Feb 5, 2023 at 11:49 PM Alistair Popple <apopple@xxxxxxxxxx> wrote:
>
> If too much memory in a system is pinned or locked it can lead to
> problems such as performance degradation or in the worst case
> out-of-memory errors as such memory cannot be moved or paged out.
>
> In order to prevent users without CAP_IPC_LOCK from causing these
> issues the amount of memory that can be pinned is typically limited by
> RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> between tasks and the enforcement of these limits is inconsistent
> between in-kernel users of pinned memory such as mlock() and device
> drivers which may also pin pages with pin_user_pages().
>
> To allow for a single limit to be set introduce a cgroup controller
> which can be used to limit the number of pages being pinned by all
> tasks in the cgroup.
>
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> Cc: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Zefan Li <lizefan.x@xxxxxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: linux-kernel@xxxxxxxxxxxxxxx
> Cc: cgroups@xxxxxxxxxxxxxxx
> Cc: linux-mm@xxxxxxxxx
> ---
>  MAINTAINERS                   |   7 +-
>  include/linux/cgroup.h        |  20 +++-
>  include/linux/cgroup_subsys.h |   4 +-
>  mm/Kconfig                    |  11 +-
>  mm/Makefile                   |   1 +-
>  mm/pins_cgroup.c              | 273 +++++++++++++++++++++++++++++++++++-
>  6 files changed, 316 insertions(+)
>  create mode 100644 mm/pins_cgroup.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f781f93..f8526e2 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5381,6 +5381,13 @@ F: tools/testing/selftests/cgroup/memcg_protection.m
>  F: tools/testing/selftests/cgroup/test_kmem.c
>  F: tools/testing/selftests/cgroup/test_memcontrol.c
>
> +CONTROL GROUP - PINNED AND LOCKED MEMORY
> +M: Alistair Popple <apopple@xxxxxxxxxx>
> +L: cgroups@xxxxxxxxxxxxxxx
> +L: linux-mm@xxxxxxxxx
> +S: Maintained
> +F: mm/pins_cgroup.c
> +
>  CORETEMP HARDWARE MONITORING DRIVER
>  M: Fenghua Yu <fenghua.yu@xxxxxxxxx>
>  L: linux-hwmon@xxxxxxxxxxxxxxx
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 3410aec..d98de25 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -857,4 +857,24 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
>
>  #endif /* CONFIG_CGROUP_BPF */
>
> +#ifdef CONFIG_CGROUP_PINS
> +
> +struct pins_cgroup *get_pins_cg(struct task_struct *task);
> +void put_pins_cg(struct pins_cgroup *cg);
> +void pins_uncharge(struct pins_cgroup *pins, int num);
> +bool pins_try_charge(struct pins_cgroup *pins, int num);
> +
> +#else /* CONFIG_CGROUP_PINS */
> +
> +static inline struct pins_cgroup *get_pins_cg(struct task_struct *task)
> +{
> +	return NULL;
> +}
> +
> +static inline void put_pins_cg(struct pins_cgroup *cg) {}
> +static inline void pins_uncharge(struct pins_cgroup *pins, int num) {}
> +static inline bool pins_try_charge(struct pins_cgroup *pins, int num) { return true; }
> +
> +#endif /* CONFIG_CGROUP_PINS */
> +
>  #endif /* _LINUX_CGROUP_H */
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 4452354..c1b4aab 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -65,6 +65,10 @@ SUBSYS(rdma)
>  SUBSYS(misc)
>  #endif
>
> +#if IS_ENABLED(CONFIG_CGROUP_PINS)
> +SUBSYS(pins)
> +#endif
> +
>  /*
>   * The following subsystems are not supported on the default hierarchy.
>   */
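
To make sure I understand the intended use: I'd expect a driver that pins
pages with FOLL_LONGTERM to consume this interface roughly as below. This
is only my sketch -- nothing in this patch hooks up a caller yet, and
driver_pin_pages() and the pinsp out-parameter are names I made up:

static int driver_pin_pages(struct pins_cgroup **pinsp, unsigned long start,
			    int nr_pages, struct page **pages)
{
	struct pins_cgroup *pins = get_pins_cg(current);
	int ret;

	/* Charge before pinning so the limit is enforced up front. */
	if (!pins_try_charge(pins, nr_pages)) {
		put_pins_cg(pins);
		return -ENOMEM;
	}

	ret = pin_user_pages_fast(start, nr_pages, FOLL_LONGTERM, pages);
	if (ret != nr_pages) {
		/* Treat a short pin as failure and back the charge out. */
		if (ret > 0)
			unpin_user_pages(pages, ret);
		pins_uncharge(pins, nr_pages);
		put_pins_cg(pins);
		return ret < 0 ? ret : -EFAULT;
	}

	/* Caller keeps the cgroup reference until it unpins and uncharges. */
	*pinsp = pins;
	return ret;
}

If that matches your intent it might be worth spelling out in the changelog.
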
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ff7b209..4472043 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1183,6 +1183,17 @@ config LRU_GEN_STATS
>  	  This option has a per-memcg and per-node memory overhead.
>  # }
>
> +config CGROUP_PINS
> +	bool "Cgroup controller for pinned and locked memory"
> +	depends on CGROUPS
> +	default y
> +	help
> +	  Having too much memory pinned or locked can lead to system
> +	  instability due to increased likelihood of encountering
> +	  out-of-memory conditions. Select this option to enable a cgroup
> +	  controller which can be used to limit the overall number of
> +	  pages processes in a cgroup may lock or have pinned by drivers.
> +
>  source "mm/damon/Kconfig"
>
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index 8e105e5..81db189 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
>  obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> +obj-$(CONFIG_CGROUP_PINS) += pins_cgroup.o
> diff --git a/mm/pins_cgroup.c b/mm/pins_cgroup.c
> new file mode 100644
> index 0000000..2d8c6c7
> --- /dev/null
> +++ b/mm/pins_cgroup.c
> @@ -0,0 +1,273 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Controller for cgroups limiting number of pages pinned for FOLL_LONGTERM.
> + *
> + * Copyright (C) 2022 Alistair Popple <apopple@xxxxxxxxxx>
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/threads.h>
> +#include <linux/atomic.h>
> +#include <linux/cgroup.h>
> +#include <linux/slab.h>
> +#include <linux/sched/task.h>
> +
> +#define PINS_MAX (-1ULL)
> +#define PINS_MAX_STR "max"
> +
> +struct pins_cgroup {
> +	struct cgroup_subsys_state css;
> +
> +	atomic64_t counter;
> +	atomic64_t limit;
> +
> +	struct cgroup_file events_file;
> +	atomic64_t events_limit;
> +};
> +
> +static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
> +{
> +	return container_of(css, struct pins_cgroup, css);
> +}
> +
> +static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
> +{
> +	return css_pins(pins->css.parent);
> +}
> +
> +struct pins_cgroup *get_pins_cg(struct task_struct *task)

I think you'll need a version of this that gets multiple refs to the
pins cgroup. I will add more details in the patch that actually hooks
the accounting.
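
Something like the below is what I have in mind -- css_get_many() already
exists for exactly this, so only the first reference needs the
task_get_css() dance (get_pins_cg_many() is a made-up name, not part of
this patch):

struct pins_cgroup *get_pins_cg_many(struct task_struct *task, unsigned int n)
{
	/* task_get_css() retries until it has a reference on a live css. */
	struct cgroup_subsys_state *css = task_get_css(task, pins_cgrp_id);

	/* Stack the remaining references; drop each one with css_put(). */
	if (n > 1)
		css_get_many(css, n - 1);

	return css_pins(css);
}
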
> +{
> +	return css_pins(task_get_css(task, pins_cgrp_id));
> +}
> +
> +void put_pins_cg(struct pins_cgroup *cg)
> +{
> +	css_put(&cg->css);
> +}
> +
> +static struct cgroup_subsys_state *
> +pins_css_alloc(struct cgroup_subsys_state *parent)
> +{
> +	struct pins_cgroup *pins;
> +
> +	pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
> +	if (!pins)
> +		return ERR_PTR(-ENOMEM);
> +
> +	atomic64_set(&pins->counter, 0);
> +	atomic64_set(&pins->limit, PINS_MAX);
> +	atomic64_set(&pins->events_limit, 0);
> +	return &pins->css;
> +}
> +
> +static void pins_css_free(struct cgroup_subsys_state *css)
> +{
> +	kfree(css_pins(css));
> +}
> +
> +/**
> + * pins_cancel - uncharge the local pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to cancel
> + *
> + * This function will WARN if the pin count goes under 0, because such a case is
> + * a bug in the pins controller proper.
> + */
> +void pins_cancel(struct pins_cgroup *pins, int num)
> +{
> +	/*
> +	 * A negative count (or overflow for that matter) is invalid,
> +	 * and indicates a bug in the `pins` controller proper.
> +	 */
> +	WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
> +}
> +
> +/**
> + * pins_uncharge - hierarchically uncharge the pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to uncharge
> + */
> +void pins_uncharge(struct pins_cgroup *pins, int num)
> +{
> +	struct pins_cgroup *p;
> +
> +	for (p = pins; parent_pins(p); p = parent_pins(p))
> +		pins_cancel(p, num);
> +}
> +
> +/**
> + * pins_charge - hierarchically charge the pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to charge
> + *
> + * This function does *not* follow the pin limit set. It cannot fail and the new
> + * pin count may exceed the limit. This is only used for reverting failed
> + * attaches, where there is no other way out than violating the limit.
> + */
> +static void pins_charge(struct pins_cgroup *pins, int num)
> +{
> +	struct pins_cgroup *p;
> +
> +	for (p = pins; parent_pins(p); p = parent_pins(p))
> +		atomic64_add(num, &p->counter);
> +}
> +
> +/**
> + * pins_try_charge - hierarchically try to charge the pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to charge
> + *
> + * This function follows the set limit. It will fail if the charge would cause
> + * the new value to exceed the hierarchical limit. Returns true if the charge
> + * succeeded, false otherwise.
> + */
> +bool pins_try_charge(struct pins_cgroup *pins, int num)
> +{
> +	struct pins_cgroup *p, *q;
> +
> +	for (p = pins; parent_pins(p); p = parent_pins(p)) {
> +		uint64_t new = atomic64_add_return(num, &p->counter);
> +		uint64_t limit = atomic64_read(&p->limit);
> +
> +		if (limit != PINS_MAX && new > limit)
> +			goto revert;
> +	}
> +
> +	return true;
> +
> +revert:
> +	for (q = pins; q != p; q = parent_pins(q))
> +		pins_cancel(q, num);
> +	pins_cancel(p, num);
> +
> +	return false;
> +}
> +
> +static int pins_can_attach(struct cgroup_taskset *tset)
> +{
> +	struct cgroup_subsys_state *dst_css;
> +	struct task_struct *task;
> +
> +	cgroup_taskset_for_each(task, dst_css, tset) {
> +		struct pins_cgroup *pins = css_pins(dst_css);
> +		struct cgroup_subsys_state *old_css;
> +		struct pins_cgroup *old_pins;
> +
> +		old_css = task_css(task, pins_cgrp_id);
> +		old_pins = css_pins(old_css);
> +
> +		pins_charge(pins, task->mm->locked_vm);

Why are we not using pins_try_charge() here, and failing attachment if
we cannot charge? The comment above pins_charge() states that it is
only used when cancelling attachment, but this is not the case here,
right?
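
In other words I would have expected something along these lines (sketch
only -- on failure, tasks already charged earlier in the set would still
need unwinding, and a task without an mm would need skipping):

static int pins_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *dst_css;
	struct task_struct *task;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pins_cgroup *pins = css_pins(dst_css);
		struct pins_cgroup *old_pins =
			css_pins(task_css(task, pins_cgrp_id));

		/* Refuse the migration rather than silently overcharge. */
		if (!pins_try_charge(pins, task->mm->locked_vm))
			return -ENOMEM;

		pins_uncharge(old_pins, task->mm->locked_vm);
	}

	return 0;
}
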
> +		pins_uncharge(old_pins, task->mm->locked_vm);
> +	}
> +
> +	return 0;
> +}
> +
> +static void pins_cancel_attach(struct cgroup_taskset *tset)
> +{
> +	struct cgroup_subsys_state *dst_css;
> +	struct task_struct *task;
> +
> +	cgroup_taskset_for_each(task, dst_css, tset) {
> +		struct pins_cgroup *pins = css_pins(dst_css);
> +		struct cgroup_subsys_state *old_css;
> +		struct pins_cgroup *old_pins;
> +
> +		old_css = task_css(task, pins_cgrp_id);
> +		old_pins = css_pins(old_css);
> +
> +		pins_charge(old_pins, task->mm->locked_vm);
> +		pins_uncharge(pins, task->mm->locked_vm);
> +	}
> +}
> +
> +
> +static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
> +			      size_t nbytes, loff_t off)
> +{
> +	struct cgroup_subsys_state *css = of_css(of);
> +	struct pins_cgroup *pins = css_pins(css);
> +	int64_t limit;
> +	int err;
> +
> +	buf = strstrip(buf);
> +	if (!strcmp(buf, PINS_MAX_STR)) {
> +		limit = PINS_MAX;
> +		goto set_limit;
> +	}
> +
> +	err = kstrtoll(buf, 0, &limit);
> +	if (err)
> +		return err;
> +
> +	if (limit < 0 || (uint64_t)limit >= PINS_MAX)
> +		return -EINVAL;
> +
> +set_limit:
> +	/*
> +	 * Limit updates don't need to be mutex'd, since it isn't
> +	 * critical that any racing fork()s follow the new limit.
> +	 */
> +	atomic64_set(&pins->limit, limit);
> +	return nbytes;
> +}
> +
> +static int pins_max_show(struct seq_file *sf, void *v)
> +{
> +	struct cgroup_subsys_state *css = seq_css(sf);
> +	struct pins_cgroup *pins = css_pins(css);
> +	uint64_t limit = atomic64_read(&pins->limit);
> +
> +	if (limit >= PINS_MAX)
> +		seq_printf(sf, "%s\n", PINS_MAX_STR);
> +	else
> +		seq_printf(sf, "%lld\n", limit);
> +
> +	return 0;
> +}
> +
> +static s64 pins_current_read(struct cgroup_subsys_state *css,
> +			     struct cftype *cft)
> +{
> +	struct pins_cgroup *pins = css_pins(css);
> +
> +	return atomic64_read(&pins->counter);
> +}
> +
> +static int pins_events_show(struct seq_file *sf, void *v)
> +{
> +	struct pins_cgroup *pins = css_pins(seq_css(sf));
> +
> +	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pins->events_limit));
> +	return 0;
> +}
> +
> +static struct cftype pins_files[] = {
> +	{
> +		.name = "max",
> +		.write = pins_max_write,
> +		.seq_show = pins_max_show,
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +	},
> +	{
> +		.name = "current",
> +		.read_s64 = pins_current_read,
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +	},
> +	{
> +		.name = "events",
> +		.seq_show = pins_events_show,
> +		.file_offset = offsetof(struct pins_cgroup, events_file),
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +	},
> +	{ }	/* terminate */
> +};
> +
> +struct cgroup_subsys pins_cgrp_subsys = {
> +	.css_alloc = pins_css_alloc,
> +	.css_free = pins_css_free,
> +	.legacy_cftypes = pins_files,
> +	.dfl_cftypes = pins_files,
> +	.can_attach = pins_can_attach,
> +	.cancel_attach = pins_cancel_attach,
> +};
> --
> git-series 0.9.1
>
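
One more observation: pins.events is exposed and events_limit is
initialised, but nothing in this patch ever increments it. I assume a
later patch wires it up; if so, a helper along these lines called from
the failure path of pins_try_charge() would seem natural (pins_event()
is a hypothetical name):

static void pins_event(struct pins_cgroup *pins)
{
	/* Count the failed charge and notify pollers of pins.events. */
	atomic64_inc(&pins->events_limit);
	cgroup_file_notify(&pins->events_file);
}

Otherwise the file will always read "max 0".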