On 11/03/2011 02:22 PM, Max Kellermann wrote:
Can limit the number of fork()/clone() calls in a cgroup. It is useful as a safeguard against fork bombs.
I do have a couple of questions about this, but the most important one is: Is this a competing implementation, or a cooperative effort with Frederic's?
Signed-off-by: Max Kellermann <mk@xxxxxxxxxx> --- Documentation/cgroups/fork.txt | 30 ++++++ include/linux/cgroup_fork.h | 26 +++++ include/linux/cgroup_subsys.h | 6 + init/Kconfig | 6 + kernel/Makefile | 1 kernel/cgroup_fork.c | 197 ++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 5 + 7 files changed, 271 insertions(+), 0 deletions(-) create mode 100644 Documentation/cgroups/fork.txt create mode 100644 include/linux/cgroup_fork.h create mode 100644 kernel/cgroup_fork.c diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt new file mode 100644 index 0000000..dfbf291 --- /dev/null +++ b/Documentation/cgroups/fork.txt @@ -0,0 +1,30 @@ +The "fork" Controller +--------------------- + +The "fork" controller limits the number of times a new child process +or thread can be created. It maintains a per-group counter which gets +decremented on each fork() / clone(). When the counter reaches zero, +no process in the cgroup is allowed to create new child +processes/threads, even if existing ones quit. + +This has been proven useful in a shared hosting environment. A new +temporary cgroup is created for each CGI process, and the maximum fork +count is configured to a sensible value. Since CGIs are expected to +run for only a short time with predictable resource usage, this may be +an appropriate tool to limit the damage that a freaked CGI can do. + +Initially, the counter is set to -1, which is a magic value for +"disabled" - no limits are imposed on the processes in the group. To +set a new value, type (in the working directory of that control +group): + + echo 16 > fork.remaining + +This example allows 16 forks in the control group. 0 means no +further forks are allowed. The limit may be lowered or increased or +even disabled at any time by a process with write permissions to the +attribute. + +To check if a fork is allowed, the controller walks the cgroup +hierarchy up, and verifies all ancestors. The counter of all +ancestors is decreased. 
diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h new file mode 100644 index 0000000..4ac66b3 --- /dev/null +++ b/include/linux/cgroup_fork.h @@ -0,0 +1,26 @@ +#ifndef _LINUX_CGROUP_FORK_H +#define _LINUX_CGROUP_FORK_H + +#ifdef CONFIG_CGROUP_FORK + +/** + * Checks if another fork is allowed. Call this before creating a new + * child process. + * + * @return 0 on success, a negative errno value if forking should be + * denied + */ +int +cgroup_fork_pre_fork(void); + +#else /* !CONFIG_CGROUP_FORK */ + +static inline int +cgroup_fork_pre_fork(void) +{ + return 0; +} + +#endif /* !CONFIG_CGROUP_FORK */ + +#endif /* !_LINUX_CGROUP_FORK_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ac663c1..e2dbd65 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -64,3 +64,9 @@ SUBSYS(perf) #endif /* */ + +#ifdef CONFIG_CGROUP_FORK +SUBSYS(fork) +#endif + +/* */ diff --git a/init/Kconfig b/init/Kconfig index 31ba0fd..7a2fe2e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -603,6 +603,12 @@ config CGROUP_FREEZER Provides a way to freeze and unfreeze all tasks in a cgroup. +config CGROUP_FORK + bool "fork controller for cgroups" + help + Limits the number of fork() calls in a cgroup. An application + for this is to make a cgroup safe against fork bombs. 
+ config CGROUP_DEVICE bool "Device controller for cgroups" help diff --git a/kernel/Makefile b/kernel/Makefile index e898c5b..2aab192 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c new file mode 100644 index 0000000..e9aa650 --- /dev/null +++ b/kernel/cgroup_fork.c @@ -0,0 +1,197 @@ +/* + * A cgroup implementation which limits the number of fork() calls. + * See Documentation/cgroups/fork.txt for more information. + * + * Copyright 2011 Content Management AG + * Author: Max Kellermann<mk@xxxxxxxxxx> + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include<linux/cgroup.h> +#include<linux/cgroup_fork.h> +#include<linux/slab.h> + +struct cgroup_fork { + struct cgroup_subsys_state css; + + /** protect the "remaining" attribute */ + spinlock_t lock; + + /** + * The remaining number of forks allowed. -1 is the magic + * value for "unlimited". + */ + int remaining; +}; + +/** + * Get the #cgroup_fork instance of the specified #cgroup. + */ +static inline struct cgroup_fork * +cgroup_fork_group(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, fork_subsys_id), + struct cgroup_fork, css); +} + +/** + * Get the #cgroup_fork instance of the specified task. + */ +static inline struct cgroup_fork * +cgroup_fork_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, fork_subsys_id), + struct cgroup_fork, css); +} + +/** + * Get the #cgroup_fork instance of the current task. 
+ */ +static inline struct cgroup_fork * +cgroup_fork_current(void) +{ + return cgroup_fork_task(current); +} + +static __pure int +cgroup_fork_lock_get_remaining(struct cgroup_fork *t) +{ + unsigned remaining; + + spin_lock(&t->lock); + remaining = t->remaining; + spin_unlock(&t->lock); + + return remaining; +} + +static struct cgroup_subsys_state * +cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&t->lock); + + t->remaining = -1; + + return&t->css; +} + +static void +cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + struct cgroup_fork *t = cgroup_fork_group(cgroup); + + kfree(t); +} + +static void +cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct cgroup_fork *t; + + rcu_read_lock(); + + /* decrement the counters in the cgroup and all of its + ancestors (except for the root cgroup) */ + + t = cgroup_fork_current(); + while (t->css.cgroup->parent != NULL) { + spin_lock(&t->lock); + if (t->remaining> 0) + --t->remaining; + spin_unlock(&t->lock); + + t = cgroup_fork_group(t->css.cgroup->parent); + } + + rcu_read_unlock(); +} + +static s64 +cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct cgroup_fork *t = cgroup_fork_group(cgroup); + + return cgroup_fork_lock_get_remaining(t); +} + +static int +cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft, + s64 value) +{ + struct cgroup_fork *t = cgroup_fork_group(cgroup); + + if (value< -1 || value> (1L<< 30)) + return -EINVAL; + + spin_lock(&t->lock); + t->remaining = (int)value; + spin_unlock(&t->lock); + + return 0; +} + +static const struct cftype cgroup_fork_files[] = { + { + .name = "remaining", + .read_s64 = cgroup_fork_remaining_read, + .write_s64 = cgroup_fork_remaining_write, + }, +}; + +static int +cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + if 
(cgroup->parent == NULL) + /* cannot limit the root cgroup */ + return 0; + + return cgroup_add_files(cgroup, ss, cgroup_fork_files, + ARRAY_SIZE(cgroup_fork_files)); +} + +struct cgroup_subsys fork_subsys = { + .name = "fork", + .create = cgroup_fork_create, + .destroy = cgroup_fork_destroy, + .fork = cgroup_fork_fork, + .populate = cgroup_fork_populate, + .subsys_id = fork_subsys_id, +}; + +int +cgroup_fork_pre_fork(void) +{ + struct cgroup_fork *t; + int err = 0; + + if (unlikely(current ==&init_task)) + /* ignore the kernel's fork request while booting; the + cgroup subsystem doesn't get initialized by + INIT_TASK(), so we need this check */ + return err; + + BUG_ON(current->cgroups == NULL); + + rcu_read_lock(); + + t = cgroup_fork_current(); + while (t->css.cgroup->parent != NULL&& err == 0) { + if (unlikely(cgroup_fork_lock_get_remaining(t) == 0)) { + err = -EPERM; + break; + } + + t = cgroup_fork_group(t->css.cgroup->parent); + } + + rcu_read_unlock(); + + return err; +} diff --git a/kernel/fork.c b/kernel/fork.c index 70d7619..c8cba7d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -32,6 +32,7 @@ #include<linux/capability.h> #include<linux/cpu.h> #include<linux/cgroup.h> +#include<linux/cgroup_fork.h> #include<linux/security.h> #include<linux/hugetlb.h> #include<linux/swap.h> @@ -1084,6 +1085,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, current->signal->flags& SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); + retval = cgroup_fork_pre_fork(); + if (retval) + goto fork_out; + retval = security_task_create(clone_flags); if (retval) goto fork_out; _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers
_______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers