Re: [PATCH] new cgroup controller "fork"

Glauber Costa <glommer@xxxxxxxxxxxxx> · Thu, 3 Nov 2011 14:43:17 -0200

On 11/03/2011 02:22 PM, Max Kellermann wrote:
Can limit the number of fork()/clone() calls in a cgroup.  It is
useful as a safeguard against fork bombs.

I do have a couple of questions about this, but the most important one 
is: Is this a competing implementation, or a cooperative effort with 
Frederic's ?

Signed-off-by: Max Kellermann<mk@xxxxxxxxxx>
---
  Documentation/cgroups/fork.txt |   30 ++++++
  include/linux/cgroup_fork.h    |   26 +++++
  include/linux/cgroup_subsys.h  |    6 +
  init/Kconfig                   |    6 +
  kernel/Makefile                |    1
  kernel/cgroup_fork.c           |  197 ++++++++++++++++++++++++++++++++++++++++
  kernel/fork.c                  |    5 +
  7 files changed, 271 insertions(+), 0 deletions(-)
  create mode 100644 Documentation/cgroups/fork.txt
  create mode 100644 include/linux/cgroup_fork.h
  create mode 100644 kernel/cgroup_fork.c

diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
new file mode 100644
index 0000000..dfbf291
--- /dev/null
+++ b/Documentation/cgroups/fork.txt
@@ -0,0 +1,30 @@
+The "fork" Controller
+---------------------
+
+The "fork" controller limits the number of times a new child process
+or thread can be created.  It maintains a per-group counter which gets
+decremented on each fork() / clone().  When the counter reaches zero,
+no process in the cgroup is allowed to create new child
+processes/threads, even if existing ones quit.
+
+This has been proven useful in a shared hosting environment.  A new
+temporary cgroup is created for each CGI process, and the maximum fork
+count is configured to a sensible value.  Since CGIs are expected to
+run for only a short time with predictable resource usage, this may be
+an appropriate tool to limit the damage that a freaked CGI can do.
+
+Initially, the counter is set to -1, which is a magic value for
+"disabled" - no limits are imposed on the processes in the group.  To
+set a new value, type (in the working directory of that control
+group):
+
+ echo 16>  fork.remaining
+
+This examples allows 16 forks in the control group.  0 means no
+further forks are allowed.  The limit may be lowered or increased or
+even disabled at any time by a process with write permissions to the
+attribute.
+
+To check if a fork is allowed, the controller walks the cgroup
+hierarchy up, and verifies all ancestors.  The counter of all
+ancestors is decreased.
diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
new file mode 100644
index 0000000..4ac66b3
--- /dev/null
+++ b/include/linux/cgroup_fork.h
@@ -0,0 +1,26 @@
+#ifndef _LINUX_CGROUP_FORK_H
+#define _LINUX_CGROUP_FORK_H
+
+#ifdef CONFIG_CGROUP_FORK
+
+/**
+ * Checks if another fork is allowed.  Call this before creating a new
+ * child process.
+ *
+ * @return 0 on success, a negative errno value if forking should be
+ * denied
+ */
+int
+cgroup_fork_pre_fork(void);
+
+#else /* !CONFIG_CGROUP_FORK */
+
+static inline int
+cgroup_fork_pre_fork(void)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_CGROUP_FORK */
+
+#endif /* !_LINUX_CGROUP_FORK_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..e2dbd65 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -64,3 +64,9 @@ SUBSYS(perf)
  #endif

  /* */
+
+#ifdef CONFIG_CGROUP_FORK
+SUBSYS(fork)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 31ba0fd..7a2fe2e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -603,6 +603,12 @@ config CGROUP_FREEZER
  	  Provides a way to freeze and unfreeze all tasks in a
  	  cgroup.

+config CGROUP_FORK
+	bool "fork controller for cgroups"
+	help
+	  Limits the number of fork() calls in a cgroup.  An application
+	  for this is to make a cgroup safe against fork bombs.
+
  config CGROUP_DEVICE
  	bool "Device controller for cgroups"
  	help
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b..2aab192 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
  obj-$(CONFIG_COMPAT) += compat.o
  obj-$(CONFIG_CGROUPS) += cgroup.o
  obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
  obj-$(CONFIG_CPUSETS) += cpuset.o
  obj-$(CONFIG_UTS_NS) += utsname.o
  obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
new file mode 100644
index 0000000..e9aa650
--- /dev/null
+++ b/kernel/cgroup_fork.c
@@ -0,0 +1,197 @@
+/*
+ * A cgroup implementation which limits the number of fork() calls.
+ * See Documentation/cgroups/fork.txt for more information.
+ *
+ * Copyright 2011 Content Management AG
+ * Author: Max Kellermann<mk@xxxxxxxxxx>
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include<linux/cgroup.h>
+#include<linux/cgroup_fork.h>
+#include<linux/slab.h>
+
+struct cgroup_fork {
+	struct cgroup_subsys_state css;
+
+	/** protect the "remaining" attribute */
+	spinlock_t lock;
+
+	/**
+	 * The remaining number of forks allowed.  -1 is the magic
+	 * value for "unlimited".
+	 */
+	int remaining;
+};
+
+/**
+ * Get the #cgroup_fork instance of the specified #cgroup.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_group(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the specified task.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the current task.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_current(void)
+{
+	return cgroup_fork_task(current);
+}
+
+static __pure int
+cgroup_fork_lock_get_remaining(struct cgroup_fork *t)
+{
+	unsigned remaining;
+
+	spin_lock(&t->lock);
+	remaining = t->remaining;
+	spin_unlock(&t->lock);
+
+	return remaining;
+}
+
+static struct cgroup_subsys_state *
+cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&t->lock);
+
+	t->remaining = -1;
+
+	return&t->css;
+}
+
+static void
+cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	kfree(t);
+}
+
+static void
+cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	struct cgroup_fork *t;
+
+	rcu_read_lock();
+
+	/* decrement the counters in the cgroup and all of its
+	   ancestors (except for the root cgroup) */
+
+	t = cgroup_fork_current();
+	while (t->css.cgroup->parent != NULL) {
+		spin_lock(&t->lock);
+		if (t->remaining>  0)
+			--t->remaining;
+		spin_unlock(&t->lock);
+
+		t = cgroup_fork_group(t->css.cgroup->parent);
+	}
+
+	rcu_read_unlock();
+}
+
+static s64
+cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	return cgroup_fork_lock_get_remaining(t);
+}
+
+static int
+cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
+			    s64 value)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	if (value<  -1 || value>  (1L<<  30))
+		return -EINVAL;
+
+	spin_lock(&t->lock);
+	t->remaining = (int)value;
+	spin_unlock(&t->lock);
+
+	return 0;
+}
+
+static const struct cftype cgroup_fork_files[] =  {
+	{
+		.name = "remaining",
+		.read_s64 = cgroup_fork_remaining_read,
+		.write_s64 = cgroup_fork_remaining_write,
+	},
+};
+
+static int
+cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	if (cgroup->parent == NULL)
+		/* cannot limit the root cgroup */
+		return 0;
+
+	return cgroup_add_files(cgroup, ss, cgroup_fork_files,
+				ARRAY_SIZE(cgroup_fork_files));
+}
+
+struct cgroup_subsys fork_subsys = {
+	.name = "fork",
+	.create = cgroup_fork_create,
+	.destroy = cgroup_fork_destroy,
+	.fork = cgroup_fork_fork,
+	.populate = cgroup_fork_populate,
+	.subsys_id = fork_subsys_id,
+};
+
+int
+cgroup_fork_pre_fork(void)
+{
+	struct cgroup_fork *t;
+	int err = 0;
+
+	if (unlikely(current ==&init_task))
+		/* ignore the kernel's fork request while booting; the
+		   cgroup subsystem doesn't get initialized by
+		   INIT_TASK(), so we need this check */
+		return err;
+
+	BUG_ON(current->cgroups == NULL);
+
+	rcu_read_lock();
+
+	t = cgroup_fork_current();
+	while (t->css.cgroup->parent != NULL&&  err == 0) {
+		if (unlikely(cgroup_fork_lock_get_remaining(t) == 0)) {
+			err = -EPERM;
+			break;
+		}
+
+		t = cgroup_fork_group(t->css.cgroup->parent);
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 70d7619..c8cba7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
  #include<linux/capability.h>
  #include<linux/cpu.h>
  #include<linux/cgroup.h>
+#include<linux/cgroup_fork.h>
  #include<linux/security.h>
  #include<linux/hugetlb.h>
  #include<linux/swap.h>
@@ -1084,6 +1085,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  				current->signal->flags&  SIGNAL_UNKILLABLE)
  		return ERR_PTR(-EINVAL);

+	retval = cgroup_fork_pre_fork();
+	if (retval)
+		goto fork_out;
+
  	retval = security_task_create(clone_flags);
  	if (retval)
  		goto fork_out;

_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/containers

_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/containers