> @@ -0,0 +1,368 @@
> +/*
> + * Process number limiting controller for cgroups.
> + *
> + * Used to allow a cgroup hierarchy to stop any new processes
> + * from fork()ing after a certain limit is reached.
> + *
> + * Since it is trivial to hit the task limit without hitting
> + * any kmemcg limits in place, PIDs are a fundamental resource.
> + * As such, PID exhaustion must be preventable in the scope of
> + * a cgroup hierarchy by allowing resource limiting of the
> + * number of tasks in a cgroup.
> + *
> + * In order to use the `pids` controller, set the maximum number
> + * of tasks in pids.max (this is not available in the root cgroup
> + * for obvious reasons). The number of processes currently
> + * in the cgroup is given by pids.current. Organisational operations
> + * are not blocked by cgroup policies, so it is possible to have
> + * pids.current > pids.max. However, fork()s will still not work.
> + *
> + * To set a cgroup to have no limit, set pids.max to "max". fork()
> + * will return -EBUSY if forking would cause a cgroup policy to be
> + * violated.
> + *
> + * pids.current tracks all child cgroup hierarchies, so
> + * parent/pids.current is a superset of parent/child/pids.current.
> + *
> + * Copyright (C) 2015 Aleksa Sarai <cyphar@xxxxxxxxxx>

The above text looks wrapped too narrow.

> +struct pids_cgroup {
> +	struct cgroup_subsys_state css;
> +
> +	/*
> +	 * Use 64-bit types so that we can safely represent "max" as
> +	 * (PID_MAX_LIMIT + 1).
	    ^^^^^^^^^^^^^^^^^
...
> +static struct cgroup_subsys_state *
> +pids_css_alloc(struct cgroup_subsys_state *parent)
> +{
> +	struct pids_cgroup *pids;
> +
> +	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
> +	if (!pids)
> +		return ERR_PTR(-ENOMEM);
> +
> +	pids->limit = PIDS_MAX;
		      ^^^^^^^^^
> +	atomic64_set(&pids->counter, 0);
> +	return &pids->css;
> +}
...
> +static void pids_detach(struct cgroup_subsys_state *old_css,
> +			struct task_struct *task)
> +{
> +	struct pids_cgroup *old_pids = css_pids(old_css);
> +
> +	pids_uncharge(old_pids, 1);
> +}

You can do the above as a part of can/cancel.

> +static int pids_can_fork(struct task_struct *task, void **private)

Maybe @priv_p or something which signifies it's of different type
from others?

> +{
...
> +	rcu_read_lock();
> +	css = task_css(current, pids_cgrp_id);
> +	if (!css_tryget_online(css)) {
> +		retval = -EBUSY;
> +		goto err_rcu_unlock;
> +	}
> +	rcu_read_unlock();

Hmmm... so, the above is guaranteed to succeed in a finite amount of
time (the race window is actually very narrow) and it'd be silly to
fail fork because a task was being moved across cgroups. I think it'd
be a good idea to implement task_get_css() which loops and returns the
current css for the requested subsystem with reference count bumped
and it can use css_tryget() too. Holding a ref doesn't prevent css
from dying anyway, so it doesn't make any difference.

> +static void pids_fork(struct task_struct *task, void *private)
> +{
...
> +	rcu_read_lock();
> +	css = task_css(task, pids_cgrp_id);
> +	css_get(css);

Why is this safe? What guarantees that css's ref isn't already zero
at this point?

> +	rcu_read_unlock();
> +
> +	pids = css_pids(css);
> +
> +	/*
> +	 * The association has changed, we have to revert and reapply the
> +	 * charge/uncharge on the wrong hierarchy to the current one. Since
> +	 * the association can only change due to an organisation event, its
> +	 * okay for us to ignore the limit in this case.
> +	 */
> +	if (pids != old_pids) {
> +		pids_uncharge(old_pids, 1);
> +		pids_charge(pids, 1);
> +	}
> +
> +	css_put(css);
> +	css_put(old_css);
> +}
...
> +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
> +			      size_t nbytes, loff_t off)
> +{
> +	struct cgroup_subsys_state *css = of_css(of);
> +	struct pids_cgroup *pids = css_pids(css);
> +	int64_t limit;
> +	int err;
> +
> +	buf = strstrip(buf);
> +	if (!strcmp(buf, PIDS_MAX_STR)) {
> +		limit = PIDS_MAX;
> +		goto set_limit;
> +	}
> +
> +	err = kstrtoll(buf, 0, &limit);
> +	if (err)
> +		return err;
> +
> +	/* We use INT_MAX as the maximum value of pid_t. */
> +	if (limit < 0 || limit > INT_MAX)

This is kinda weird if we're using PIDS_MAX for max as it may end up
showing "max" after some larger number is written to the file.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe cgroups" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html