Helpers are currently not bound by the main thread's CFS bandwidth limits because they're kernel threads that run on the root runqueues, so a multithreaded job could cause a task group to consume more quota than it's configured for.

As a starting point for making helpers honor these limits, restrict a job to at most as many helpers as the number of CPUs those limits allow. Helpers are generally CPU-bound, so starting more helpers than this would likely exceed the group's entire quota. The maximum number of CFS bandwidth CPUs is calculated conservatively with integer division (quota / period).

This restriction ignores other tasks in the group that might also be consuming quota, so it doesn't strictly prevent a group from exceeding its limits. However, this may be the right tradeoff between simplicity and strictly correct resource control, given that VFIO page pinning typically happens during guest initialization, when there's not much other CPU activity in the group. There's also a prototype for a fully correct approach later in the series, should that be preferred.
Signed-off-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> --- include/linux/sched/cgroup.h | 21 +++++++++++++++++++++ kernel/padata.c | 15 +++++++++++++++ kernel/sched/core.c | 19 +++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 include/linux/sched/cgroup.h diff --git a/include/linux/sched/cgroup.h b/include/linux/sched/cgroup.h new file mode 100644 index 000000000000..f89d92e9e015 --- /dev/null +++ b/include/linux/sched/cgroup.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CGROUP_H +#define _LINUX_SCHED_CGROUP_H + +#include <linux/cgroup-defs.h> +#include <linux/cpumask.h> + +#ifdef CONFIG_CFS_BANDWIDTH + +int max_cfs_bandwidth_cpus(struct cgroup_subsys_state *css); + +#else /* CONFIG_CFS_BANDWIDTH */ + +static inline int max_cfs_bandwidth_cpus(struct cgroup_subsys_state *css) +{ + return nr_cpu_ids; +} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +#endif /* _LINUX_SCHED_CGROUP_H */ diff --git a/kernel/padata.c b/kernel/padata.c index e27988d3e9ed..ef6589a6b665 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -24,6 +24,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include <linux/cgroup.h> #include <linux/completion.h> #include <linux/export.h> #include <linux/cpumask.h> @@ -34,6 +35,7 @@ #include <linux/padata.h> #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/sched/cgroup.h> #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/rcupdate.h> @@ -572,6 +574,7 @@ int padata_do_multithreaded_job(struct padata_mt_job *job, { /* In case threads finish at different times. 
*/ static const unsigned long load_balance_factor = 4; + struct cgroup_subsys_state *cpu_css; struct padata_work my_work, *pw; struct padata_mt_job_state ps; LIST_HEAD(works); @@ -585,6 +588,18 @@ int padata_do_multithreaded_job(struct padata_mt_job *job, nworks = min(nworks, job->max_threads); nworks = min(nworks, current->nr_cpus_allowed); +#ifdef CONFIG_CGROUP_SCHED + /* + * Cap threads at the max number of CPUs current's CFS bandwidth + * settings allow. Keep it simple, don't try to keep this value up to + * date. The ifdef guards cpu_cgrp_id. + */ + rcu_read_lock(); + cpu_css = task_css(current, cpu_cgrp_id); + nworks = min(nworks, max_cfs_bandwidth_cpus(cpu_css)); + rcu_read_unlock(); +#endif + if (nworks == 1) { /* Single thread, no coordination needed, cut to the chase. */ return job->thread_fn(job->start, job->start + job->size, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3b27c6c5153..848c9fec8006 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10021,6 +10021,25 @@ static long tg_get_cfs_burst(struct task_group *tg) return burst_us; } +/* Returns the max whole number of CPUs that @css's bandwidth settings allow. */ +int max_cfs_bandwidth_cpus(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + u64 quota_us, period_us; + + if (tg == &root_task_group) + return nr_cpu_ids; + + quota_us = tg_get_cfs_quota(tg); + + if (quota_us == RUNTIME_INF) + return nr_cpu_ids; + + period_us = tg_get_cfs_period(tg); + + return quota_us / period_us; +} + static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { -- 2.34.1