[RFC 14/16] padata: Nice helper threads one by one to prevent starvation

Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> · Wed, 5 Jan 2022 19:46:54 -0500

With padata helper threads running at MAX_NICE, it's possible for one or
more of them to begin chunks of the job and then have their CPU time
constrained by higher priority threads.  The main padata thread, running
at normal priority, may finish all available chunks of the job and then
wait on the MAX_NICE helpers to finish the last in-progress chunks, for
longer than it would have if no helpers were used.

Avoid this by having the main thread assign its priority to each
unfinished helper one at a time so that on a heavily loaded system,
exactly one thread in a given padata call is running at the main thread's
priority.  At least one thread to ensure forward progress, and at most
one thread to limit excessive multithreading.

Here are tests like the ones for MAX_NICE, run on the same two-socket
server, but with a couple of differences:
 - The non-padata workload uses 8 CPUs instead of 7 to compete with the
   main padata thread as well as the padata helpers, so that when the main
   thread finishes, its CPU is completely occupied by the non-padata
   workload, meaning MAX_NICE helpers can't run as often.
 - The non-padata workload starts before the padata workload, rather
   than after, to maximize the chance that it interferes with helpers.

Runtimes in seconds.

Case 1: Synthetic, worst-case CPU contention

  padata_test - a tight loop doing integer multiplication to max out CPU;
                used for testing only, does not appear in this series
  stress-ng   - cpu stressor ("-c 8 --cpu-method ackermann --cpu-ops 1200");

            8_padata_thrs          8_padata_thrs
                 w/o_nice  (stdev)     with_nice  (stdev)  1_padata_thr  (stdev)
             ------------------------------------------------------------------
  padata_test       41.98  ( 0.22)         25.15  ( 2.98)        30.40  ( 0.61)
  stress-ng         44.79  ( 1.11)         46.37  ( 0.69)        53.29  ( 1.91)

Without nicing, padata_test finishes just after stress-ng does because
stress-ng needs to free up CPUs for the helpers to finish (padata_test
shows a shorter runtime than stress-ng because padata_test was started
later).  Nicing lets padata_test finish 40% sooner, and running the same
amount of work in padata_test with 1 thread instead of 8 takes longer
than "with_nice" because MAX_NICE threads still get some CPU time, and
the effect over 8 threads adds up.

stress-ng's total runtime gets a little longer going from no nicing to
nicing because each niced padata thread takes more CPU time than before
when the helpers were starved.

Competing against just one padata thread, stress-ng's reported walltime
goes up because that single thread interferes with fewer stress-ng
threads, but with more impact, causing a greater spread in the time it
takes for individual stress-ng threads to finish.  Averages of the
per-thread stress-ng times from "with_nice" to "1_padata_thr" come out
roughly the same, though, 43.81 and 43.89 respectively.  So the total
runtime of stress-ng across all threads is unaffected, but the time
stress-ng takes to finish running its threads completely actually
improves by spreading the padata_test work over more threads.

Case 2: Real-world CPU contention

  padata_vfio - VFIO page pin a 32G kvm guest
  usemem      - faults in 86G of anonymous THP per thread, PAGE_SIZE stride;
                used to mimic the page clearing that dominates in padata_vfio
                so that usemem competes for the same system resources

            8_padata_thrs          8_padata_thrs
                 w/o_nice  (stdev)     with_nice  (stdev)  1_padata_thr (stdev)
             ------------------------------------------------------------------
  padata_vfio       18.59  ( 0.19)         14.62  ( 2.03)        16.24  ( 0.90)
      usemem        47.54  ( 0.89)         48.18  ( 0.77)        49.70  ( 1.20)

These results are similar to case 1's, though the differences between
times are not quite as pronounced because padata_vfio ran shorter
compared to usemem.

Signed-off-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx>
---
 kernel/padata.c | 106 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 73 insertions(+), 33 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index 83e86724b3e1..52f670a5d6d9 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -40,10 +40,17 @@
 #include <linux/sysfs.h>
 #include <linux/rcupdate.h>
 
+enum padata_work_flags {
+	PADATA_WORK_FINISHED	= 1,
+	PADATA_WORK_UNDO	= 2,
+};
+
 struct padata_work {
 	struct work_struct	pw_work;
 	struct list_head	pw_list;  /* padata_free_works linkage */
+	enum padata_work_flags	pw_flags;
 	void			*pw_data;
+	struct task_struct	*pw_task;
 	/* holds job units from padata_mt_job::start to pw_error_start */
 	unsigned long		pw_error_offset;
 	unsigned long		pw_error_start;
@@ -58,9 +65,8 @@ static LIST_HEAD(padata_free_works);
 struct padata_mt_job_state {
 	spinlock_t		lock;
 	struct completion	completion;
+	struct task_struct	*niced_task;
 	struct padata_mt_job	*job;
-	int			nworks;
-	int			nworks_fini;
 	int			error; /* first error from thread_fn */
 	unsigned long		chunk_size;
 	unsigned long		position;
@@ -451,12 +457,44 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
 	return err;
 }
 
+static void padata_wait_for_helpers(struct padata_mt_job_state *ps,
+				    struct list_head *unfinished_works,
+				    struct list_head *finished_works)
+{
+	struct padata_work *work;
+
+	if (list_empty(unfinished_works))
+		return;
+
+	spin_lock(&ps->lock);
+	while (!list_empty(unfinished_works)) {
+		work = list_first_entry(unfinished_works, struct padata_work,
+					pw_list);
+		if (!(work->pw_flags & PADATA_WORK_FINISHED)) {
+			set_user_nice(work->pw_task, task_nice(current));
+			ps->niced_task = work->pw_task;
+			spin_unlock(&ps->lock);
+
+			wait_for_completion(&ps->completion);
+
+			spin_lock(&ps->lock);
+			WARN_ON_ONCE(!(work->pw_flags & PADATA_WORK_FINISHED));
+		}
+		/*
+		 * Leave works used in padata_undo() on ps->failed_works.
+		 * padata_undo() will move them to finished_works.
+		 */
+		if (!(work->pw_flags & PADATA_WORK_UNDO))
+			list_move(&work->pw_list, finished_works);
+	}
+	spin_unlock(&ps->lock);
+}
+
 static int padata_mt_helper(void *__pw)
 {
 	struct padata_work *pw = __pw;
 	struct padata_mt_job_state *ps = pw->pw_data;
 	struct padata_mt_job *job = ps->job;
-	bool done;
 
 	spin_lock(&ps->lock);
 
@@ -488,6 +526,7 @@ static int padata_mt_helper(void *__pw)
 				ps->error = ret;
 			/* Save information about where the job failed. */
 			if (job->undo_fn) {
+				pw->pw_flags |= PADATA_WORK_UNDO;
 				list_move(&pw->pw_list, &ps->failed_works);
 				pw->pw_error_start = position;
 				pw->pw_error_offset = position_offset;
@@ -496,12 +535,10 @@ static int padata_mt_helper(void *__pw)
 		}
 	}
 
-	++ps->nworks_fini;
-	done = (ps->nworks_fini == ps->nworks);
-	spin_unlock(&ps->lock);
-
-	if (done)
+	pw->pw_flags |= PADATA_WORK_FINISHED;
+	if (ps->niced_task == current)
 		complete(&ps->completion);
+	spin_unlock(&ps->lock);
 
 	return 0;
 }
@@ -520,7 +557,7 @@ static int padata_error_cmp(void *unused, const struct list_head *a,
 }
 
 static void padata_undo(struct padata_mt_job_state *ps,
-			struct list_head *works_list,
+			struct list_head *finished_works,
 			struct padata_work *stack_work)
 {
 	struct list_head *failed_works = &ps->failed_works;
@@ -548,11 +585,12 @@ static void padata_undo(struct padata_mt_job_state *ps,
 
 		if (failed_work) {
 			undo_pos = failed_work->pw_error_end;
-			/* main thread's stack_work stays off works_list */
+			/* main thread's stack_work stays off finished_works */
 			if (failed_work == stack_work)
 				list_del(&failed_work->pw_list);
 			else
-				list_move(&failed_work->pw_list, works_list);
+				list_move(&failed_work->pw_list,
+					  finished_works);
 		} else {
 			undo_pos = undo_end;
 		}
@@ -577,16 +615,17 @@ int padata_do_multithreaded_job(struct padata_mt_job *job,
 	struct cgroup_subsys_state *cpu_css;
 	struct padata_work my_work, *pw;
 	struct padata_mt_job_state ps;
-	LIST_HEAD(works);
-	int nworks;
+	LIST_HEAD(unfinished_works);
+	LIST_HEAD(finished_works);
+	int nworks, req;
 
 	if (job->size == 0)
 		return 0;
 
 	/* Ensure at least one thread when size < min_chunk. */
-	nworks = max(job->size / job->min_chunk, 1ul);
-	nworks = min(nworks, job->max_threads);
-	nworks = min(nworks, current->nr_cpus_allowed);
+	req = max(job->size / job->min_chunk, 1ul);
+	req = min(req, job->max_threads);
+	req = min(req, current->nr_cpus_allowed);
 
 #ifdef CONFIG_CGROUP_SCHED
 	/*
@@ -596,23 +635,23 @@ int padata_do_multithreaded_job(struct padata_mt_job *job,
 	 */
 	rcu_read_lock();
 	cpu_css = task_css(current, cpu_cgrp_id);
-	nworks = min(nworks, max_cfs_bandwidth_cpus(cpu_css));
+	req = min(req, max_cfs_bandwidth_cpus(cpu_css));
 	rcu_read_unlock();
 #endif
 
-	if (nworks == 1) {
+	if (req == 1) {
 		/* Single thread, no coordination needed, cut to the chase. */
 		return job->thread_fn(job->start, job->start + job->size,
 				      job->fn_arg);
 	}
 
+	nworks = padata_work_alloc_mt(req, &unfinished_works);
+
 	spin_lock_init(&ps.lock);
 	init_completion(&ps.completion);
 	lockdep_init_map(&ps.lockdep_map, map_name, key, 0);
 	INIT_LIST_HEAD(&ps.failed_works);
 	ps.job		  = job;
-	ps.nworks	  = padata_work_alloc_mt(nworks, &works);
-	ps.nworks_fini	  = 0;
 	ps.error	  = 0;
 	ps.position	  = job->start;
 	ps.remaining_size = job->size;
@@ -623,41 +662,42 @@ int padata_do_multithreaded_job(struct padata_mt_job *job,
 	 * increasing the number of chunks, guarantee at least the minimum
 	 * chunk size from the caller, and honor the caller's alignment.
 	 */
-	ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
+	ps.chunk_size = job->size / (nworks * load_balance_factor);
 	ps.chunk_size = max(ps.chunk_size, job->min_chunk);
 	ps.chunk_size = roundup(ps.chunk_size, job->align);
 
 	lock_map_acquire(&ps.lockdep_map);
 	lock_map_release(&ps.lockdep_map);
 
-	list_for_each_entry(pw, &works, pw_list) {
-		struct task_struct *task;
-
+	list_for_each_entry(pw, &unfinished_works, pw_list) {
 		pw->pw_data = &ps;
-		task = kthread_create(padata_mt_helper, pw, "padata");
-		if (IS_ERR(task)) {
-			--ps.nworks;
+		pw->pw_task = kthread_create(padata_mt_helper, pw, "padata");
+		if (IS_ERR(pw->pw_task)) {
+			pw->pw_flags = PADATA_WORK_FINISHED;
 		} else {
 			/* Helper threads shouldn't disturb other workloads. */
-			set_user_nice(task, MAX_NICE);
-			kthread_bind_mask(task, current->cpus_ptr);
+			set_user_nice(pw->pw_task, MAX_NICE);
+
+			pw->pw_flags = 0;
+			kthread_bind_mask(pw->pw_task, current->cpus_ptr);
 
-			wake_up_process(task);
+			wake_up_process(pw->pw_task);
 		}
 	}
 
 	/* Use the current task, which saves starting a kthread. */
 	my_work.pw_data = &ps;
+	my_work.pw_flags = 0;
 	INIT_LIST_HEAD(&my_work.pw_list);
 	padata_mt_helper(&my_work);
 
 	/* Wait for all the helpers to finish. */
-	wait_for_completion(&ps.completion);
+	padata_wait_for_helpers(&ps, &unfinished_works, &finished_works);
 
 	if (ps.error && job->undo_fn)
-		padata_undo(&ps, &works, &my_work);
+		padata_undo(&ps, &finished_works, &my_work);
 
-	padata_works_free(&works);
+	padata_works_free(&finished_works);
 	return ps.error;
 }
 
-- 
2.34.1