[RFC] better visibility into kworkers

Tejun Heo <tj@xxxxxxxxxx> · Wed, 16 May 2018 08:39:56 -0700

Hello,

There can be a lot of workqueue workers and they all show up with the
cryptic kworker/* names making it difficult to understand which is
doing what and how they came to be.

  # ps -ef | grep kworker
  root           4       2  0 Feb25 ?        00:00:00 [kworker/0:0H]
  root           6       2  0 Feb25 ?        00:00:00 [kworker/u112:0]
  root          19       2  0 Feb25 ?        00:00:00 [kworker/1:0H]
  root          25       2  0 Feb25 ?        00:00:00 [kworker/2:0H]
  root          31       2  0 Feb25 ?        00:00:00 [kworker/3:0H]
  ...

On Linus's nudging, I made kworkers report what it is and was doing so
that it reports which workqueue it is executing or has executed the
last time (surrounded by <>) through /proc/$PID/cmdline.

  # ps -ef | grep kworker
  root         4     2  0 08:10 ?        00:00:00 kworker/0:0 <events>
  root         5     2  0 08:10 ?        00:00:00 [kworker/0:0H]
  root         6     2  0 08:10 ?        00:00:00 kworker/u32:0 <events_unbound>
  root        19     2  0 08:10 ?        00:00:00 kworker/1:0 <events>
  root        20     2  0 08:10 ?        00:00:00 kworker/1:0H <kblockd>
  root        25     2  0 08:10 ?        00:00:00 kworker/2:0 <events_power_efficient>
  ...

You can see that, out of the six kworkers listed, five are currently
idle but showing their last workqueue, which should help understanding
the situation when the number of kworkers becomes unusually high and
the information can't get too stale as idle kworkers expire after a
while.  The second one is created during boot and hasn't executed
anything yet, so it's not reporting any workqueue.

However, please note the [] around kowrker/0:0H.  It turns out ps uses
empty /proc/$PID/cmdline to detect kernel threads and puts [] around
them.  Now that kworkers are reporting non-empty cmdline, ps thinks
they aren't kthreads and skips [].

Being mis-represented in ps output is one thing but the behavior isn't
restricted to ps.  For example, with the patch applied, systemd stalls
several minutes during shutdown trying to kill the kernel threads.

  [  935.128029] systemd-shutdown[1]: Syncing filesystems and block devices.
  [  936.339388] systemd-shutdown[1]: Sending SIGTERM to remaining processes...
  [  936.354619] systemd-journald[1043]: Received SIGTERM from PID 1 (systemd-shutdow).
  [ 1026.354890] systemd-shutdown[1]: Sending SIGKILL to remaining processes...
  [ 1026.356955] systemd-shutdown[1]: Sending SIGKILL to PID 4 (kworker/0:0).
  [ 1026.358546] systemd-shutdown[1]: Sending SIGKILL to PID 19 (kworker/1:0).
  [ 1026.359356] systemd-shutdown[1]: Sending SIGKILL to PID 20 (kworker/1:0H).
  [ 1026.360405] systemd-shutdown[1]: Sending SIGKILL to PID 25 (kworker/2:0).
  [ 1026.361513] systemd-shutdown[1]: Sending SIGKILL to PID 31 (kworker/3:0).
  [ 1026.362349] systemd-shutdown[1]: Sending SIGKILL to PID 32 (kworker/3:0H).
  [ 1026.363401] systemd-shutdown[1]: Sending SIGKILL to PID 37 (kworker/4:0).
  [ 1026.364506] systemd-shutdown[1]: Sending SIGKILL to PID 43 (kworker/5:0).
  [ 1026.365392] systemd-shutdown[1]: Sending SIGKILL to PID 44 (kworker/5:0H).
  [ 1116.381015] systemd-shutdown[1]: Unmounting file systems.

So...

1. It doesn't look we can use cmdline to report more info about
   kworkers as it makes userland think they aren't kernel threads
   anymore and misbehave.

2. comm has always been limited to 16 chars including the termination
   (TASK_COMM_LEN).  We can easily report the above kworker
   information through comm but there is some minute chance that
   userland is depending on TASK_COMM_LEN.

3. Userland tools using empty cmdline as an indicator for kernel
   threads seems potentially dangerous depending on what the
   distinction is used for.  From the looks of it, any process should
   be able to clear out their cmdline using prctl.

I'll see how #2 works but if anyone has any comments and/or ideas,
please let me know.

Thanks.

Index: work/kernel/workqueue.c
===================================================================

--- work.orig/kernel/workqueue.c
+++ work/kernel/workqueue.c
@@ -66,7 +66,7 @@ enum {
 	 * be executing on any CPU.  The pool behaves as an unbound one.
 	 *
 	 * Note that DISASSOCIATED should be flipped only while holding
-	 * attach_mutex to avoid changing binding state while
+	 * wq_pool_attach_mutex to avoid changing binding state while
 	 * worker_attach_to_pool() is in progress.
 	 */
 	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
@@ -123,7 +123,7 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
- * A: pool->attach_mutex protected.
+ * A: wq_pool_attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
  *
@@ -166,7 +166,6 @@ struct worker_pool {
 						/* L: hash of busy workers */
 
 	struct worker		*manager;	/* L: purely informational */
-	struct mutex		attach_mutex;	/* attach/detach exclusion */
 	struct list_head	workers;	/* A: attached workers */
 	struct completion	*detach_completion; /* all workers detached */
 
@@ -297,6 +296,7 @@ static bool wq_numa_enabled;		/* unbound
 static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
 
 static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
+static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
 static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
 
@@ -399,14 +399,14 @@ static void workqueue_sysfs_unregister(s
  * @worker: iteration cursor
  * @pool: worker_pool to iterate workers of
  *
- * This must be called with @pool->attach_mutex.
+ * This must be called with wq_pool_attach_mutex.
  *
  * The if/else clause exists only for the lockdep assertion and can be
  * ignored.
  */
 #define for_each_pool_worker(worker, pool)				\
 	list_for_each_entry((worker), &(pool)->workers, node)		\
-		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
+		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
 		else
 
 /**
@@ -1724,7 +1724,7 @@ static struct worker *alloc_worker(int n
 static void worker_attach_to_pool(struct worker *worker,
 				   struct worker_pool *pool)
 {
-	mutex_lock(&pool->attach_mutex);
+	mutex_lock(&wq_pool_attach_mutex);
 
 	/*
 	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
@@ -1733,37 +1733,40 @@ static void worker_attach_to_pool(struct
 	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
 	/*
-	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
-	 * stable across this function.  See the comments above the
-	 * flag definition for details.
+	 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
+	 * stable across this function.  See the comments above the flag
+	 * definition for details.
 	 */
 	if (pool->flags & POOL_DISASSOCIATED)
 		worker->flags |= WORKER_UNBOUND;
 
 	list_add_tail(&worker->node, &pool->workers);
+	worker->pool = pool;
 
-	mutex_unlock(&pool->attach_mutex);
+	mutex_unlock(&wq_pool_attach_mutex);
 }
 
 /**
  * worker_detach_from_pool() - detach a worker from its pool
  * @worker: worker which is attached to its pool
- * @pool: the pool @worker is attached to
  *
  * Undo the attaching which had been done in worker_attach_to_pool().  The
  * caller worker shouldn't access to the pool after detached except it has
  * other reference to the pool.
  */
-static void worker_detach_from_pool(struct worker *worker,
-				    struct worker_pool *pool)
+static void worker_detach_from_pool(struct worker *worker)
 {
+	struct worker_pool *pool = worker->pool;
 	struct completion *detach_completion = NULL;
 
-	mutex_lock(&pool->attach_mutex);
+	mutex_lock(&wq_pool_attach_mutex);
+
 	list_del(&worker->node);
+	worker->pool = NULL;
+
 	if (list_empty(&pool->workers))
 		detach_completion = pool->detach_completion;
-	mutex_unlock(&pool->attach_mutex);
+	mutex_unlock(&wq_pool_attach_mutex);
 
 	/* clear leftover flags without pool->lock after it is detached */
 	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
@@ -1799,7 +1802,6 @@ static struct worker *create_worker(stru
 	if (!worker)
 		goto fail;
 
-	worker->pool = pool;
 	worker->id = id;
 
 	if (pool->cpu >= 0)
@@ -2086,6 +2088,12 @@ __acquires(&pool->lock)
 	worker->current_pwq = pwq;
 	work_color = get_work_color(work);
 
+	/*
+	 * Record wq name for cmdline and debug reporting, may get
+	 * overridden through set_worker_desc().
+	 */
+	strlcpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
+
 	list_del_init(&work->entry);
 
 	/*
@@ -2181,7 +2189,6 @@ __acquires(&pool->lock)
 	worker->current_work = NULL;
 	worker->current_func = NULL;
 	worker->current_pwq = NULL;
-	worker->desc_valid = false;
 	pwq_dec_nr_in_flight(pwq, work_color);
 }
 
@@ -2236,7 +2243,7 @@ woke_up:
 
 		set_task_comm(worker->task, "kworker/dying");
 		ida_simple_remove(&pool->worker_ida, worker->id);
-		worker_detach_from_pool(worker, pool);
+		worker_detach_from_pool(worker);
 		kfree(worker);
 		return 0;
 	}
@@ -2367,7 +2374,6 @@ repeat:
 		worker_attach_to_pool(rescuer, pool);
 
 		spin_lock_irq(&pool->lock);
-		rescuer->pool = pool;
 
 		/*
 		 * Slurp in all works issued via this workqueue and
@@ -2417,10 +2423,9 @@ repeat:
 		if (need_more_worker(pool))
 			wake_up_worker(pool);
 
-		rescuer->pool = NULL;
 		spin_unlock_irq(&pool->lock);
 
-		worker_detach_from_pool(rescuer, pool);
+		worker_detach_from_pool(rescuer);
 
 		spin_lock_irq(&wq_mayday_lock);
 	}
@@ -3271,7 +3276,6 @@ static int init_worker_pool(struct worke
 
 	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
-	mutex_init(&pool->attach_mutex);
 	INIT_LIST_HEAD(&pool->workers);
 
 	ida_init(&pool->worker_ida);
@@ -3354,10 +3358,10 @@ static void put_unbound_pool(struct work
 	WARN_ON(pool->nr_workers || pool->nr_idle);
 	spin_unlock_irq(&pool->lock);
 
-	mutex_lock(&pool->attach_mutex);
+	mutex_lock(&wq_pool_attach_mutex);
 	if (!list_empty(&pool->workers))
 		pool->detach_completion = &detach_completion;
-	mutex_unlock(&pool->attach_mutex);
+	mutex_unlock(&wq_pool_attach_mutex);
 
 	if (pool->detach_completion)
 		wait_for_completion(pool->detach_completion);
@@ -4347,7 +4351,6 @@ void set_worker_desc(const char *fmt, ..
 		va_start(args, fmt);
 		vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
 		va_end(args);
-		worker->desc_valid = true;
 	}
 }
 
@@ -4369,9 +4372,9 @@ void print_worker_info(const char *log_l
 	work_func_t *fn = NULL;
 	char name[WQ_NAME_LEN] = { };
 	char desc[WORKER_DESC_LEN] = { };
+	struct work_struct *work = NULL;
 	struct pool_workqueue *pwq = NULL;
 	struct workqueue_struct *wq = NULL;
-	bool desc_valid = false;
 	struct worker *worker;
 
 	if (!(task->flags & PF_WQ_WORKER))
@@ -4383,6 +4386,11 @@ void print_worker_info(const char *log_l
 	 */
 	worker = kthread_probe_data(task);
 
+	/* skip idle workers */
+	probe_kernel_read(&work, &worker->current_work, sizeof(work));
+	if (!work)
+		return;
+
 	/*
 	 * Carefully copy the associated workqueue's workfn and name.  Keep
 	 * the original last '\0' in case the original contains garbage.
@@ -4393,16 +4401,9 @@ void print_worker_info(const char *log_l
 	probe_kernel_read(name, wq->name, sizeof(name) - 1);
 
 	/* copy worker description */
-	probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
-	if (desc_valid)
-		probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
-
-	if (fn || name[0] || desc[0]) {
-		printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
-		if (desc[0])
-			pr_cont(" (%s)", desc);
-		pr_cont("\n");
-	}
+	probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+
+	printk("%sWorkqueue: %s %pf (%s)\n", log_lvl, name, fn, desc);
 }
 
 static void pr_cont_pool_info(struct worker_pool *pool)
@@ -4579,6 +4580,57 @@ void show_workqueue_state(void)
 	rcu_read_unlock_sched();
 }
 
+ssize_t wq_worker_cmdline(struct task_struct *task, char __user *buf,
+			  size_t count, loff_t *pos)
+{
+	struct worker *worker;
+	struct worker_pool *pool;
+	char *page;
+	size_t len = 0;
+	ssize_t ret;
+
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	mutex_lock(&wq_pool_attach_mutex);
+
+	worker = kthread_data(task);
+	pool = worker->pool;
+
+	if (pool) {
+		spin_lock_irq(&pool->lock);
+		if (worker->desc[0] != '\0') {
+			if (worker->current_work)
+				len = snprintf(page, PAGE_SIZE, "%s %s",
+					       worker->task->comm, worker->desc);
+			else
+				len = snprintf(page, PAGE_SIZE, "%s <%s>",
+					       worker->task->comm, worker->desc);
+		}
+		spin_unlock_irq(&pool->lock);
+	}
+
+	mutex_unlock(&wq_pool_attach_mutex);
+
+	if (len < *pos) {
+		ret = 0;
+		goto out_free;
+	}
+
+	len = min_t(size_t, len - *pos, count);
+	if (copy_to_user(buf, page + *pos, len)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	ret = len;
+	*pos += len;
+out_free:
+	kfree(page);
+	return ret;
+}
+
 /*
  * CPU hotplug.
  *
@@ -4600,7 +4652,7 @@ static void unbind_workers(int cpu)
 	struct worker *worker;
 
 	for_each_cpu_worker_pool(pool, cpu) {
-		mutex_lock(&pool->attach_mutex);
+		mutex_lock(&wq_pool_attach_mutex);
 		spin_lock_irq(&pool->lock);
 
 		/*
@@ -4616,7 +4668,7 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		spin_unlock_irq(&pool->lock);
-		mutex_unlock(&pool->attach_mutex);
+		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*
 		 * Call schedule() so that we cross rq->lock and thus can
@@ -4657,7 +4709,7 @@ static void rebind_workers(struct worker
 {
 	struct worker *worker;
 
-	lockdep_assert_held(&pool->attach_mutex);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/*
 	 * Restore CPU affinity of all workers.  As all idle workers should
@@ -4727,7 +4779,7 @@ static void restore_unbound_workers_cpum
 	static cpumask_t cpumask;
 	struct worker *worker;
 
-	lockdep_assert_held(&pool->attach_mutex);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/* is @cpu allowed for @pool? */
 	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4762,14 +4814,14 @@ int workqueue_online_cpu(unsigned int cp
 	mutex_lock(&wq_pool_mutex);
 
 	for_each_pool(pool, pi) {
-		mutex_lock(&pool->attach_mutex);
+		mutex_lock(&wq_pool_attach_mutex);
 
 		if (pool->cpu == cpu)
 			rebind_workers(pool);
 		else if (pool->cpu < 0)
 			restore_unbound_workers_cpumask(pool, cpu);
 
-		mutex_unlock(&pool->attach_mutex);
+		mutex_unlock(&wq_pool_attach_mutex);
 	}
 
 	/* update NUMA affinity of unbound workqueues */
Index: work/kernel/workqueue_internal.h
===================================================================
--- work.orig/kernel/workqueue_internal.h
+++ work/kernel/workqueue_internal.h
@@ -31,7 +31,6 @@ struct worker {
 	struct work_struct	*current_work;	/* L: work being processed */
 	work_func_t		current_func;	/* L: current_work's fn */
 	struct pool_workqueue	*current_pwq; /* L: current_work's pwq */
-	bool			desc_valid;	/* ->desc is valid */
 	struct list_head	scheduled;	/* L: scheduled works */
 
 	/* 64 bytes boundary on 64bit, 32 on 32bit */
Index: work/fs/proc/base.c
===================================================================
--- work.orig/fs/proc/base.c
+++ work/fs/proc/base.c
@@ -223,6 +223,13 @@ static ssize_t proc_pid_cmdline_read(str
 	tsk = get_proc_task(file_inode(file));
 	if (!tsk)
 		return -ESRCH;
+
+	if (tsk->flags & PF_WQ_WORKER) {
+		rv = wq_worker_cmdline(tsk, buf, count, pos);
+		put_task_struct(tsk);
+		return rv;
+	}
+
 	mm = get_task_mm(tsk);
 	put_task_struct(tsk);
 	if (!mm)
Index: work/include/linux/workqueue.h
===================================================================
--- work.orig/include/linux/workqueue.h
+++ work/include/linux/workqueue.h
@@ -494,6 +494,8 @@ extern unsigned int work_busy(struct wor
 extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
 extern void print_worker_info(const char *log_lvl, struct task_struct *task);
 extern void show_workqueue_state(void);
+extern ssize_t wq_worker_cmdline(struct task_struct *task, char __user *buf,
+				 size_t count, loff_t *pos);
 
 /**
  * queue_work - queue work on a workqueue
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html