Re: [PATCH v3 7/7] n_tty: Provide an informational line on VSTATUS receipt

Jiri Slaby <jslaby@xxxxxxx> · Thu, 30 Apr 2020 09:29:28 +0200

General comments:
- care to CC scheduler and mm people?
- couldn't this share some code with fs/proc?
- I am not sure/convinced it is worth the hassle

On 30. 04. 20, 8:43, Arseny Maslennikov wrote:
> If the three termios local flags isig, icanon, iexten are enabled
> and the local flag nokerninfo is disabled for a tty governed
> by the n_tty line discipline, then on receiving the keyboard status
> character n_tty will generate a status message and write it out to
> the tty before sending SIGINFO to the tty's foreground process group.
> 
> This kerninfo line contains information about the current system load
> as well as some properties of "the most interesting" process in the
> tty's current foreground process group, namely:
>  - its PID as seen inside its deepest PID namespace;
>    * the whole process group ought to be in a single PID namespace,
>      so this is actually deterministic
>  - its saved command name truncated to 16 bytes (task_struct::comm);
>    * at the time of writing TASK_COMM_LEN == 16
>  - its state and some related bits, procps-style;
>  - for S and D: its symbolic wait channel, if available; or a short
>    description for other process states instead;
>  - its user, system and real rusage time values;
>  - its resident set size (as well as the high watermark) in kilobytes.
> 
> The "most interesting" process is chosen as follows:
>  - runnables over everything
>  - uninterruptibles over everything else
>  - among 2 runnables pick the biggest utime + stime
>  - any unresolved ties are decided in favour of greatest PID.
> 
> While the kerninfo line is not very useful for debugging the kernel
> itself, since we have much more powerful debugging tools, it still gives
> the user behind the terminal some meaningful feedback to a VSTATUS that
> works even if no processes respond.

Care to append an example output to the commit message?

> Signed-off-by: Arseny Maslennikov <ar@xxxxxxxxx>
...
> index f72a3fd4b..905cdd985 100644
> --- a/drivers/tty/n_tty.c
> +++ b/drivers/tty/n_tty.c
> @@ -79,6 +80,8 @@
>  #define ECHO_BLOCK		256
>  #define ECHO_DISCARD_WATERMARK	N_TTY_BUF_SIZE - (ECHO_BLOCK + 32)
>  
> +#define STATUS_LINE_LEN (2 * KSYM_NAME_LEN)

It's too magic constant.

> @@ -2489,6 +2496,21 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
>  	}
>  }
>  
> +static void n_tty_status_line(struct tty_struct *tty)
> +{
> +	struct n_tty_data *ldata = tty->disc_data;
> +	char *msg, *buf;
> +	msg = buf = kzalloc(STATUS_LINE_LEN, GFP_KERNEL);
> +	tty_sprint_status_line(tty, buf + 1, STATUS_LINE_LEN - 1);
> +	/* The only current caller of this takes output_lock for us. */

So add a lockdep assertion?

> +	if (ldata->column != 0)
> +		*msg = '\n';
> +	else
> +		msg++;
> +	do_n_tty_write(tty, NULL, msg, strlen(msg));
> +	kfree(buf);
> +}
> +
>  static struct tty_ldisc_ops n_tty_ops = {
>  	.magic           = TTY_LDISC_MAGIC,
>  	.name            = "n_tty",
> diff --git a/drivers/tty/n_tty_status.c b/drivers/tty/n_tty_status.c
> new file mode 100644
> index 000000000..d92261bbe
> --- /dev/null
> +++ b/drivers/tty/n_tty_status.c
> @@ -0,0 +1,338 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +
> +#include <linux/kallsyms.h>
> +#include <linux/pid.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/sched/loadavg.h>
> +#include <linux/sched/cputime.h>
> +#include <linux/sched/mm.h>
> +#include <linux/slab.h>
> +#include <linux/tty.h>
> +
> +#define BCOMPARE(lbool, rbool) ((lbool) << 1 | (rbool))
> +#define BCOMPARE_NONE 0
> +#define BCOMPARE_RIGHT 1
> +#define BCOMPARE_LEFT 2
> +#define BCOMPARE_BOTH (BCOMPARE_LEFT | BCOMPARE_RIGHT)
> +
> +/*
> + * Select the most interesting task of two.
> + *
> + * The implemented approach is simple for now:
> + * - pick runnable
> + * - if no runnables, pick uninterruptible
> + * - if tie between runnables, pick highest utime + stime
> + * - if a tie is not broken by the above, pick highest pid nr.
> + *
> + * For reference, here's the one used in FreeBSD:
> + * - pick runnables over anything
> + * - if both runnables, pick highest cpu utilization
> + * - if no runnables, pick shortest sleep time (the scheduler
> + *   actually takes care to maintain this statistic)
> + * - other ties are decided in favour of youngest process.
> + */
> +static struct task_struct *__better_proc_R(struct task_struct *a,

Why so weird name __better_proc_R?

> +		struct task_struct *b)
> +{
> +	unsigned long flags;
> +	u64 atime, btime, tgutime, tgstime;
> +	struct task_struct *ret;
> +
> +	if (!lock_task_sighand(a, &flags))
> +		goto out_a_unlocked;
> +	thread_group_cputime_adjusted(a, &tgutime, &tgstime);
> +	atime = tgutime + tgstime;
> +	unlock_task_sighand(a, &flags);
> +
> +	if (!lock_task_sighand(b, &flags))
> +		goto out_b_unlocked;
> +	thread_group_cputime_adjusted(b, &tgutime, &tgstime);
> +	btime = tgutime + tgstime;
> +	unlock_task_sighand(b, &flags);
> +
> +	ret = atime > btime ? a : b;
> +
> +	return ret;
> +
> +out_b_unlocked:
> +out_a_unlocked:
> +	return task_pid_vinr(a) > task_pid_vinr(b) ? a : b;
> +}
> +
> +static struct task_struct *__better_proc(struct task_struct *a,

Again, why "__" in the name?

> +		struct task_struct *b)
> +{
> +	if (!pid_alive(a))
> +		return b;
> +	if (!pid_alive(b))
> +		return a;
> +
> +	switch (BCOMPARE(a->state == TASK_RUNNING,
> +			b->state == TASK_RUNNING)) {
> +	case BCOMPARE_LEFT:
> +		return a;
> +	case BCOMPARE_RIGHT:
> +		return b;
> +	case BCOMPARE_BOTH:
> +		return __better_proc_R(a, b);
> +	}

Doesn't this look saner and better:

if (a->state == TASK_RUNNING && b->state == TASK_RUNNING)
  return __better_proc_R(a, b);
if (a->state == TASK_RUNNING)
  return a;
if (b->state == TASK_RUNNING)
  return b;

?

And one doesn't need to decrypt the defines.

> +	switch (BCOMPARE(a->state == TASK_UNINTERRUPTIBLE,
> +			b->state == TASK_UNINTERRUPTIBLE)) {
> +	case BCOMPARE_LEFT:
> +		return a;
> +	case BCOMPARE_RIGHT:
> +		return b;
> +	case BCOMPARE_BOTH:
> +		break;

dtto.

> +	}
> +
> +	/* TODO: Perhaps we should check something else... */
> +	return task_pid_vinr(a) > task_pid_vinr(b) ? a : b;
> +}
> +
> +/* Weed out NULLs. */
> +static inline struct task_struct *better_proc(struct task_struct *a,
> +		struct task_struct *b) {
> +	return a ? (b ? __better_proc(a, b) : a) : b;

This urgently calls for ifs and not ternany operators.

Actually you don't need this helper at all. Check a and b in the same if
as the respective !pid_alive above.

> +}
> +
> +static int scnprint_load(char *msgp, size_t size)
> +{
> +	unsigned long la[3];
> +
> +	get_avenrun(la, FIXED_1/200, 0);
> +	return scnprintf(msgp, size, "load: %lu.%02lu; ",
> +			LOAD_INT(la[0]), LOAD_FRAC(la[0]));
> +}
> +
> +static int scnprint_task(char *msgp, size_t size, struct task_struct *task)
> +{
> +	char commname[TASK_COMM_LEN];
> +
> +	get_task_comm(commname, task);
> +	return scnprintf(msgp, size, "%d/%s:", task_pid_vinr(task), commname);
> +}
> +
> +static int scnprint_rusage(char *msgp, ssize_t size,
> +		struct task_struct *task, struct mm_struct *mm)
> +{
> +	struct rusage ru;
> +	struct timespec64 utime, stime;
> +	struct timespec64 rtime;
> +	u64 now;
> +	int ret = 0;
> +	int psz = 0;
> +
> +	getrusage(task, RUSAGE_BOTH, &ru);
> +	now = ktime_get_ns();
> +
> +	utime.tv_sec = ru.ru_utime.tv_sec;
> +	utime.tv_nsec = ru.ru_utime.tv_usec * NSEC_PER_USEC;
> +	stime.tv_sec = ru.ru_stime.tv_sec;
> +	stime.tv_nsec = ru.ru_stime.tv_usec * NSEC_PER_USEC;
> +
> +	rtime = ns_to_timespec64(now - task->start_time);
> +
> +	psz = scnprintf(msgp, size,
> +			"%llu.%03lur %llu.%03luu %llu.%03lus",
> +			rtime.tv_sec, rtime.tv_nsec / NSEC_PER_MSEC,
> +			utime.tv_sec, utime.tv_nsec / NSEC_PER_MSEC,
> +			stime.tv_sec, stime.tv_nsec / NSEC_PER_MSEC);
> +	ret += psz;
> +	msgp += psz;
> +	size -= psz;
> +
> +	if (mm) {
> +		psz = scnprintf(msgp, size,
> +				" %luk/%luk",
> +				get_mm_rss(mm) * PAGE_SIZE / 1024,
> +				get_mm_hiwater_rss(mm) * PAGE_SIZE / 1024);
> +		ret += psz;
> +	}
> +	return ret;
> +}
> +
> +static int scnprint_state(char *msgp, ssize_t size,
> +		struct task_struct *task, struct mm_struct *mm)
> +{
> +	char stat[8] = {0};
> +	const char *state_descr = "";
> +	struct task_struct *parent = NULL;
> +	struct task_struct *real_parent = NULL;
> +	unsigned long wchan = 0;
> +	int psz = 0;
> +	char symname[KSYM_NAME_LEN];
> +
> +	stat[psz++] = task_state_to_char(task);
> +	if (task_nice(task) < 0)
> +		stat[psz++] = '<';
> +	else if (task_nice(task) > 0)
> +		stat[psz++] = 'N';
> +	if (mm && mm->locked_vm)
> +		stat[psz++] = 'L';
> +	if (get_nr_threads(task) > 1)
> +		stat[psz++] = 'l';
> +
> +	switch (stat[0]) {
> +	case 'R':
> +		if (task_curr(task))
> +			stat[psz++] = '!';
> +		break;
> +	case 'S':
> +	case 'D':
> +		wchan = get_wchan(task);
> +		if (!wchan)
> +			break;
> +		if (!lookup_symbol_name(wchan, symname))
> +			state_descr = symname;
> +		else
> +			/* get_wchan() returned something
> +			 * that looks like no kernel symbol.
> +			 */
> +			state_descr = "*unknown*";
> +		break;
> +	case 'T':
> +		state_descr = "stopped";
> +		break;
> +	case 't':
> +		state_descr = "traced";
> +		break;
> +	case 'Z':
> +		rcu_read_lock();
> +		real_parent = rcu_dereference(task->real_parent);
> +		parent = rcu_dereference(task->parent);
> +		psz = sprintf(symname, "zombie; ppid=%d",
> +			task_tgid_nr_ns(real_parent,
> +				ns_of_pid(task_pid(task))));
> +		if (parent != real_parent)
> +			sprintf(symname + psz, " reaper=%d",
> +				task_tgid_nr_ns(parent,
> +					ns_of_pid(task_pid(task))));
> +		rcu_read_unlock();
> +		state_descr = symname;
> +		break;
> +	case 'I':
> +		/* Can this even happen? */
> +		state_descr = "idle";
> +		break;
> +	default:
> +		state_descr = "unknown";
> +	}
> +
> +	psz = scnprintf(msgp, size, "%s", stat);
> +	msgp += psz;
> +	size -= psz;
> +	if (*state_descr)
> +		psz += scnprintf(msgp, size, wchan ? " [%s]" : " (%s)", state_descr);
> +
> +	return psz;
> +}
> +
> +/**
> + *	tty_sprint_status_line	-		produce kerninfo line
> + *	@tty: terminal device
> + *	@msg: preallocated memory buffer
> + *	@length: maximum line length
> + *
> + *	Reports state of foreground process group in a null-terminated string
> + *	located at @msg, @length bytes long. If @length is insufficient,
> + *	the line gets truncated.
> + */
> +void tty_sprint_status_line(struct tty_struct *t, char *msg, size_t length)
> +{
> +	struct task_struct *tsk = NULL, *mit = NULL;
> +	struct mm_struct *mm;
> +	struct pid *pgrp = NULL;
> +	char *msgp = msg;
> +	int psz = 0;
> +
> +	if (!length)
> +		return;
> +	length--; /* Make room for trailing '\n' */
> +
> +	psz = scnprint_load(msgp, length);
> +	if (psz > 0) {
> +		msgp += psz;
> +		length -= psz;
> +	}
> +	if (!length)
> +		goto finalize_message;
> +
> +	/* Not sure if session pid is protected by ctrl_lock
> +	 * or tasklist_lock...
> +	 */
> +	pgrp = t->session;
> +	if (pgrp == NULL) {
> +		psz = scnprintf(msgp, length, "not a controlling tty");
> +		if (psz > 0)
> +			msgp += psz;
> +		goto finalize_message;
> +	}
> +	pgrp = tty_get_pgrp(t);
> +	if (pgrp == NULL) {
> +		psz = scnprintf(msgp, length, "no foreground process group");
> +		if (psz > 0)
> +			msgp += psz;
> +		goto finalize_message;
> +	}
> +
> +	read_lock(&tasklist_lock);
> +	do_each_pid_task(pgrp, PIDTYPE_PGID, tsk)
> +	{
> +		/* Select the most interesting task. */
> +		if (tsk == better_proc(mit, tsk))
> +			mit = tsk;
> +	} while_each_pid_task(pgrp, PIDTYPE_PGID, tsk);
> +	read_unlock(&tasklist_lock);
> +
> +	if (!pid_alive(mit))
> +		goto finalize_message;
> +
> +	/* Gather intel on most interesting task. */
> +	/* Can the mm of a foreground process turn out to be NULL?
> +	 * Definitely; for example, if it is a zombie.
> +	 */
> +	mm = get_task_mm(mit);
> +
> +	psz = scnprint_task(msgp, length, mit);
> +	if (psz > 0) {
> +		msgp += psz;
> +		length -= psz;
> +	}
> +	if (!length)
> +		goto finalize_message;
> +	*msgp++ = ' ';
> +	length--;
> +
> +	psz = scnprint_state(msgp, length, mit, mm);
> +	if (psz > 0) {
> +		msgp += psz;
> +		length -= psz;
> +	}
> +	if (!length)
> +		goto finalize_message;
> +	*msgp++ = ' ';
> +	length--;
> +
> +	psz = scnprint_rusage(msgp, length, mit, mm);
> +	if (psz > 0) {
> +		msgp += psz;
> +		length -= psz;
> +	}
> +	if (!length)
> +		goto finalize_message;
> +	*msgp++ = ' ';
> +	length--;
> +
> +	if (!mm)
> +		goto finalize_message;
> +
> +	mmput(mm);
> +
> +finalize_message:
> +	*msgp++ = '\n';
> +	if (pgrp)
> +		put_pid(pgrp);
> +}
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4418f5cb8..195abd47d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1318,6 +1318,8 @@ static inline struct pid *task_pid(struct task_struct *task)
>   * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
>   * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
>   *                     current.
> + * task_xid_vinr()   : virtual inner id, i.e. the id seen from the namespace of
> + *                     the task itself;
>   * task_xid_nr_ns()  : id seen from the ns specified;
>   *
>   * see also pid_nr() etc in include/linux/pid.h
> @@ -1339,6 +1341,11 @@ static inline pid_t task_pid_vnr(struct task_struct *tsk)
>  	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
>  }
>  
> +static inline pid_t task_pid_vinr(struct task_struct *tsk)
> +{
> +	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns_of_pid(task_pid(tsk)));
> +}
> +

This smells like it should be done in a separate patch.

>  static inline pid_t task_tgid_nr(struct task_struct *tsk)
>  {
> diff --git a/include/linux/tty.h b/include/linux/tty.h
> index 3499845ab..2023addaf 100644
> --- a/include/linux/tty.h
> +++ b/include/linux/tty.h
> @@ -727,6 +727,9 @@ extern void __init n_tty_init(void);
>  static inline void n_tty_init(void) { }
>  #endif
>  
> +/* n_tty_status.c */
> +extern void tty_sprint_status_line(struct tty_struct *tty, char *msg, size_t size);

No need for extern.

thanks,
-- 
js
suse labs