General comments: - care to CC scheduler and mm people? - couldn't this share some code with fs/proc? - I am not sure/convinced it is worth the hassle On 30. 04. 20, 8:43, Arseny Maslennikov wrote: > If the three termios local flags isig, icanon, iexten are enabled > and the local flag nokerninfo is disabled for a tty governed > by the n_tty line discipline, then on receiving the keyboard status > character n_tty will generate a status message and write it out to > the tty before sending SIGINFO to the tty's foreground process group. > > This kerninfo line contains information about the current system load > as well as some properties of "the most interesting" process in the > tty's current foreground process group, namely: > - its PID as seen inside its deepest PID namespace; > * the whole process group ought to be in a single PID namespace, > so this is actually deterministic > - its saved command name truncated to 16 bytes (task_struct::comm); > * at the time of writing TASK_COMM_LEN == 16 > - its state and some related bits, procps-style; > - for S and D: its symbolic wait channel, if available; or a short > description for other process states instead; > - its user, system and real rusage time values; > - its resident set size (as well as the high watermark) in kilobytes. > > The "most interesting" process is chosen as follows: > - runnables over everything > - uninterruptibles over everything else > - among 2 runnables pick the biggest utime + stime > - any unresolved ties are decided in favour of greatest PID. > > While the kerninfo line is not very useful for debugging the kernel > itself, since we have much more powerful debugging tools, it still gives > the user behind the terminal some meaningful feedback to a VSTATUS that > works even if no processes respond. Care to append an example output to the commit message? > Signed-off-by: Arseny Maslennikov <ar@xxxxxxxxx> ... > index f72a3fd4b..905cdd985 100644 > --- a/drivers/tty/n_tty.c > +++ b/drivers/tty/n_tty.c > @@ -79,6 +80,8 @@ > #define ECHO_BLOCK 256 > #define ECHO_DISCARD_WATERMARK N_TTY_BUF_SIZE - (ECHO_BLOCK + 32) > > +#define STATUS_LINE_LEN (2 * KSYM_NAME_LEN) It's too magic constant. > @@ -2489,6 +2496,21 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file, > } > } > > +static void n_tty_status_line(struct tty_struct *tty) > +{ > + struct n_tty_data *ldata = tty->disc_data; > + char *msg, *buf; > + msg = buf = kzalloc(STATUS_LINE_LEN, GFP_KERNEL); > + tty_sprint_status_line(tty, buf + 1, STATUS_LINE_LEN - 1); > + /* The only current caller of this takes output_lock for us. */ So add a lockdep assertion? > + if (ldata->column != 0) > + *msg = '\n'; > + else > + msg++; > + do_n_tty_write(tty, NULL, msg, strlen(msg)); > + kfree(buf); > +} > + > static struct tty_ldisc_ops n_tty_ops = { > .magic = TTY_LDISC_MAGIC, > .name = "n_tty", > diff --git a/drivers/tty/n_tty_status.c b/drivers/tty/n_tty_status.c > new file mode 100644 > index 000000000..d92261bbe > --- /dev/null > +++ b/drivers/tty/n_tty_status.c > @@ -0,0 +1,338 @@ > +// SPDX-License-Identifier: GPL-2.0+ > + > +#include <linux/kallsyms.h> > +#include <linux/pid.h> > +#include <linux/sched.h> > +#include <linux/sched/signal.h> > +#include <linux/sched/loadavg.h> > +#include <linux/sched/cputime.h> > +#include <linux/sched/mm.h> > +#include <linux/slab.h> > +#include <linux/tty.h> > + > +#define BCOMPARE(lbool, rbool) ((lbool) << 1 | (rbool)) > +#define BCOMPARE_NONE 0 > +#define BCOMPARE_RIGHT 1 > +#define BCOMPARE_LEFT 2 > +#define BCOMPARE_BOTH (BCOMPARE_LEFT | BCOMPARE_RIGHT) > + > +/* > + * Select the most interesting task of two. > + * > + * The implemented approach is simple for now: > + * - pick runnable > + * - if no runnables, pick uninterruptible > + * - if tie between runnables, pick highest utime + stime > + * - if a tie is not broken by the above, pick highest pid nr. > + * > + * For reference, here's the one used in FreeBSD: > + * - pick runnables over anything > + * - if both runnables, pick highest cpu utilization > + * - if no runnables, pick shortest sleep time (the scheduler > + * actually takes care to maintain this statistic) > + * - other ties are decided in favour of youngest process. > + */ > +static struct task_struct *__better_proc_R(struct task_struct *a, Why so weird name __better_proc_R? > + struct task_struct *b) > +{ > + unsigned long flags; > + u64 atime, btime, tgutime, tgstime; > + struct task_struct *ret; > + > + if (!lock_task_sighand(a, &flags)) > + goto out_a_unlocked; > + thread_group_cputime_adjusted(a, &tgutime, &tgstime); > + atime = tgutime + tgstime; > + unlock_task_sighand(a, &flags); > + > + if (!lock_task_sighand(b, &flags)) > + goto out_b_unlocked; > + thread_group_cputime_adjusted(b, &tgutime, &tgstime); > + btime = tgutime + tgstime; > + unlock_task_sighand(b, &flags); > + > + ret = atime > btime ? a : b; > + > + return ret; > + > +out_b_unlocked: > +out_a_unlocked: > + return task_pid_vinr(a) > task_pid_vinr(b) ? a : b; > +} > + > +static struct task_struct *__better_proc(struct task_struct *a, Again, why "__" in the name? > + struct task_struct *b) > +{ > + if (!pid_alive(a)) > + return b; > + if (!pid_alive(b)) > + return a; > + > + switch (BCOMPARE(a->state == TASK_RUNNING, > + b->state == TASK_RUNNING)) { > + case BCOMPARE_LEFT: > + return a; > + case BCOMPARE_RIGHT: > + return b; > + case BCOMPARE_BOTH: > + return __better_proc_R(a, b); > + } Doesn't this look saner and better: if (a->state == TASK_RUNNING && b->state == TASK_RUNNING) return __better_proc_R(a, b); if (a->state == TASK_RUNNING) return a; if (b->state == TASK_RUNNING) return b; ? And one doesn't need to decrypt the defines. > + switch (BCOMPARE(a->state == TASK_UNINTERRUPTIBLE, > + b->state == TASK_UNINTERRUPTIBLE)) { > + case BCOMPARE_LEFT: > + return a; > + case BCOMPARE_RIGHT: > + return b; > + case BCOMPARE_BOTH: > + break; dtto. > + } > + > + /* TODO: Perhaps we should check something else... */ > + return task_pid_vinr(a) > task_pid_vinr(b) ? a : b; > +} > + > +/* Weed out NULLs. */ > +static inline struct task_struct *better_proc(struct task_struct *a, > + struct task_struct *b) { > + return a ? (b ? __better_proc(a, b) : a) : b; This urgently calls for ifs and not ternany operators. Actually you don't need this helper at all. Check a and b in the same if as the respective !pid_alive above. > +} > + > +static int scnprint_load(char *msgp, size_t size) > +{ > + unsigned long la[3]; > + > + get_avenrun(la, FIXED_1/200, 0); > + return scnprintf(msgp, size, "load: %lu.%02lu; ", > + LOAD_INT(la[0]), LOAD_FRAC(la[0])); > +} > + > +static int scnprint_task(char *msgp, size_t size, struct task_struct *task) > +{ > + char commname[TASK_COMM_LEN]; > + > + get_task_comm(commname, task); > + return scnprintf(msgp, size, "%d/%s:", task_pid_vinr(task), commname); > +} > + > +static int scnprint_rusage(char *msgp, ssize_t size, > + struct task_struct *task, struct mm_struct *mm) > +{ > + struct rusage ru; > + struct timespec64 utime, stime; > + struct timespec64 rtime; > + u64 now; > + int ret = 0; > + int psz = 0; > + > + getrusage(task, RUSAGE_BOTH, &ru); > + now = ktime_get_ns(); > + > + utime.tv_sec = ru.ru_utime.tv_sec; > + utime.tv_nsec = ru.ru_utime.tv_usec * NSEC_PER_USEC; > + stime.tv_sec = ru.ru_stime.tv_sec; > + stime.tv_nsec = ru.ru_stime.tv_usec * NSEC_PER_USEC; > + > + rtime = ns_to_timespec64(now - task->start_time); > + > + psz = scnprintf(msgp, size, > + "%llu.%03lur %llu.%03luu %llu.%03lus", > + rtime.tv_sec, rtime.tv_nsec / NSEC_PER_MSEC, > + utime.tv_sec, utime.tv_nsec / NSEC_PER_MSEC, > + stime.tv_sec, stime.tv_nsec / NSEC_PER_MSEC); > + ret += psz; > + msgp += psz; > + size -= psz; > + > + if (mm) { > + psz = scnprintf(msgp, size, > + " %luk/%luk", > + get_mm_rss(mm) * PAGE_SIZE / 1024, > + get_mm_hiwater_rss(mm) * PAGE_SIZE / 1024); > + ret += psz; > + } > + return ret; > +} > + > +static int scnprint_state(char *msgp, ssize_t size, > + struct task_struct *task, struct mm_struct *mm) > +{ > + char stat[8] = {0}; > + const char *state_descr = ""; > + struct task_struct *parent = NULL; > + struct task_struct *real_parent = NULL; > + unsigned long wchan = 0; > + int psz = 0; > + char symname[KSYM_NAME_LEN]; > + > + stat[psz++] = task_state_to_char(task); > + if (task_nice(task) < 0) > + stat[psz++] = '<'; > + else if (task_nice(task) > 0) > + stat[psz++] = 'N'; > + if (mm && mm->locked_vm) > + stat[psz++] = 'L'; > + if (get_nr_threads(task) > 1) > + stat[psz++] = 'l'; > + > + switch (stat[0]) { > + case 'R': > + if (task_curr(task)) > + stat[psz++] = '!'; > + break; > + case 'S': > + case 'D': > + wchan = get_wchan(task); > + if (!wchan) > + break; > + if (!lookup_symbol_name(wchan, symname)) > + state_descr = symname; > + else > + /* get_wchan() returned something > + * that looks like no kernel symbol. > + */ > + state_descr = "*unknown*"; > + break; > + case 'T': > + state_descr = "stopped"; > + break; > + case 't': > + state_descr = "traced"; > + break; > + case 'Z': > + rcu_read_lock(); > + real_parent = rcu_dereference(task->real_parent); > + parent = rcu_dereference(task->parent); > + psz = sprintf(symname, "zombie; ppid=%d", > + task_tgid_nr_ns(real_parent, > + ns_of_pid(task_pid(task)))); > + if (parent != real_parent) > + sprintf(symname + psz, " reaper=%d", > + task_tgid_nr_ns(parent, > + ns_of_pid(task_pid(task)))); > + rcu_read_unlock(); > + state_descr = symname; > + break; > + case 'I': > + /* Can this even happen? */ > + state_descr = "idle"; > + break; > + default: > + state_descr = "unknown"; > + } > + > + psz = scnprintf(msgp, size, "%s", stat); > + msgp += psz; > + size -= psz; > + if (*state_descr) > + psz += scnprintf(msgp, size, wchan ? " [%s]" : " (%s)", state_descr); > + > + return psz; > +} > + > +/** > + * tty_sprint_status_line - produce kerninfo line > + * @tty: terminal device > + * @msg: preallocated memory buffer > + * @length: maximum line length > + * > + * Reports state of foreground process group in a null-terminated string > + * located at @msg, @length bytes long. If @length is insufficient, > + * the line gets truncated. > + */ > +void tty_sprint_status_line(struct tty_struct *t, char *msg, size_t length) > +{ > + struct task_struct *tsk = NULL, *mit = NULL; > + struct mm_struct *mm; > + struct pid *pgrp = NULL; > + char *msgp = msg; > + int psz = 0; > + > + if (!length) > + return; > + length--; /* Make room for trailing '\n' */ > + > + psz = scnprint_load(msgp, length); > + if (psz > 0) { > + msgp += psz; > + length -= psz; > + } > + if (!length) > + goto finalize_message; > + > + /* Not sure if session pid is protected by ctrl_lock > + * or tasklist_lock... > + */ > + pgrp = t->session; > + if (pgrp == NULL) { > + psz = scnprintf(msgp, length, "not a controlling tty"); > + if (psz > 0) > + msgp += psz; > + goto finalize_message; > + } > + pgrp = tty_get_pgrp(t); > + if (pgrp == NULL) { > + psz = scnprintf(msgp, length, "no foreground process group"); > + if (psz > 0) > + msgp += psz; > + goto finalize_message; > + } > + > + read_lock(&tasklist_lock); > + do_each_pid_task(pgrp, PIDTYPE_PGID, tsk) > + { > + /* Select the most interesting task. */ > + if (tsk == better_proc(mit, tsk)) > + mit = tsk; > + } while_each_pid_task(pgrp, PIDTYPE_PGID, tsk); > + read_unlock(&tasklist_lock); > + > + if (!pid_alive(mit)) > + goto finalize_message; > + > + /* Gather intel on most interesting task. */ > + /* Can the mm of a foreground process turn out to be NULL? > + * Definitely; for example, if it is a zombie. > + */ > + mm = get_task_mm(mit); > + > + psz = scnprint_task(msgp, length, mit); > + if (psz > 0) { > + msgp += psz; > + length -= psz; > + } > + if (!length) > + goto finalize_message; > + *msgp++ = ' '; > + length--; > + > + psz = scnprint_state(msgp, length, mit, mm); > + if (psz > 0) { > + msgp += psz; > + length -= psz; > + } > + if (!length) > + goto finalize_message; > + *msgp++ = ' '; > + length--; > + > + psz = scnprint_rusage(msgp, length, mit, mm); > + if (psz > 0) { > + msgp += psz; > + length -= psz; > + } > + if (!length) > + goto finalize_message; > + *msgp++ = ' '; > + length--; > + > + if (!mm) > + goto finalize_message; > + > + mmput(mm); > + > +finalize_message: > + *msgp++ = '\n'; > + if (pgrp) > + put_pid(pgrp); > +} > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 4418f5cb8..195abd47d 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1318,6 +1318,8 @@ static inline struct pid *task_pid(struct task_struct *task) > * task_xid_nr() : global id, i.e. the id seen from the init namespace; > * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of > * current. > + * task_xid_vinr() : virtual inner id, i.e. the id seen from the namespace of > + * the task itself; > * task_xid_nr_ns() : id seen from the ns specified; > * > * see also pid_nr() etc in include/linux/pid.h > @@ -1339,6 +1341,11 @@ static inline pid_t task_pid_vnr(struct task_struct *tsk) > return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); > } > > +static inline pid_t task_pid_vinr(struct task_struct *tsk) > +{ > + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns_of_pid(task_pid(tsk))); > +} > + This smells like it should be done in a separate patch. > static inline pid_t task_tgid_nr(struct task_struct *tsk) > { > diff --git a/include/linux/tty.h b/include/linux/tty.h > index 3499845ab..2023addaf 100644 > --- a/include/linux/tty.h > +++ b/include/linux/tty.h > @@ -727,6 +727,9 @@ extern void __init n_tty_init(void); > static inline void n_tty_init(void) { } > #endif > > +/* n_tty_status.c */ > +extern void tty_sprint_status_line(struct tty_struct *tty, char *msg, size_t size); No need for extern. thanks, -- js suse labs