Internal representations of task state are likely to be changed or ordered, and reporting them to userspace without exporting them as part of API is not a good choice, which can easily break a userspace observability tool as kernel evolves. For example, perf suffers from this and still reports wrong states by this patch. OTOH, some masqueraded state like TASK_REPORT_IDLE and TASK_REPORT_MAX are also reported inadvertently, which confuses things even more. So add a new variable in company with the old raw value to report task state in symbolic char, which is self-explaining and no further translation is needed, and also report priorities in 'short' to save some buffer space. Of course this does not break any userspace tool. Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use the old conventions for the rest. Signed-off-by: Ze Gao <zegao@xxxxxxxxxxx> --- include/trace/events/sched.h | 60 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index fbb99a61f714..0fcf68f49e45 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -6,6 +6,7 @@ #define _TRACE_SCHED_H #include <linux/kthread.h> +#include <linux/sched.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> @@ -214,6 +215,27 @@ static inline long __trace_sched_switch_state(bool preempt, return state ? (1 << (state - 1)) : state; } + +static inline char __trace_sched_switch_state_char(bool preempt, + unsigned int prev_state, + struct task_struct *p) +{ + long state; + +#ifdef CONFIG_SCHED_DEBUG + BUG_ON(p != current); +#endif /* CONFIG_SCHED_DEBUG */ + + /* + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old + * conventions for the rest. + */ + if (preempt) + return 'p'; + + state = __task_state_index(prev_state, p->exit_state); + return task_index_to_char(state); +} #endif /* CREATE_TRACE_POINTS */ /* @@ -231,41 +253,29 @@ TRACE_EVENT(sched_switch, TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __field( long, prev_state ) + __field( short, prev_prio ) + __field( int, prev_state ) + __field( char, prev_state_char ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) - __field( int, next_prio ) + __field( short, next_prio ) ), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); + __entry->prev_pid = prev->pid; + __entry->prev_prio = (short) prev->prio; + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; + __entry->next_pid = next->pid; + __entry->next_prio = (short) next->prio; /* XXX SCHED_DEADLINE */ ), - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", - { TASK_INTERRUPTIBLE, "S" }, - { TASK_UNINTERRUPTIBLE, "D" }, - { __TASK_STOPPED, "T" }, - { __TASK_TRACED, "t" }, - { EXIT_DEAD, "X" }, - { EXIT_ZOMBIE, "Z" }, - { TASK_PARKED, "P" }, - { TASK_DEAD, "I" }) : - "R", - - __entry->prev_state & TASK_REPORT_MAX ? "+" : "", - __entry->next_comm, __entry->next_pid, __entry->next_prio) + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm, + __entry->next_pid, __entry->next_prio) ); /* -- 2.40.1