2009/1/26 Peter Zijlstra <peterz@xxxxxxxxxxxxx>: > On Mon, 2009-01-26 at 12:35 +0100, Peter Zijlstra wrote: >> >> Another something nice would be to have ctx switches like: >> >> foo-1 => bar-2 ran: ${time foo spend on the cpu} since: ${time bar >> spend away from the cpu} > > Steve, Frederic, how's this? That looks nice after a quick sight. Using the ctx switch probe to trace the context switches is, of course, more natural :-) And we have now these useful time run/wait bits. Thanks! > (compile tested only) > > --- > include/linux/ftrace.h | 5 ++ > kernel/sched_fair.c | 1 - > kernel/trace/Kconfig | 1 + > kernel/trace/trace.c | 51 +++++++++++++++++++ > kernel/trace/trace.h | 10 ++++ > kernel/trace/trace_functions_graph.c | 88 ++++++++++++++------------------- > 6 files changed, 104 insertions(+), 52 deletions(-) > > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h > index 9f7880d..411b027 100644 > --- a/include/linux/ftrace.h > +++ b/include/linux/ftrace.h > @@ -381,6 +381,11 @@ struct ftrace_graph_ret { > int depth; > }; > > +struct ftrace_graph_switch { > + pid_t prev, next; > + u64 ran, since; > +}; > + > #ifdef CONFIG_FUNCTION_GRAPH_TRACER > > /* > diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c > index f34cf42..fa477ac 100644 > --- a/kernel/sched_fair.c > +++ b/kernel/sched_fair.c > @@ -530,7 +530,6 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) > schedstat_set(se->wait_count, se->wait_count + 1); > schedstat_set(se->wait_sum, se->wait_sum + > rq_of(cfs_rq)->clock - se->wait_start); > - schedstat_set(se->wait_start, 0); > } > > static inline void > diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig > index dde1d46..7aa1c13 100644 > --- a/kernel/trace/Kconfig > +++ b/kernel/trace/Kconfig > @@ -67,6 +67,7 @@ config FUNCTION_GRAPH_TRACER > bool "Kernel Function Graph Tracer" > depends on HAVE_FUNCTION_GRAPH_TRACER > depends on FUNCTION_TRACER > + select SCHEDSTATS > default y > help > Enable the kernel to trace a function at both its return > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index 2129ab9..380a334 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -31,6 +31,7 @@ > #include <linux/fs.h> > #include <linux/kprobes.h> > #include <linux/writeback.h> > +#include <linux/sched.h> > > #include <linux/stacktrace.h> > #include <linux/ring_buffer.h> > @@ -820,6 +821,35 @@ static void __trace_graph_return(struct trace_array *tr, > entry->ret = *trace; > ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); > } > + > +static void __trace_graph_switch(struct trace_array *tr, > + struct trace_array_cpu *data, > + unsigned long flags, int pc, > + struct task_struct *prev, > + struct task_struct *next) > +{ > + struct ring_buffer_event *event; > + struct ftrace_graph_switch_entry *entry; > + unsigned long irq_flags; > + > + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) > + return; > + > + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), > + &irq_flags); > + if (!event) > + return; > + entry = ring_buffer_event_data(event); > + tracing_generic_entry_update(&entry->ent, flags, pc); > + entry->ent.type = TRACE_GRAPH_SWITCH; > + entry->ctx.prev = prev->pid; > + entry->ctx.next = next->pid; > + entry->ctx.ran = prev->se.sum_exec_runtime - > + prev->se.prev_sum_exec_runtime; > + entry->ctx.since = next->se.exec_start - next->se.wait_start; > + > + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); > +} > #endif > > void > @@ -1097,6 +1127,27 @@ void trace_graph_return(struct ftrace_graph_ret *trace) > atomic_dec(&data->disabled); > local_irq_restore(flags); > } > + > +void trace_graph_switch(struct task_struct *prev, struct task_struct *next) > +{ > + struct trace_array *tr = &global_trace; > + struct trace_array_cpu *data; > + unsigned long flags; > + long disabled; > + int cpu; > + int pc; > + > + local_irq_save(flags); > + cpu = raw_smp_processor_id(); > + data = tr->data[cpu]; > + disabled = atomic_inc_return(&data->disabled); > + if (likely(disabled == 1)) { > + pc = preempt_count(); > + __trace_graph_switch(tr, data, flags, pc, prev, next); > + } > + atomic_dec(&data->disabled); > + local_irq_restore(flags); > +} > #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ > > enum trace_file_type { > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index b96037d..781fbce 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -27,6 +27,7 @@ enum trace_type { > TRACE_BOOT_RET, > TRACE_GRAPH_RET, > TRACE_GRAPH_ENT, > + TRACE_GRAPH_SWITCH, > TRACE_USER_STACK, > TRACE_HW_BRANCHES, > TRACE_KMEM_ALLOC, > @@ -71,6 +72,12 @@ struct ftrace_graph_ret_entry { > struct trace_entry ent; > struct ftrace_graph_ret ret; > }; > + > +struct ftrace_graph_switch_entry { > + struct trace_entry ent; > + struct ftrace_graph_switch ctx; > +}; > + > extern struct tracer boot_tracer; > > /* > @@ -295,6 +302,8 @@ extern void __ftrace_bad_type(void); > TRACE_GRAPH_ENT); \ > IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ > TRACE_GRAPH_RET); \ > + IF_ASSIGN(var, ent, struct ftrace_graph_switch_entry, \ > + TRACE_GRAPH_SWITCH); \ > IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ > IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ > IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ > @@ -438,6 +447,7 @@ void trace_function(struct trace_array *tr, > > void trace_graph_return(struct ftrace_graph_ret *trace); > int trace_graph_entry(struct ftrace_graph_ent *trace); > +void trace_graph_switch(struct task_struct *prev, struct task_struct *next); > > void tracing_start_cmdline_record(void); > void tracing_stop_cmdline_record(void); > diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c > index 66fc7b8..cf6494b 100644 > --- a/kernel/trace/trace_functions_graph.c > +++ b/kernel/trace/trace_functions_graph.c > @@ -10,6 +10,7 @@ > #include <linux/uaccess.h> > #include <linux/ftrace.h> > #include <linux/fs.h> > +#include <trace/sched.h> > > #include "trace.h" > #include "trace_output.h" > @@ -158,28 +159,13 @@ print_graph_proc(struct trace_seq *s, pid_t pid) > return TRACE_TYPE_HANDLED; > } > > - > -/* If the pid changed since the last trace, output this event */ > static enum print_line_t > -verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) > +print_graph_switch(struct ftrace_graph_switch_entry *field, struct trace_seq *s, > + struct trace_iterator *iter) > { > - pid_t prev_pid; > - pid_t *last_pid; > + int cpu = iter->cpu; > int ret; > > - if (!last_pids_cpu) > - return TRACE_TYPE_HANDLED; > - > - last_pid = per_cpu_ptr(last_pids_cpu, cpu); > - > - if (*last_pid == pid) > - return TRACE_TYPE_HANDLED; > - > - prev_pid = *last_pid; > - *last_pid = pid; > - > - if (prev_pid == -1) > - return TRACE_TYPE_HANDLED; > /* > * Context-switch trace line: > > @@ -197,7 +183,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) > if (ret == TRACE_TYPE_PARTIAL_LINE) > TRACE_TYPE_PARTIAL_LINE; > > - ret = print_graph_proc(s, prev_pid); > + ret = print_graph_proc(s, field->ctx.prev); > if (ret == TRACE_TYPE_PARTIAL_LINE) > TRACE_TYPE_PARTIAL_LINE; > > @@ -205,16 +191,21 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) > if (!ret) > TRACE_TYPE_PARTIAL_LINE; > > - ret = print_graph_proc(s, pid); > + ret = print_graph_proc(s, field->ctx.next); > if (ret == TRACE_TYPE_PARTIAL_LINE) > TRACE_TYPE_PARTIAL_LINE; > > + ret = trace_seq_printf(s, " ran: %Lu, since: %Lu", > + field->ctx.ran, field->ctx.since); > + if (!ret) > + TRACE_TYPE_PARTIAL_LINE; > + > ret = trace_seq_printf(s, > "\n ------------------------------------------\n\n"); > if (!ret) > TRACE_TYPE_PARTIAL_LINE; > > - return ret; > + return TRACE_TYPE_HANDLED; > } > > static bool > @@ -471,14 +462,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, > { > int ret; > int cpu = iter->cpu; > - pid_t *last_entry = iter->private; > struct trace_entry *ent = iter->ent; > struct ftrace_graph_ent *call = &field->graph_ent; > > - /* Pid */ > - if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) > - return TRACE_TYPE_PARTIAL_LINE; > - > /* Interrupt */ > ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); > if (ret == TRACE_TYPE_PARTIAL_LINE) > @@ -523,12 +509,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, > int i; > int ret; > int cpu = iter->cpu; > - pid_t *last_pid = iter->private; > unsigned long long duration = trace->rettime - trace->calltime; > > - /* Pid */ > - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) > - return TRACE_TYPE_PARTIAL_LINE; > > /* Absolute time */ > if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { > @@ -600,7 +582,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, > int i; > int ret; > int cpu = iter->cpu; > - pid_t *last_pid = iter->private; > > /* Absolute time */ > if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { > @@ -609,10 +590,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, > return TRACE_TYPE_PARTIAL_LINE; > } > > - /* Pid */ > - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) > - return TRACE_TYPE_PARTIAL_LINE; > - > /* Cpu */ > if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { > ret = print_graph_cpu(s, cpu); > @@ -677,6 +654,11 @@ print_graph_function(struct trace_iterator *iter) > struct trace_entry *entry = iter->ent; > > switch (entry->type) { > + case TRACE_GRAPH_SWITCH: { > + struct ftrace_graph_switch_entry *field; > + trace_assign_type(field, entry); > + return print_graph_switch(field, s, iter); > + } > case TRACE_GRAPH_ENT: { > struct ftrace_graph_ent_entry *field; > trace_assign_type(field, entry); > @@ -724,34 +706,38 @@ static void print_graph_headers(struct seq_file *s) > seq_printf(s, " | | | |\n"); > } > > -static void graph_trace_open(struct trace_iterator *iter) > +static void probe_sched_switch(struct rq *rq, > + struct task_struct *prev, > + struct task_struct *next) > { > - /* pid on the last trace processed */ > - pid_t *last_pid = alloc_percpu(pid_t); > - int cpu; > + trace_graph_switch(prev, next); > +} > > - if (!last_pid) > - pr_warning("function graph tracer: not enough memory\n"); > - else > - for_each_possible_cpu(cpu) { > - pid_t *pid = per_cpu_ptr(last_pid, cpu); > - *pid = -1; > - } > +static DEFINE_MUTEX(graph_trace_mutex); > +static int graph_trace_ref; > > - iter->private = last_pid; > +static void graph_trace_start(struct trace_array *tr) > +{ > + mutex_lock(&graph_trace_mutex); > + if (!(graph_trace_ref++)) > + register_trace_sched_switch(probe_sched_switch); > + mutex_unlock(&graph_trace_mutex); > } > > -static void graph_trace_close(struct trace_iterator *iter) > +static void graph_trace_stop(struct trace_array *tr) > { > - percpu_free(iter->private); > + mutex_lock(&graph_trace_mutex); > + if (!(--graph_trace_ref)) > + unregister_trace_sched_switch(probe_sched_switch); > + mutex_unlock(&graph_trace_mutex); > } > > static struct tracer graph_trace __read_mostly = { > .name = "function_graph", > - .open = graph_trace_open, > - .close = graph_trace_close, > .init = graph_trace_init, > .reset = graph_trace_reset, > + .start = graph_trace_start, > + .stop = graph_trace_stop, > .print_line = print_graph_function, > .print_header = print_graph_headers, > .flags = &tracer_flags, > > > -- To unsubscribe from this list: send the line "unsubscribe kernel-testers" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html