On Mon, Oct 23, 2023 at 7:24 AM Adrian Hunter <adrian.hunter@xxxxxxxxx> wrote: > > On 12/10/23 09:23, Ian Rogers wrote: > > struct thread values hold onto references to mmaps, dsos, etc. When a > > thread exits it is necessary to clean all of this memory up by > > removing the thread from the machine's threads. Some tools require > > this doesn't happen, such as perf report if offcpu events exist or if > > a task list is being generated, so add a symbol_conf value to make the > > behavior optional. When an exited thread is left in the machine's > > threads, mark it as exited. > > > > This change relates to commit 40826c45eb0b ("perf thread: Remove > > notion of dead threads"). Dead threads were removed as they had a > > reference count of 0 and were difficult to reason about with the > > reference count checker. Here a thread is removed from threads when it > > exits, unless via symbol_conf the exited thread isn't remove and is > > marked as exited. Reference counting behaves as it normally does. > > Can we exclude AUX area tracing? > > Essentially, the EXIT event happens when the task is still running > in kernel mode, so the thread has not in fact fully exited. > > Example: > > # perf record -a --kcore -e intel_pt// uname > > Before: > > # perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms]) > > After: > > $ perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: 1 branches: ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms]) > uname 14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms]) > :14740 14740 [006] 26795.092638: 1 branches: ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms]) > > This will also affect samples made after PERF_RECORD_EXIT but before > the task finishes exiting. Makes sense. Would an appropriate fix be in perf_session__open to set: symbol_conf.keep_exited_threads = true; when: perf_header__has_feat(&session->header, HEADER_AUXTRACE) It is kind of hacky to be changing global state this way, but symbol_conf is like that in general. Thanks, Ian > > > > Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx> > > --- > > tools/perf/builtin-report.c | 7 +++++++ > > tools/perf/util/machine.c | 10 +++++++--- > > tools/perf/util/symbol_conf.h | 3 ++- > > tools/perf/util/thread.h | 14 ++++++++++++++ > > 4 files changed, 30 insertions(+), 4 deletions(-) > > > > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c > > index dcedfe00f04d..749246817aed 100644 > > --- a/tools/perf/builtin-report.c > > +++ b/tools/perf/builtin-report.c > > @@ -1411,6 +1411,13 @@ int cmd_report(int argc, const char **argv) > > if (ret < 0) > > goto exit; > > > > + /* > > + * tasks_mode require access to exited threads to list those that are in > > + * the data file. Off-cpu events are synthesized after other events and > > + * reference exited threads. > > + */ > > + symbol_conf.keep_exited_threads = true; > > + > > annotation_options__init(&report.annotation_opts); > > > > ret = perf_config(report__config, &report); > > diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c > > index 6ca7500e2cf4..5cda47eb337d 100644 > > --- a/tools/perf/util/machine.c > > +++ b/tools/perf/util/machine.c > > @@ -2157,9 +2157,13 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event > > if (dump_trace) > > perf_event__fprintf_task(event, stdout); > > > > - if (thread != NULL) > > - thread__put(thread); > > - > > + if (thread != NULL) { > > + if (symbol_conf.keep_exited_threads) > > + thread__set_exited(thread, /*exited=*/true); > > + else > > + machine__remove_thread(machine, thread); > > + } > > + thread__put(thread); > > return 0; > > } > > > > diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h > > index 2b2fb9e224b0..6040286e07a6 100644 > > --- a/tools/perf/util/symbol_conf.h > > +++ b/tools/perf/util/symbol_conf.h > > @@ -43,7 +43,8 @@ struct symbol_conf { > > disable_add2line_warn, > > buildid_mmap2, > > guest_code, > > - lazy_load_kernel_maps; > > + lazy_load_kernel_maps, > > + keep_exited_threads; > > const char *vmlinux_name, > > *kallsyms_name, > > *source_prefix, > > diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h > > index e79225a0ea46..0df775b5c110 100644 > > --- a/tools/perf/util/thread.h > > +++ b/tools/perf/util/thread.h > > @@ -36,13 +36,22 @@ struct thread_rb_node { > > }; > > > > DECLARE_RC_STRUCT(thread) { > > + /** @maps: mmaps associated with this thread. */ > > struct maps *maps; > > pid_t pid_; /* Not all tools update this */ > > + /** @tid: thread ID number unique to a machine. */ > > pid_t tid; > > + /** @ppid: parent process of the process this thread belongs to. */ > > pid_t ppid; > > int cpu; > > int guest_cpu; /* For QEMU thread */ > > refcount_t refcnt; > > + /** > > + * @exited: Has the thread had an exit event. Such threads are usually > > + * removed from the machine's threads but some events/tools require > > + * access to dead threads. > > + */ > > + bool exited; > > bool comm_set; > > int comm_len; > > struct list_head namespaces_list; > > @@ -189,6 +198,11 @@ static inline refcount_t *thread__refcnt(struct thread *thread) > > return &RC_CHK_ACCESS(thread)->refcnt; > > } > > > > +static inline void thread__set_exited(struct thread *thread, bool exited) > > +{ > > + RC_CHK_ACCESS(thread)->exited = exited; > > +} > > + > > static inline bool thread__comm_set(const struct thread *thread) > > { > > return RC_CHK_ACCESS(thread)->comm_set; >