On Wed, Feb 19, 2025 at 01:40:01PM -0800, Chun-Tse Shao wrote:
> This implements per-callstack aggregation of lock owners in addition to
> per-thread. The owner callstack is captured using `bpf_get_task_stack()`
> at `contention_begin()` and it also adds a custom stackid function for the
> owner stacks to be compared easily.
>
> The owner info is kept in a hash map using lock addr as a key to handle
> multiple waiters for the same lock. At `contention_end()`, it updates the
> owner lock stat based on the info that was saved at `contention_begin()`.
> If there are more waiters, it'd update the owner pid to itself as
> `contention_end()` means it gets the lock now. But it also needs to check
> the return value of the lock function in case task was killed by a signal
> or something.
>
> Signed-off-by: Chun-Tse Shao <ctshao@xxxxxxxxxx>
> ---
>  .../perf/util/bpf_skel/lock_contention.bpf.c | 218 +++++++++++++++++-
>  1 file changed, 209 insertions(+), 9 deletions(-)
>
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 23fe9cc980ae..e8b113d5802a 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -197,6 +197,9 @@ int data_fail;
>  int task_map_full;
>  int data_map_full;
>
> +struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
> +void bpf_task_release(struct task_struct *p) __ksym __weak;
> +
>  static inline __u64 get_current_cgroup_id(void)
>  {
>  	struct task_struct *task;
> @@ -420,6 +423,61 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
>  	return pelem;
>  }
>
> +static inline s32 get_owner_stack_id(u64 *stacktrace)
> +{
> +	s32 *id, new_id;
> +	static s64 id_gen = 1;
> +
> +	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
> +	if (id)
> +		return *id;
> +
> +	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);
> +
> +	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);
> +
> +	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
> +	if (id)
> +		return *id;
> +
> +	return -1;
> +}
> +
> +static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
> +{
> +	__sync_fetch_and_add(&data->total_time, duration);
> +	__sync_fetch_and_add(&data->count, count);
> +
> +	/* FIXME: need atomic operations */
> +	if (data->max_time < duration)
> +		data->max_time = duration;
> +	if (data->min_time > duration)
> +		data->min_time = duration;
> +}
> +
> +static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
> +{
> +	struct contention_key key = {
> +		.stack_id = id,
> +		.pid = 0,
> +		.lock_addr_or_cgroup = 0,
> +	};
> +	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);
> +
> +	if (!data) {
> +		struct contention_data first = {
> +			.total_time = duration,
> +			.max_time = duration,
> +			.min_time = duration,
> +			.count = 1,
> +			.flags = flags,
> +		};
> +		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
> +	} else {
> +		update_contention_data(data, duration, 1);
> +	}
> +}
> +
>  SEC("tp_btf/contention_begin")
>  int contention_begin(u64 *ctx)
>  {
> @@ -437,6 +495,72 @@ int contention_begin(u64 *ctx)
>  	pelem->flags = (__u32)ctx[1];
>
>  	if (needs_callstack) {
> +		u32 i = 0;
> +		u32 id = 0;
> +		int owner_pid;
> +		u64 *buf;
> +		struct task_struct *task;
> +		struct owner_tracing_data *otdata;
> +
> +		if (!lock_owner)
> +			goto skip_owner;
> +
> +		task = get_lock_owner(pelem->lock, pelem->flags);
> +		if (!task)
> +			goto skip_owner;
> +
> +		owner_pid = BPF_CORE_READ(task, pid);
> +
> +		buf = bpf_map_lookup_elem(&stack_buf, &i);
> +		if (!buf)
> +			goto skip_owner;
> +		for (i = 0; i < max_stack; i++)
> +			buf[i] = 0x0;
> +
> +		if (!bpf_task_from_pid)
> +			goto skip_owner;
> +
> +		task = bpf_task_from_pid(owner_pid);
> +		if (!task)
> +			goto skip_owner;
> +
> +		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
> +		bpf_task_release(task);
> +
> +		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
> +		id = get_owner_stack_id(buf);
> +
> +		/*
> +		 * Contention just happens, or corner case `lock` is owned by process not
> +		 * `owner_pid`. For the corner case we treat it as unexpected internal error and
> +		 * just ignore the precvious tracing record.
> +		 */
> +		if (!otdata || otdata->pid != owner_pid) {
> +			struct owner_tracing_data first = {
> +				.pid = owner_pid,
> +				.timestamp = pelem->timestamp,
> +				.count = 1,
> +				.stack_id = id,
> +			};
> +			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
> +		}
> +		/* Contention is ongoing and new waiter joins */
> +		else {
> +			__sync_fetch_and_add(&otdata->count, 1);
> +
> +			/*
> +			 * The owner is the same, but stacktrace might be changed. In this case we
> +			 * store/update `owner_stat` based on current owner stack id.
> +			 */
> +			if (id != otdata->stack_id) {
> +				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
> +						  pelem->flags);
> +
> +				otdata->timestamp = pelem->timestamp;
> +				otdata->stack_id = id;
> +			}
> +		}
> +skip_owner:
>  		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
>  						  BPF_F_FAST_STACK_CMP | stack_skip);
>  		if (pelem->stack_id < 0)
> @@ -473,6 +597,7 @@ int contention_end(u64 *ctx)
>  	struct tstamp_data *pelem;
>  	struct contention_key key = {};
>  	struct contention_data *data;
> +	__u64 timestamp;
>  	__u64 duration;
>  	bool need_delete = false;
>
> @@ -500,12 +625,94 @@ int contention_end(u64 *ctx)
>  		need_delete = true;
>  	}
>
> -	duration = bpf_ktime_get_ns() - pelem->timestamp;
> +	timestamp = bpf_ktime_get_ns();
> +	duration = timestamp - pelem->timestamp;
>  	if ((__s64)duration < 0) {
>  		__sync_fetch_and_add(&time_fail, 1);
>  		goto out;
>  	}
>
> +	if (needs_callstack && lock_owner) {
> +		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
> +
> +		if (!otdata)
> +			goto skip_owner;
> +
> +		/* Update `owner_stat` */
> +		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);
> +
> +		/* No contention is occurring, delete `lock` entry in `owner_data` */
> +		if (otdata->count <= 1)
> +			bpf_map_delete_elem(&owner_data, &pelem->lock);
> +		/*
> +		 * Contention is still ongoing, with a new owner (current task). `owner_data`
> +		 * should be updated accordingly.
> +		 */
> +		else {
> +			u32 i = 0;
> +			s32 ret = (s32)ctx[1];
> +			u64 *buf;
> +
> +			__sync_fetch_and_add(&otdata->count, -1);
> +
> +			buf = bpf_map_lookup_elem(&stack_buf, &i);
> +			if (!buf)
> +				goto skip_owner;
> +			for (i = 0; i < (u32)max_stack; i++)
> +				buf[i] = 0x0;
> +
> +			/*
> +			 * `ret` has the return code of the lock function.
> +			 * If `ret` is negative, the current task terminates lock waiting without
> +			 * acquiring it. Owner is not changed, but we still need to update the owner
> +			 * stack.
> +			 */
> +			if (ret < 0) {
> +				s32 id = 0;
> +				struct task_struct *task;
> +
> +				if (!bpf_task_from_pid)
> +					goto skip_owner;
> +
> +				task = bpf_task_from_pid(otdata->pid);
> +				if (!task)
> +					goto skip_owner;
> +
> +				bpf_get_task_stack(task, buf,
> +						   max_stack * sizeof(unsigned long), 0);
> +				bpf_task_release(task);
> +
> +				id = get_owner_stack_id(buf);
> +
> +				/*
> +				 * If owner stack is changed, update `owner_data` and `owner_stat`
> +				 * accordingly.
> +				 */
> +				if (id != otdata->stack_id) {
> +					update_owner_stat(id, pelem->timestamp - otdata->timestamp,

Shouldn't it be 'timestamp' instead of 'pelem->timestamp'?

> +							  pelem->flags);
> +
> +					otdata->timestamp = pelem->timestamp;

Ditto.

Thanks,
Namhyung

> +					otdata->stack_id = id;
> +				}
> +			}
> +			/*
> +			 * Otherwise, update tracing data with the current task, which is the new
> +			 * owner.
> +			 */
> +			else {
> +				otdata->pid = pid;
> +				otdata->timestamp = timestamp;
> +				/*
> +				 * We don't want to retrieve callstack here, since it is where the
> +				 * current task acquires the lock and provides no additional
> +				 * information. We simply assign -1 to invalidate it.
> +				 */
> +				otdata->stack_id = -1;
> +			}
> +		}
> +	}
> +skip_owner:
>  	switch (aggr_mode) {
>  	case LOCK_AGGR_CALLER:
>  		key.stack_id = pelem->stack_id;
> @@ -589,14 +796,7 @@ int contention_end(u64 *ctx)
>  	}
>
>  found:
> -	__sync_fetch_and_add(&data->total_time, duration);
> -	__sync_fetch_and_add(&data->count, 1);
> -
> -	/* FIXME: need atomic operations */
> -	if (data->max_time < duration)
> -		data->max_time = duration;
> -	if (data->min_time > duration)
> -		data->min_time = duration;
> +	update_contention_data(data, duration, 1);
>
>  out:
>  	pelem->lock = 0;
> --
> 2.48.1.601.g30ceb7b040-goog
>
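
To be concrete about the two comments above: what I have in mind for the
owner-stack update in the ret < 0 path is roughly the following (untested),
using the 'timestamp' taken at the start of contention_end() both for the
duration passed to update_owner_stat() and for the saved otdata->timestamp,
instead of 'pelem->timestamp' recorded at contention_begin():

	/* if the owner stack changed, account the elapsed time up to now */
	if (id != otdata->stack_id) {
		update_owner_stat(id, timestamp - otdata->timestamp,
				  pelem->flags);

		/* restart the owner interval from the current time */
		otdata->timestamp = timestamp;
		otdata->stack_id = id;
	}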