On Fri, Nov 22, 2024 at 10:10 PM Shakeel Butt <shakeel.butt@xxxxxxxxx> wrote: > > We are starting to deploy mmap_lock tracepoint monitoring across our > fleet and the early results showed that these tracepoints are consuming > significant amount of CPUs in kernfs_path_from_node when enabled. > > It seems like the kernel is trying to resolved the cgroup path in the s/resolved/resolve > fast path of the locking code path when the tracepoints are enabled. In > addition for some application their metrics are regressing when > monitoring is enabled. > > The cgroup path resolution can be slow and should not be done in the > fast path. Most userspace tools, like bpftrace, provides functionality > to get the cgroup path from cgroup id, so let's just trace the cgroup > id and the users can use better tools to get the path in the slow path. > > Signed-off-by: Shakeel Butt <shakeel.butt@xxxxxxxxx> > --- > include/linux/memcontrol.h | 18 ++++++++++++ > include/trace/events/mmap_lock.h | 32 ++++++++++---------- > mm/mmap_lock.c | 50 ++------------------------------ > 3 files changed, 36 insertions(+), 64 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 5502aa8e138e..d82f08cd70cd 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -1046,6 +1046,19 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, > > void split_page_memcg(struct page *head, int old_order, int new_order); > > +static inline u64 memcg_id_from_mm(struct mm_struct *mm) The usage of memcg_id here and throughout the patch is a bit confusing because we have a member called 'id' in struct mem_cgroup, but this isn't it. This is the cgroup_id of the memcg. I admit it's hard to distinguish them during naming, but when I first saw the function I thought it was returning memcg->id. Maybe just cgroup_id_from_mm()? In cgroup v2, the cgroup id is the same regardless of the controller anyway, in cgroup v1, it's kinda natural that we return the cgroup id of the memcg. I don't feel strongly, but I prefer that we use clearer naming, and either way a comment may help clarify things. > +{ > + struct mem_cgroup *memcg; > + u64 id = 0; > + > + rcu_read_lock(); > + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); > + if (likely(memcg)) > + id = cgroup_id(memcg->css.cgroup); We return 0 if the memcg is NULL here, shouldn't we return the cgroup id of the root memcg instead? This is more consistent with get_mem_cgroup_from_mm(), and makes sure we always return the id of a valid cgroup. > + rcu_read_unlock(); > + return id; > +} > + > #else /* CONFIG_MEMCG */ > > #define MEM_CGROUP_ID_SHIFT 0 > @@ -1466,6 +1479,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) > static inline void split_page_memcg(struct page *head, int old_order, int new_order) > { > } > + > +static inline u64 memcg_id_from_mm(struct mm_struct *mm) > +{ > + return 0; > +} > #endif /* CONFIG_MEMCG */ > > /* > diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h > index bc2e3ad787b3..5529933d19c5 100644 > --- a/include/trace/events/mmap_lock.h > +++ b/include/trace/events/mmap_lock.h > @@ -5,6 +5,7 @@ > #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) > #define _TRACE_MMAP_LOCK_H > > +#include <linux/memcontrol.h> > #include <linux/tracepoint.h> > #include <linux/types.h> > > @@ -12,64 +13,61 @@ struct mm_struct; > > DECLARE_EVENT_CLASS(mmap_lock, > > - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), > + TP_PROTO(struct mm_struct *mm, bool write), > > - TP_ARGS(mm, memcg_path, write), > + TP_ARGS(mm, write), > > TP_STRUCT__entry( > __field(struct mm_struct *, mm) > - __string(memcg_path, memcg_path) > + __field(u64, memcg_id) > __field(bool, write) > ), > > TP_fast_assign( > __entry->mm = mm; > - __assign_str(memcg_path); > + __entry->memcg_id = memcg_id_from_mm(mm); > __entry->write = write; > ), > > TP_printk( > - "mm=%p memcg_path=%s write=%s", > - __entry->mm, > - __get_str(memcg_path), > + "mm=%p memcg_id=%llu write=%s", > + __entry->mm, __entry->memcg_id, > __entry->write ? "true" : "false" > ) > ); > > #define DEFINE_MMAP_LOCK_EVENT(name) \ > DEFINE_EVENT(mmap_lock, name, \ > - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ > - bool write), \ > - TP_ARGS(mm, memcg_path, write)) > + TP_PROTO(struct mm_struct *mm, bool write), \ > + TP_ARGS(mm, write)) > > DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); > DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); > > TRACE_EVENT(mmap_lock_acquire_returned, > > - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, > - bool success), > + TP_PROTO(struct mm_struct *mm, bool write, bool success), > > - TP_ARGS(mm, memcg_path, write, success), > + TP_ARGS(mm, write, success), > > TP_STRUCT__entry( > __field(struct mm_struct *, mm) > - __string(memcg_path, memcg_path) > + __field(u64, memcg_id) > __field(bool, write) > __field(bool, success) > ), > > TP_fast_assign( > __entry->mm = mm; > - __assign_str(memcg_path); > + __entry->memcg_id = memcg_id_from_mm(mm); > __entry->write = write; > __entry->success = success; > ), > > TP_printk( > - "mm=%p memcg_path=%s write=%s success=%s", > + "mm=%p memcg_id=%llu write=%s success=%s", > __entry->mm, > - __get_str(memcg_path), > + __entry->memcg_id, > __entry->write ? "true" : "false", > __entry->success ? "true" : "false" > ) > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c > index f186d57df2c6..e7dbaf96aa17 100644 > --- a/mm/mmap_lock.c > +++ b/mm/mmap_lock.c > @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); > EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); > EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); > > -#ifdef CONFIG_MEMCG > - > -/* > - * Size of the buffer for memcg path names. Ignoring stack trace support, > - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. > - */ > -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL > - > -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ > - do { \ > - if (trace_mmap_lock_##type##_enabled()) { \ > - char buf[MEMCG_PATH_BUF_SIZE]; \ > - get_mm_memcg_path(mm, buf, sizeof(buf)); \ > - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ > - } \ > - } while (0) > - > -#else /* !CONFIG_MEMCG */ > - > -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ > - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) > - > -#endif /* CONFIG_MEMCG */ > - > #ifdef CONFIG_TRACING > -#ifdef CONFIG_MEMCG > -/* > - * Write the given mm_struct's memcg path to a buffer. If the path cannot be > - * determined, empty string is written. > - */ > -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) > -{ > - struct mem_cgroup *memcg; > - > - buf[0] = '\0'; > - memcg = get_mem_cgroup_from_mm(mm); > - if (memcg == NULL) > - return; > - if (memcg->css.cgroup) > - cgroup_path(memcg->css.cgroup, buf, buflen); > - css_put(&memcg->css); > -} > - > -#endif /* CONFIG_MEMCG */ > - > /* > * Trace calls must be in a separate file, as otherwise there's a circular > * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. > @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) > > void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) > { > - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); > + trace_mmap_lock_start_locking(mm, write); > } > EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); > > void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, > bool success) > { > - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); > + trace_mmap_lock_acquire_returned(mm, write, success); > } > EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); > > void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) > { > - TRACE_MMAP_LOCK_EVENT(released, mm, write); > + trace_mmap_lock_released(mm, write); > } > EXPORT_SYMBOL(__mmap_lock_do_trace_released); > #endif /* CONFIG_TRACING */ > -- > 2.43.5 > >