These tracepoints have been practical for measuring ongoing flusher wait
time behavior and for seeing that the race does occur in production.

Signed-off-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
---
V8: Add TP for detecting ongoing_flusher yielding lock

 include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup/rstat.c         |   18 +++++++++++--
 2 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index af2755bda6eb..81f57fa751c4 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -296,6 +296,62 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
 	TP_ARGS(cgrp, cpu, contended)
 );
 
+DECLARE_EVENT_CLASS(cgroup_ongoing,
+
+	TP_PROTO(struct cgroup *cgrp, struct cgroup *cgrp_ongoing, \
+		 long res, unsigned int race, ktime_t ts),
+
+	TP_ARGS(cgrp, cgrp_ongoing, res, race, ts),
+
+	TP_STRUCT__entry(
+		__field(	int,		root		)
+		__field(	int,		level		)
+		__field(	u64,		id		)
+		__field(	u64,		id_ongoing	)
+		__field(	ktime_t,	ts		)
+		__field(	long,		res		)
+		__field(	u64,		race		)
+	),
+
+	TP_fast_assign(
+		__entry->root = cgrp->root->hierarchy_id;
+		__entry->id = cgroup_id(cgrp);
+		__entry->level = cgrp->level;
+		__entry->id_ongoing = cgroup_id(cgrp_ongoing);
+		__entry->res = res;
+		__entry->race = race;
+		__entry->ts = ts;
+	),
+
+	TP_printk("root=%d id=%llu level=%d ongoing_flusher=%llu res=%ld race=%llu ts=%lld",
+		  __entry->root, __entry->id, __entry->level,
+		  __entry->id_ongoing, __entry->res, __entry->race, __entry->ts)
+);
+
+DEFINE_EVENT(cgroup_ongoing, cgroup_ongoing_flusher,
+
+	TP_PROTO(struct cgroup *cgrp, struct cgroup *cgrp_ongoing, \
+		 long res, unsigned int race, ktime_t ts),
+
+	TP_ARGS(cgrp, cgrp_ongoing, res, race, ts)
+);
+
+DEFINE_EVENT(cgroup_ongoing, cgroup_ongoing_flusher_wait,
+
+	TP_PROTO(struct cgroup *cgrp, struct cgroup *cgrp_ongoing, \
+		 long res, unsigned int race, ktime_t ts),
+
+	TP_ARGS(cgrp, cgrp_ongoing, res, race, ts)
+);
+
+DEFINE_EVENT(cgroup_ongoing, cgroup_ongoing_flusher_yield,
+
+	TP_PROTO(struct cgroup *cgrp, struct cgroup *cgrp_ongoing, \
+		 long res, unsigned int race, ktime_t ts),
+
+	TP_ARGS(cgrp, cgrp_ongoing, res, race, ts)
+);
+
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index eaa138f2da2f..cf344c0e71b3 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -328,6 +328,7 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
 static bool cgroup_rstat_trylock_flusher(struct cgroup *cgrp)
 {
 	struct cgroup *ongoing;
+	unsigned int race = 0;
 	bool locked;
 
 	/*
@@ -338,17 +339,25 @@ static bool cgroup_rstat_trylock_flusher(struct cgroup *cgrp)
 retry:
 	ongoing = READ_ONCE(cgrp_rstat_ongoing_flusher);
 	if (ongoing && cgroup_is_descendant(cgrp, ongoing)) {
-		wait_for_completion_interruptible_timeout(
+		ktime_t ts = ktime_get_mono_fast_ns();
+		long res = 0;
+
+		trace_cgroup_ongoing_flusher(cgrp, ongoing, 0, race, ts);
+
+		res = wait_for_completion_interruptible_timeout(
 			&ongoing->flush_done, MAX_WAIT);
-		/* TODO: Add tracepoint here */
+		trace_cgroup_ongoing_flusher_wait(cgrp, ongoing, res, race, ts);
+
 		return false;
 	}
 
 	locked = __cgroup_rstat_trylock(cgrp, -1);
 	if (!locked) {
 		/* Contended: Handle losing race for ongoing flusher */
-		if (!ongoing && READ_ONCE(cgrp_rstat_ongoing_flusher))
+		if (!ongoing && READ_ONCE(cgrp_rstat_ongoing_flusher)) {
+			race++;
 			goto retry;
+		}
 
 		__cgroup_rstat_lock(cgrp, -1, true);
 	}
@@ -369,6 +378,9 @@ static bool cgroup_rstat_trylock_flusher(struct cgroup *cgrp)
 			reinit_completion(&cgrp->flush_done);
 			WRITE_ONCE(cgrp_rstat_ongoing_flusher, cgrp);
 		}
+	} else {
+		/* Detect multiple flushers as ongoing yielded lock */
+		trace_cgroup_ongoing_flusher_yield(cgrp, ongoing, 0, 0, 0);
 	}
 	return true;
 }
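
Not part of the patch, but as a usage note for anyone post-processing these
events: they should show up under the cgroup trace subsystem (e.g.
cgroup:cgroup_ongoing_flusher_wait in tracefs or perf), and the "res" field
of the _wait event is simply the return value of
wait_for_completion_interruptible_timeout(). A minimal sketch of a
hypothetical helper for classifying it when replaying a trace:

/*
 * Hypothetical post-processing helper, not part of this patch.
 * "res" is the return value of wait_for_completion_interruptible_timeout():
 *   positive -> ongoing flusher completed, res = jiffies left of MAX_WAIT
 *   zero     -> wait timed out after MAX_WAIT
 *   negative -> -ERESTARTSYS, waiter was interrupted by a signal
 */
static const char *flusher_wait_outcome(long res)
{
	if (res > 0)
		return "completed";
	if (res == 0)
		return "timeout";
	return "interrupted";
}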