On 2/28/19 9:14 AM, Yafang Shao wrote:
> In the page alloc fast path, it may do node reclaim, which may cause
> a latency spike.
> We should add a tracepoint for this event, and also measure the latency
> it causes.
>
> So the two tracepoints below are introduced:
> 	mm_vmscan_node_reclaim_begin
> 	mm_vmscan_node_reclaim_end
>
> Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
> ---
>  include/trace/events/vmscan.h | 48 +++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                   | 13 +++++++++++-
>  2 files changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index a1cb913..9310d5b 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -465,6 +465,54 @@
>  		__entry->ratio,
>  		show_reclaim_flags(__entry->reclaim_flags))
>  );
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_begin,
> +
> +	TP_PROTO(int nid, int order, int may_writepage,
> +		gfp_t gfp_flags, int zid),
> +
> +	TP_ARGS(nid, order, may_writepage, gfp_flags, zid),
> +
> +	TP_STRUCT__entry(
> +		__field(int, nid)
> +		__field(int, order)
> +		__field(int, may_writepage)

For node reclaim, may_writepage is determined entirely by node_reclaim_mode,
so I'm not sure it's worth including it.

> +		__field(gfp_t, gfp_flags)
> +		__field(int, zid)

zid seems wasteful and misleading, as it's simply derived from the gfp mask
via gfp_zone(gfp_mask), so I would drop it.

> +	),
> +
> +	TP_fast_assign(
> +		__entry->nid = nid;
> +		__entry->order = order;
> +		__entry->may_writepage = may_writepage;
> +		__entry->gfp_flags = gfp_flags;
> +		__entry->zid = zid;
> +	),
> +
> +	TP_printk("nid=%d zid=%d order=%d may_writepage=%d gfp_flags=%s",
> +		__entry->nid,
> +		__entry->zid,
> +		__entry->order,
> +		__entry->may_writepage,
> +		show_gfp_flags(__entry->gfp_flags))
> +);
> +
> +TRACE_EVENT(mm_vmscan_node_reclaim_end,
> +
> +	TP_PROTO(int result),
> +
> +	TP_ARGS(result),
> +
> +	TP_STRUCT__entry(
> +		__field(int, result)

Reporting sc.nr_reclaimed sounds more useful and more in line with the other
reclaim tracepoints. The result (sc.nr_reclaimed >= nr_pages) can then be
derived by postprocessing, since the begin tracepoint contains 'order' and
thus we know nr_pages.

> +	),
> +
> +	TP_fast_assign(
> +		__entry->result = result;
> +	),
> +
> +	TP_printk("result=%d", __entry->result)
> +);
>  #endif /* _TRACE_VMSCAN_H */
>
>  /* This part must be outside protection */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ac4806f..01a0401 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4240,6 +4240,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  		.may_swap = 1,
>  		.reclaim_idx = gfp_zone(gfp_mask),
>  	};
> +	int result;
> +
> +	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
> +					   sc.may_writepage,
> +					   sc.gfp_mask,
> +					   sc.reclaim_idx);
>
>  	cond_resched();
>  	fs_reclaim_acquire(sc.gfp_mask);
> @@ -4267,7 +4273,12 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  	current->flags &= ~PF_SWAPWRITE;
>  	memalloc_noreclaim_restore(noreclaim_flag);
>  	fs_reclaim_release(sc.gfp_mask);
> -	return sc.nr_reclaimed >= nr_pages;
> +
> +	result = sc.nr_reclaimed >= nr_pages;
> +
> +	trace_mm_vmscan_node_reclaim_end(result);
> +
> +	return result;
>  }
>
>  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
>
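
Putting the three comments above together, something like the below is what
I have in mind. This is only an untested sketch to illustrate the direction,
not a replacement patch:

/* The begin tracepoint only records what cannot be derived elsewhere:
 * zid is gfp_zone(gfp_flags) and may_writepage follows node_reclaim_mode,
 * so neither needs its own field.
 */
TRACE_EVENT(mm_vmscan_node_reclaim_begin,

	TP_PROTO(int nid, int order, gfp_t gfp_flags),

	TP_ARGS(nid, order, gfp_flags),

	TP_STRUCT__entry(
		__field(int, nid)
		__field(int, order)
		__field(gfp_t, gfp_flags)
	),

	TP_fast_assign(
		__entry->nid = nid;
		__entry->order = order;
		__entry->gfp_flags = gfp_flags;
	),

	TP_printk("nid=%d order=%d gfp_flags=%s",
		__entry->nid,
		__entry->order,
		show_gfp_flags(__entry->gfp_flags))
);

/* The end tracepoint reports the number of reclaimed pages; success or
 * failure can be reconstructed from 'order' in the begin event.
 */
TRACE_EVENT(mm_vmscan_node_reclaim_end,

	TP_PROTO(unsigned long nr_reclaimed),

	TP_ARGS(nr_reclaimed),

	TP_STRUCT__entry(
		__field(unsigned long, nr_reclaimed)
	),

	TP_fast_assign(
		__entry->nr_reclaimed = nr_reclaimed;
	),

	TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed)
);

The call sites in __node_reclaim() then simplify accordingly, and the extra
'result' local is no longer needed:

	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask);
	...
	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);

	return sc.nr_reclaimed >= nr_pages;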