On Thu 31-08-23 16:56:08, Yosry Ahmed wrote:
> Most contexts that flush memcg stats use "unified" flushing, where
> basically all flushers attempt to flush the entire hierarchy, but only
> one flusher is allowed at a time, others skip flushing.
>
> This is needed because we need to flush the stats from paths such as
> reclaim or refaults, which may have high concurrency, especially on
> large systems. Serializing such performance-sensitive paths can
> introduce regressions, hence, unified flushing offers a tradeoff between
> stats staleness and the performance impact of flushing stats.
>
> Document this properly and explicitly by renaming the common flushing
> helper from do_flush_stats() to do_unified_stats_flush(), and adding
> documentation to describe unified flushing. Additionally, rename
> flushing APIs to add "try" in the name, which implies that flushing will
> not always happen. Also add proper documentation.
>
> No functional change intended.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>

No objections to renaming but it would be really nice to simplify this.
It is just "funny" to see 4 different flushing methods (flush from
userspace, flush, try_flush_with_ratelimit_1 and
try_flush_with_ratelimit_2). This is all internal so I am not all that
worried that this would get confused but it surely is rather convoluted.

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
>  include/linux/memcontrol.h |  8 ++---
>  mm/memcontrol.c            | 61 +++++++++++++++++++++++++-------------
>  mm/vmscan.c                |  2 +-
>  mm/workingset.c            |  4 +--
>  4 files changed, 47 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 11810a2cfd2d..d517b0cc5221 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1034,8 +1034,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>          return x;
>  }
>
> -void mem_cgroup_flush_stats(void);
> -void mem_cgroup_flush_stats_ratelimited(void);
> +void mem_cgroup_try_flush_stats(void);
> +void mem_cgroup_try_flush_stats_ratelimited(void);
>
>  void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
>                                int val);
> @@ -1519,11 +1519,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>          return node_page_state(lruvec_pgdat(lruvec), idx);
>  }
>
> -static inline void mem_cgroup_flush_stats(void)
> +static inline void mem_cgroup_try_flush_stats(void)
>  {
>  }
>
> -static inline void mem_cgroup_flush_stats_ratelimited(void)
> +static inline void mem_cgroup_try_flush_stats_ratelimited(void)
>  {
>  }
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index cf57fe9318d5..2d0ec828a1c4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -588,7 +588,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>  static void flush_memcg_stats_dwork(struct work_struct *w);
>  static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
>  static DEFINE_PER_CPU(unsigned int, stats_updates);
> -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
> +static atomic_t stats_unified_flush_ongoing = ATOMIC_INIT(0);
>  static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
>  static u64 flush_next_time;
>
> @@ -630,7 +630,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>                  /*
>                   * If stats_flush_threshold exceeds the threshold
>                   * (>num_online_cpus()), cgroup stats update will be triggered
> -                 * in __mem_cgroup_flush_stats(). Increasing this var further
> +                 * in mem_cgroup_try_flush_stats(). Increasing this var further
>                   * is redundant and simply adds overhead in atomic update.
>                   */
>                  if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
> @@ -639,15 +639,19 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>          }
>  }
>
> -static void do_flush_stats(void)
> +/*
> + * do_unified_stats_flush - do a unified flush of memory cgroup statistics
> + *
> + * A unified flush tries to flush the entire hierarchy, but skips if there is
> + * another ongoing flush. This is meant for flushers that may have a lot of
> + * concurrency (e.g. reclaim, refault, etc), and should not be serialized to
> + * avoid slowing down performance-sensitive paths. A unified flush may skip, and
> + * hence may yield stale stats.
> + */
> +static void do_unified_stats_flush(void)
>  {
> -        /*
> -         * We always flush the entire tree, so concurrent flushers can just
> -         * skip. This avoids a thundering herd problem on the rstat global lock
> -         * from memcg flushers (e.g. reclaim, refault, etc).
> -         */
> -        if (atomic_read(&stats_flush_ongoing) ||
> -            atomic_xchg(&stats_flush_ongoing, 1))
> +        if (atomic_read(&stats_unified_flush_ongoing) ||
> +            atomic_xchg(&stats_unified_flush_ongoing, 1))
>                  return;
>
>          WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
> @@ -655,19 +659,34 @@ static void do_flush_stats(void)
>          cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
>
>          atomic_set(&stats_flush_threshold, 0);
> -        atomic_set(&stats_flush_ongoing, 0);
> +        atomic_set(&stats_unified_flush_ongoing, 0);
>  }
>
> -void mem_cgroup_flush_stats(void)
> +/*
> + * mem_cgroup_try_flush_stats - try to flush memory cgroup statistics
> + *
> + * Try to flush the stats of all memcgs that have stat updates since the last
> + * flush. We do not flush the stats if:
> + * - The magnitude of the pending updates is below a certain threshold.
> + * - There is another ongoing unified flush (see do_unified_stats_flush()).
> + *
> + * Hence, the stats may be stale, but ideally by less than FLUSH_TIME due to
> + * periodic flushing.
> + */
> +void mem_cgroup_try_flush_stats(void)
>  {
>          if (atomic_read(&stats_flush_threshold) > num_online_cpus())
> -                do_flush_stats();
> +                do_unified_stats_flush();
>  }
>
> -void mem_cgroup_flush_stats_ratelimited(void)
> +/*
> + * Like mem_cgroup_try_flush_stats(), but only flushes if the periodic flusher
> + * is late.
> + */
> +void mem_cgroup_try_flush_stats_ratelimited(void)
>  {
>          if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> -                mem_cgroup_flush_stats();
> +                mem_cgroup_try_flush_stats();
>  }
>
>  static void flush_memcg_stats_dwork(struct work_struct *w)
> @@ -676,7 +695,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>           * Always flush here so that flushing in latency-sensitive paths is
>           * as cheap as possible.
>           */
> -        do_flush_stats();
> +        do_unified_stats_flush();
>          queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
>  }
>
> @@ -1576,7 +1595,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>           *
>           * Current memory state:
>           */
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>
>          for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
>                  u64 size;
> @@ -4018,7 +4037,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
>          int nid;
>          struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
>
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>
>          for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
>                  seq_printf(m, "%s=%lu", stat->name,
> @@ -4093,7 +4112,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>
>          BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
>
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>
>          for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
>                  unsigned long nr;
> @@ -4595,7 +4614,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>          struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
>          struct mem_cgroup *parent;
>
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>
>          *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
>          *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
> @@ -6610,7 +6629,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
>          int i;
>          struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
>
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>
>          for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
>                  int nid;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c7c149cb8d66..457a18921fda 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2923,7 +2923,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
>           * Flush the memory cgroup stats, so that we read accurate per-memcg
>           * lruvec stats for heuristics.
>           */
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>
>          /*
>           * Determine the scan balance between anon and file LRUs.
> diff --git a/mm/workingset.c b/mm/workingset.c
> index da58a26d0d4d..affb8699e58d 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -520,7 +520,7 @@ void workingset_refault(struct folio *folio, void *shadow)
>          }
>
>          /* Flush stats (and potentially sleep) before holding RCU read lock */
> -        mem_cgroup_flush_stats_ratelimited();
> +        mem_cgroup_try_flush_stats_ratelimited();
>
>          rcu_read_lock();
>
> @@ -664,7 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
>          struct lruvec *lruvec;
>          int i;
>
> -        mem_cgroup_flush_stats();
> +        mem_cgroup_try_flush_stats();
>          lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
>          for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
>                  pages += lruvec_page_state_local(lruvec,
> --
> 2.42.0.rc2.253.gd59a3bf2b4-goog

--
Michal Hocko
SUSE Labs
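A side note on the consolidation suggested above: after this patch the two
"try" entry points differ only in their gating condition, so they could in
principle collapse into a single helper. The sketch below is hypothetical and
not part of the patch or the series; it assumes the statics and helpers from
the diff (stats_flush_threshold, flush_next_time, do_unified_stats_flush())
and would sit next to them in mm/memcontrol.c. The name
try_flush_memcg_stats() and its only_if_flush_is_late parameter are made up
for illustration.

/*
 * Hypothetical consolidation sketch (not from the patch): both "try"
 * entry points funnel into one helper; the ratelimited variant only
 * adds a "periodic flusher is late" check before the threshold check.
 */
static void try_flush_memcg_stats(bool only_if_flush_is_late)
{
        /* Ratelimited callers bail out unless the periodic flusher is late. */
        if (only_if_flush_is_late &&
            !time_after64(jiffies_64, READ_ONCE(flush_next_time)))
                return;

        /* Skip when the pending updates are below the flush threshold. */
        if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
                return;

        /* Unified flush: one flusher at a time, concurrent callers skip. */
        do_unified_stats_flush();
}

Callers such as reclaim and the stat readers would then pass false, while the
refault path would pass true, matching the behavior of
mem_cgroup_try_flush_stats() and mem_cgroup_try_flush_stats_ratelimited() in
the patch.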