On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3880,6 +3880,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
>   * @pheadroom: out parameter for number of allocatable pages according to memcg
>   * @pdirty: out parameter for number of dirty pages
>   * @pwriteback: out parameter for number of pages under writeback
> + * @exact: determines exact counters are required, indicates more work.
>   *
>   * Determine the numbers of file, headroom, dirty, and writeback pages in
>   * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
> @@ -3890,18 +3891,29 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
>   * ancestors.  Note that this doesn't consider the actual amount of
>   * available memory in the system.  The caller should further cap
>   * *@pheadroom accordingly.
> + *
> + * Return value is the error precision associated with *@pdirty
> + * and *@pwriteback.  When @exact is set this a minimal value.
>   */
> -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> -			 unsigned long *pheadroom, unsigned long *pdirty,
> -			 unsigned long *pwriteback)
> +unsigned long
> +mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> +		    unsigned long *pheadroom, unsigned long *pdirty,
> +		    unsigned long *pwriteback, bool exact)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
>  	struct mem_cgroup *parent;
> +	unsigned long precision;
>  
> -	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
> -
> +	if (exact) {
> +		precision = 0;
> +		*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
> +		*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
> +	} else {
> +		precision = MEMCG_CHARGE_BATCH * num_online_cpus();
> +		*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
> +		*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
> +	}
>  	/* this should eventually include NR_UNSTABLE_NFS */
> -	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
>  	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
>  						     (1 << LRU_ACTIVE_FILE));
>  	*pheadroom = PAGE_COUNTER_MAX;
> @@ -3913,6 +3925,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>  		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
>  		memcg = parent;
>  	}
> +
> +	return precision;

Have you considered unconditionally using the exact version here?

It does for_each_online_cpu(), but until very, very recently we did
this by default for all stats, for years. It only became a problem in
conjunction with the for_each_memcg loops when frequently reading
memory stats at the top of a very large hierarchy.

balance_dirty_pages() is called against memcgs that actually own the
inodes/memory and doesn't do the additional recursive tree collection.
It's also not *that* hot of a function, and it's in the IO path...

It would simplify this patch immensely.
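
Something like the below is roughly what I have in mind - completely
untested, and it assumes memcg_exact_page_state() from your patch stays
as-is (folding the per-cpu deltas with for_each_online_cpu()), with the
surrounding headroom walk unchanged from the current code:

void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
			 unsigned long *pheadroom, unsigned long *pdirty,
			 unsigned long *pwriteback)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
	struct mem_cgroup *parent;

	/*
	 * Always fold in the per-cpu deltas, so the exact/inexact split
	 * and the precision return value go away entirely.
	 */
	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);

	/* this should eventually include NR_UNSTABLE_NFS */
	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
						     (1 << LRU_ACTIVE_FILE));
	*pheadroom = PAGE_COUNTER_MAX;

	/* The headroom walk over the ancestors stays as it is today. */
	while ((parent = parent_mem_cgroup(memcg))) {
		unsigned long ceiling = min(memcg->memory.max, memcg->high);
		unsigned long used = page_counter_read(&memcg->memory);

		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
		memcg = parent;
	}
}

The callers could then keep the existing prototype, and the precision
plumbing on the writeback side wouldn't be needed either.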