Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
> On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3880,6 +3880,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
>>   * @pheadroom: out parameter for number of allocatable pages according to memcg
>>   * @pdirty: out parameter for number of dirty pages
>>   * @pwriteback: out parameter for number of pages under writeback
>> + * @exact: determines exact counters are required, indicates more work.
>>   *
>>   * Determine the numbers of file, headroom, dirty, and writeback pages in
>>   * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
>> @@ -3890,18 +3891,29 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
>>   * ancestors. Note that this doesn't consider the actual amount of
>>   * available memory in the system. The caller should further cap
>>   * *@pheadroom accordingly.
>> + *
>> + * Return value is the error precision associated with *@pdirty
>> + * and *@pwriteback. When @exact is set this a minimal value.
>>   */
>> -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>> -			 unsigned long *pheadroom, unsigned long *pdirty,
>> -			 unsigned long *pwriteback)
>> +unsigned long
>> +mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>> +		    unsigned long *pheadroom, unsigned long *pdirty,
>> +		    unsigned long *pwriteback, bool exact)
>>  {
>>  	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
>>  	struct mem_cgroup *parent;
>> +	unsigned long precision;
>>
>> -	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
>> -
>> +	if (exact) {
>> +		precision = 0;
>> +		*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
>> +		*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
>> +	} else {
>> +		precision = MEMCG_CHARGE_BATCH * num_online_cpus();
>> +		*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
>> +		*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
>> +	}
>>  	/* this should eventually include NR_UNSTABLE_NFS */
>> -	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
>>  	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
>>  						     (1 << LRU_ACTIVE_FILE));
>>  	*pheadroom = PAGE_COUNTER_MAX;
>> @@ -3913,6 +3925,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>>  		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
>>  		memcg = parent;
>>  	}
>> +
>> +	return precision;
>
> Have you considered unconditionally using the exact version here?
>
> It does for_each_online_cpu(), but until very, very recently we did
> this per default for all stats, for years. It only became a problem in
> conjunction with the for_each_memcg loops when frequently reading
> memory stats at the top of a very large hierarchy.
>
> balance_dirty_pages() is called against memcgs that actually own the
> inodes/memory and doesn't do the additional recursive tree collection.
>
> It's also not *that* hot of a function, and in the io path...
>
> It would simplify this patch immensely.

Good idea. Done in -v2 of the patch:
https://lore.kernel.org/lkml/20190329174609.164344-1-gthelen@xxxxxxxxxx/
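
For anyone following the thread, here is a rough sketch of the direction that simplification takes: drop the @exact parameter and always read the dirty/writeback counters exactly. This is an illustration only, not the literal -v2 hunk (see the link above for the real patch); it assumes the v5.0-era memcontrol.c layout where memcg->stat holds the atomic counters and memcg->stat_cpu holds the per-CPU deltas that are flushed in MEMCG_CHARGE_BATCH-sized batches.

/*
 * Sketch: read one memcg stat exactly by folding in the per-CPU deltas
 * that have not yet been flushed into the atomic counter.  Each online
 * CPU can be holding up to MEMCG_CHARGE_BATCH pages of delta, which is
 * the imprecision a plain memcg_page_state() read leaves behind.
 */
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
{
	long x = atomic_long_read(&memcg->stat[idx]);
#ifdef CONFIG_SMP
	int cpu;

	for_each_online_cpu(cpu)
		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
	if (x < 0)
		x = 0;
#endif
	return x;
}

mem_cgroup_wb_stats() can then keep its original void signature and simply do:

	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
	/* this should eventually include NR_UNSTABLE_NFS */
	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);

The per-CPU walk is bounded by num_online_cpus(), and, as Johannes notes above, balance_dirty_pages() only reads these counters for the memcg that owns the inode, so there is no multiplicative for_each_memcg cost.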