On Wed, Jul 24, 2024 at 08:21:01PM +0000, Roman Gushchin wrote: > --- a/include/linux/page_counter.h > +++ b/include/linux/page_counter.h > @@ -5,14 +5,71 @@ > #include <linux/atomic.h> > #include <linux/cache.h> > #include <linux/limits.h> > +#include <linux/mm_types.h> > #include <asm/page.h> > > +/* > + * Page counters are used by memory and hugetlb cgroups. > + * Memory cgroups are using up to 4 separate counters: > + * memory, swap (memory + swap on cgroup v1), kmem and tcpmem. > + * Hugetlb cgroups are using 2 arrays of counters with HUGE_MAX_HSTATE > + * in each to track the usage and reservations of each supported > + * hugepage size. > + * > + * Protection (min/low) is supported only for the first counter > + * with idx 0 and only if the counter was initialized with the protection > + * support. > + */ > + > +enum mem_counter_type { > +#ifdef CONFIG_MEMCG > + /* Unified memory counter */ > + MCT_MEM, > + > + /* Swap */ > + MCT_SWAP, > + > + /* Memory + swap */ > + MCT_MEMSW = MCT_SWAP, > + > +#ifdef CONFIG_MEMCG_V1 > + /* Kernel memory */ > + MCT_KMEM, > + > + /* Tcp memory */ > + MCT_TCPMEM, > +#endif /* CONFIG_MEMCG_V1 */ > +#endif /* CONFIG_MEMCG */ > + > + /* Maximum number of memcg counters */ > + MCT_NR_MEMCG_ITEMS, > +}; > + > +#ifdef CONFIG_CGROUP_HUGETLB > +#ifdef HUGE_MAX_HSTATE > +#define MCT_NR_HUGETLB_ITEMS HUGE_MAX_HSTATE > +#else > +#define MCT_NR_HUGETLB_ITEMS 1 > +#endif > + > +/* > + * max() can't be used here: even though __builtin_choose_expr() evaluates > + * to true, the false clause generates a compiler error: > + * error: braced-group within expression allowed only inside a function . > + */ > +#define MCT_NR_ITEMS (__builtin_choose_expr(MCT_NR_MEMCG_ITEMS > MCT_NR_HUGETLB_ITEMS, \ > + MCT_NR_MEMCG_ITEMS, MCT_NR_HUGETLB_ITEMS)) > + > +#else /* CONFIG_CGROUP_HUGETLB */ > +#define MCT_NR_ITEMS MCT_NR_MEMCG_ITEMS > +#endif /* CONFIG_CGROUP_HUGETLB */ > + > struct page_counter { > /* > * Make sure 'usage' does not share cacheline with any other field. The > * memcg->memory.usage is a hot member of struct mem_cgroup. > */ > - atomic_long_t usage; > + atomic_long_t usage[MCT_NR_ITEMS]; > CACHELINE_PADDING(_pad1_); > > /* effective memory.min and memory.min usage tracking */ > @@ -25,9 +82,9 @@ struct page_counter { > atomic_long_t low_usage; > atomic_long_t children_low_usage; > > - unsigned long watermark; > - unsigned long local_watermark; /* track min of fd-local resets */ > - unsigned long failcnt; > + unsigned long watermark[MCT_NR_ITEMS]; > + unsigned long local_watermark[MCT_NR_ITEMS]; /* track min of fd-local resets */ > + unsigned long failcnt[MCT_NR_ITEMS]; > > /* Keep all the read most fields in a separete cacheline. */ > CACHELINE_PADDING(_pad2_); > @@ -35,8 +92,9 @@ struct page_counter { > bool protection_support; > unsigned long min; > unsigned long low; > - unsigned long high; > - unsigned long max; > + unsigned long high[MCT_NR_ITEMS]; > + unsigned long max[MCT_NR_ITEMS]; > + > struct page_counter *parent; > } ____cacheline_internodealigned_in_smp; This hardcodes way too much user specifics into what should be a self-contained piece of abstraction. Should anybody else try to use the 'struct page_counter' for a single resource elsewhere, they'd end up with up to 5 counters, watermarks, failcnts, highs and maxs etc. It's clear that the hierarchical aspect of the page_counter no longer works. It was okay-ish when all we had was a simple hard limit. Now we carry to much stuff in it that is only useful to a small set of users. Instead of doubling down and repeating the mistakes we made for struct page, it would be better to move user specific stuff out of it entirely. I think there are two patterns that would be more natural: a) pass a callback function and have a priv pointer in page_counter for user-specifics; or b) remove the hierarchy aspect from the page counter entirely, let the *caller* walk the tree, call the page counter code to trycharge (and log watermark) only that level, and handle everything caller specific on the caller side. All users already have parent linkage in the css, so this would eliminate the parent pointer in page_counter altogether. The protection stuff could be moved to memcg proper, eliminating yet more fields that aren't interesting to any other user. You'd save more memory, while at the same time keeping struct page_counter clean and generic.