On Wed, Mar 2, 2022 at 2:50 AM 'Shakeel Butt' via kernel-team+notifications
<kernel-team@xxxxxxxxxxxxxx> wrote:
>
> On Tue, Mar 01, 2022 at 04:48:00PM -0800, Ivan Babrou wrote:
> > [...]
> > > Looks like you were right that targeted flush is not going to be as good.
> >
> Thanks a lot. Can you please try the following patch (independent of other
> patches) as well?

Hi Shakeel,

Maybe it's just me, but the code change below confuses me. I'll comment inline.

>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d9b8df5ef212..499f75e066f3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -75,29 +75,8 @@ enum mem_cgroup_events_target {
>  	MEM_CGROUP_NTARGETS,
>  };
>
> -struct memcg_vmstats_percpu {
> -	/* Local (CPU and cgroup) page state & events */
> -	long state[MEMCG_NR_STAT];
> -	unsigned long events[NR_VM_EVENT_ITEMS];
> -
> -	/* Delta calculation for lockless upward propagation */
> -	long state_prev[MEMCG_NR_STAT];
> -	unsigned long events_prev[NR_VM_EVENT_ITEMS];
> -
> -	/* Cgroup1: threshold notifications & softlimit tree updates */
> -	unsigned long nr_page_events;
> -	unsigned long targets[MEM_CGROUP_NTARGETS];
> -};
> -
> -struct memcg_vmstats {
> -	/* Aggregated (CPU and subtree) page state & events */
> -	long state[MEMCG_NR_STAT];
> -	unsigned long events[NR_VM_EVENT_ITEMS];
> -
> -	/* Pending child counts during tree propagation */
> -	long state_pending[MEMCG_NR_STAT];
> -	unsigned long events_pending[NR_VM_EVENT_ITEMS];
> -};
> +struct memcg_vmstats_percpu;
> +struct memcg_vmstats;
>
>  struct mem_cgroup_reclaim_iter {
>  	struct mem_cgroup *position;
> @@ -304,7 +283,7 @@ struct mem_cgroup {
>  	MEMCG_PADDING(_pad1_);
>
>  	/* memory.stat */
> -	struct memcg_vmstats vmstats;
> +	struct memcg_vmstats *vmstats;
>
>  	/* memory.events */
>  	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
> @@ -964,11 +943,7 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
>  	local_irq_restore(flags);
>  }
>
> -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> -{
> -	return READ_ONCE(memcg->vmstats.state[idx]);
> -}
> -
> +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
>  static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
>  					      enum node_stat_item idx)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 32ba963ebf2e..c65f155c2048 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -688,6 +688,71 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>  	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
>  }
>
> +static const unsigned int memcg_vm_events[] = {
> +	PGPGIN,
> +	PGPGOUT,
> +	PGFAULT,
> +	PGMAJFAULT,
> +	PGREFILL,
> +	PGSCAN_KSWAPD,
> +	PGSCAN_DIRECT,
> +	PGSTEAL_KSWAPD,
> +	PGSTEAL_DIRECT,
> +	PGACTIVATE,
> +	PGDEACTIVATE,
> +	PGLAZYFREE,
> +	PGLAZYFREED,
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	THP_FAULT_ALLOC,
> +	THP_COLLAPSE_ALLOC,
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> +};
> +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_events)
> +
> +static int memcg_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
> +
> +static void init_memcg_events(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
> +		memcg_events_index[memcg_vm_events[i]] = i + 1;
> +}
> +
> +static int get_memcg_events_index(enum vm_event_item idx)
> +{
> +	return memcg_events_index[idx] - 1;
> +}

It's this part. The table memcg_vm_events[] is basically a "truth set": if
an event is in the set, memcg_events_index[] gets an entry for it. The code
reads as if one were attempting to re-base C arrays to start at one, though,
and then the accessor function translates the "unsigned zero" back into a
"signed -1" ... all for what amounts to a truth test.
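To spell out how I read that encoding, here is a minimal standalone sketch;
the EV_* names and the table contents are made up for illustration, they are
not from the patch:

#include <stdio.h>

enum { EV_A, EV_B, EV_C, EV_D, NR_EVENTS };	/* stand-in for vm_event_item */

static const unsigned int tracked[] = { EV_B, EV_D };	/* the "truth set" */
static int index_of[NR_EVENTS];	/* 0 means "not tracked", i + 1 otherwise */

int main(void)
{
	unsigned int i;

	/* init: re-base the dense index to start at one */
	for (i = 0; i < sizeof(tracked) / sizeof(tracked[0]); ++i)
		index_of[tracked[i]] = i + 1;

	/* accessor: translate the "unsigned zero" back to "signed -1" */
	for (i = 0; i < NR_EVENTS; ++i)
		printf("event %u -> index %d\n", i, index_of[i] - 1);

	/* prints: 0 -> -1, 1 -> 0, 2 -> -1, 3 -> 1 */
	return 0;
}

The value 0 in memcg_events_index[] is overloaded to mean "event not in the
set", which is why init_memcg_events() stores i + 1 and
get_memcg_events_index() has to subtract it again.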
> +
> +struct memcg_vmstats_percpu {
> +	/* Local (CPU and cgroup) page state & events */
> +	long state[MEMCG_NR_STAT];
> +	unsigned long events[NR_MEMCG_EVENTS];
> +
> +	/* Delta calculation for lockless upward propagation */
> +	long state_prev[MEMCG_NR_STAT];
> +	unsigned long events_prev[NR_MEMCG_EVENTS];
> +
> +	/* Cgroup1: threshold notifications & softlimit tree updates */
> +	unsigned long nr_page_events;
> +	unsigned long targets[MEM_CGROUP_NTARGETS];
> +};
> +
> +struct memcg_vmstats {
> +	/* Aggregated (CPU and subtree) page state & events */
> +	long state[MEMCG_NR_STAT];
> +	unsigned long events[NR_MEMCG_EVENTS];
> +
> +	/* Pending child counts during tree propagation */
> +	long state_pending[MEMCG_NR_STAT];
> +	unsigned long events_pending[NR_MEMCG_EVENTS];
> +};
> +
> +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> +{
> +	return READ_ONCE(memcg->vmstats->state[idx]);
> +}
> +
>  /**
>   * __mod_memcg_state - update cgroup memory statistics
>   * @memcg: the memory cgroup
> @@ -831,25 +896,33 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
>  void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
>  			  unsigned long count)
>  {
> -	if (mem_cgroup_disabled())
> +	int index = get_memcg_events_index(idx);
> +
> +	if (mem_cgroup_disabled() || index < 0)

And this is where it gets used; basically, it says "if the event is not in
the monitored set, return". Likewise in all the hunks below. Why not make
that explicit? Something like:

static int memcg_event_exists_percg(int index)
{
	switch (index) {
	case PGPGIN:
	case PGPGOUT:
	case PGFAULT:
	case PGMAJFAULT:
	case PGREFILL:
	case PGSCAN_KSWAPD:
	case PGSCAN_DIRECT:
	case PGSTEAL_KSWAPD:
	case PGSTEAL_DIRECT:
	case PGACTIVATE:
	case PGDEACTIVATE:
	case PGLAZYFREE:
	case PGLAZYFREED:
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case THP_FAULT_ALLOC:
	case THP_COLLAPSE_ALLOC:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
		return 1;
	default:
		return 0;
	}
}

and let the compiler choose how to lay it out at each use site?

>  		return;
>
> -	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
> +	__this_cpu_add(memcg->vmstats_percpu->events[index], count);

[ ... ]

I understand there is a memory saving from the different table types/sizes,
since the vm events tracked per memcg are a (small) subset of all vm events.
How relevant is that saving to this change, though?

Best regards,
Frank Hofmann