Re: [PATCH v2] mm/vmstat: sum up all possible CPUs instead of using vm_events_fold_cpu

On Fri, May 3, 2024 at 5:16 PM Ryan Roberts <ryan.roberts@xxxxxxx> wrote:
>
> On 03/05/2024 03:09, Barry Song wrote:
> > From: Barry Song <v-songbaohua@xxxxxxxx>
> >
> > When a CPU goes offline, the current code folds its vm_events
> > into an online CPU, because the summation only considers online
> > CPUs. This is a crude workaround. By summing over all possible
> > CPUs instead, we can eliminate vm_events_fold_cpu() entirely.
> >
> > Suggested-by: Ryan Roberts <ryan.roberts@xxxxxxx>
> > Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
> > ---
> >  originally suggested by Ryan while he reviewed mTHP counters
> >  patchset[1]; I am also applying this suggestion to vm_events
> >
> >  -v2:
> >  also drop cpus_read_lock() as we don't care about cpu hotplug any more;
> >  -v1:
> >  https://lore.kernel.org/linux-mm/20240412123039.442743-1-21cnbao@xxxxxxxxx/
> >
> >  [1] https://lore.kernel.org/linux-mm/ca73cbf1-8304-4790-a721-3c3a42f9d293@xxxxxxx/
> >
> >  include/linux/vmstat.h |  5 -----
> >  mm/page_alloc.c        |  8 --------
> >  mm/vmstat.c            | 21 +--------------------
> >  3 files changed, 1 insertion(+), 33 deletions(-)
> >
> > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> > index 735eae6e272c..f7eaeb8bfa47 100644
> > --- a/include/linux/vmstat.h
> > +++ b/include/linux/vmstat.h
> > @@ -83,8 +83,6 @@ static inline void count_vm_events(enum vm_event_item item, long delta)
> >
> >  extern void all_vm_events(unsigned long *);
> >
> > -extern void vm_events_fold_cpu(int cpu);
> > -
> >  #else
> >
> >  /* Disable counters */
> > @@ -103,9 +101,6 @@ static inline void __count_vm_events(enum vm_event_item item, long delta)
> >  static inline void all_vm_events(unsigned long *ret)
> >  {
> >  }
> > -static inline void vm_events_fold_cpu(int cpu)
> > -{
> > -}
> >
> >  #endif /* CONFIG_VM_EVENT_COUNTERS */
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index cd584aace6bf..8b56d785d587 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -5826,14 +5826,6 @@ static int page_alloc_cpu_dead(unsigned int cpu)
> >       mlock_drain_remote(cpu);
> >       drain_pages(cpu);
> >
> > -     /*
> > -      * Spill the event counters of the dead processor
> > -      * into the current processors event counters.
> > -      * This artificially elevates the count of the current
> > -      * processor.
> > -      */
> > -     vm_events_fold_cpu(cpu);
> > -
> >       /*
> >        * Zero the differential counters of the dead processor
> >        * so that the vm statistics are consistent.
> > diff --git a/mm/vmstat.c b/mm/vmstat.c
> > index db79935e4a54..aaa32418652e 100644
> > --- a/mm/vmstat.c
> > +++ b/mm/vmstat.c
> > @@ -114,7 +114,7 @@ static void sum_vm_events(unsigned long *ret)
> >
> >       memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
> >
> > -     for_each_online_cpu(cpu) {
> > +     for_each_possible_cpu(cpu) {
>
> One thought comes to mind (due to my lack of understanding exactly what
> "possible" means): Linux is compiled with a max number of cpus - NR_CPUS - 512
> for arm64's defconfig. Does all possible cpus include all 512? On an 8 CPU
> system that would be increasing the number of loops by 64 times.
>
> Or perhaps possible just means CPUs that have ever been online?

Hi Ryan,

On arm64, the possible CPUs are enumerated from the device tree or ACPI.
Both give far fewer CPUs than NR_CPUS.

/*
 * Enumerate the possible CPU set from the device tree or ACPI and build the
 * cpu logical map array containing MPIDR values related to logical
 * cpus. Assumes that cpu_logical_map(0) has already been initialized.
 */
void __init smp_init_cpus(void)

For the device tree case, it is:

/*
 * Enumerate the possible CPU set from the device tree and build the
 * cpu logical map array containing MPIDR values related to logical
 * cpus. Assumes that cpu_logical_map(0) has already been initialized.
 */
static void __init of_parse_and_init_cpus(void)
{
        struct device_node *dn;

        for_each_of_cpu_node(dn) {
                u64 hwid = of_get_cpu_hwid(dn, 0);

                if (hwid & ~MPIDR_HWID_BITMASK)
                        goto next;

                if (is_mpidr_duplicate(cpu_count, hwid)) {
                        pr_err("%pOF: duplicate cpu reg properties in the DT\n",
                                dn);
                        goto next;
                }

                /*
                 * The numbering scheme requires that the boot CPU
                 * must be assigned logical id 0. Record it so that
                 * the logical map built from DT is validated and can
                 * be used.
                 */
                if (hwid == cpu_logical_map(0)) {
                        if (bootcpu_valid) {
                                pr_err("%pOF: duplicate boot cpu reg property in DT\n",
                                        dn);
                                goto next;
                        }

                        bootcpu_valid = true;
                        early_map_cpu_to_node(0, of_node_to_nid(dn));

                        /*
                         * cpu_logical_map has already been
                         * initialized and the boot cpu doesn't need
                         * the enable-method so continue without
                         * incrementing cpu.
                         */
                        continue;
                }

                if (cpu_count >= NR_CPUS)
                        goto next;

                pr_debug("cpu logical map 0x%llx\n", hwid);
                set_cpu_logical_map(cpu_count, hwid);

                early_map_cpu_to_node(cpu_count, of_node_to_nid(dn));
next:
                cpu_count++;
        }
}

Even on ARM32, the possible set is sometimes obtained from
scu_get_core_count(), e.g.:
static void __init omap4_smp_init_cpus(void)
{
        unsigned int i = 0, ncores = 1, cpu_id;

        /* Use ARM cpuid check here, as SoC detection will not work so early */
        cpu_id = read_cpuid_id() & CPU_MASK;
        if (cpu_id == CPU_CORTEX_A9) {
                /*
                 * Currently we can't call ioremap here because
                 * SoC detection won't work until after init_early.
                 */
                cfg.scu_base =  OMAP2_L4_IO_ADDRESS(scu_a9_get_base());
                BUG_ON(!cfg.scu_base);
                ncores = scu_get_core_count(cfg.scu_base);
        } else if (cpu_id == CPU_CORTEX_A15) {
                ncores = OMAP5_CORE_COUNT;
        }

        /* sanity check */
        if (ncores > nr_cpu_ids) {
                pr_warn("SMP: %u cores greater than maximum (%u), clipping\n",
                        ncores, nr_cpu_ids);
                ncores = nr_cpu_ids;
        }

        for (i = 0; i < ncores; i++)
                set_cpu_possible(i, true);
}

Other architectures do essentially the same job, so the possible mask
is bounded by the hardware-reported core count, not by NR_CPUS.



>
> Either way, I guess it's not considered a performance bottleneck because, from
> memory, the scheduler and many other places are iterating over all possible cpus.
>
> >               struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
> >
> >               for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
> > @@ -129,29 +129,10 @@ static void sum_vm_events(unsigned long *ret)
> >  */
> >  void all_vm_events(unsigned long *ret)
> >  {
> > -     cpus_read_lock();
> >       sum_vm_events(ret);
> > -     cpus_read_unlock();
> >  }
> >  EXPORT_SYMBOL_GPL(all_vm_events);
> >
> > -/*
> > - * Fold the foreign cpu events into our own.
> > - *
> > - * This is adding to the events on one processor
> > - * but keeps the global counts constant.
> > - */
> > -void vm_events_fold_cpu(int cpu)
> > -{
> > -     struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
> > -     int i;
> > -
> > -     for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
> > -             count_vm_events(i, fold_state->event[i]);
> > -             fold_state->event[i] = 0;
> > -     }
> > -}
> > -
> >  #endif /* CONFIG_VM_EVENT_COUNTERS */
> >
> >  /*
>

Thanks
Barry
