Re: [PATCH v2] mm,memcg: provide per-cgroup counters for NUMA balancing operations

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Aug 12, 2024 at 3:29 PM <kaiyang2@xxxxxxxxxx> wrote:
>
> From: Kaiyang Zhao <kaiyang2@xxxxxxxxxx>
>
> v2:
> - fixed compilation error when CONFIG_NUMA_BALANCING is off
> - fixed doc warning due to missing parameter description in
> get_mem_cgroup_from_folio
>
> Reported-by: kernel test robot <lkp@xxxxxxxxx>
> Closes: https://lore.kernel.org/oe-kbuild-all/202408110848.pqaWv5zD-lkp@xxxxxxxxx/
> Closes: https://lore.kernel.org/oe-kbuild-all/202408110708.gCHsUKRI-lkp@xxxxxxxxx/
> Closes: https://lore.kernel.org/oe-kbuild-all/202408110706.DZD0TOV3-lkp@xxxxxxxxx/
> Reported-by: Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx>
> ---
> The ability to observe the demotion and promotion decisions made by the
> kernel on a per-cgroup basis is important for monitoring and tuning
> containerized workloads on either NUMA machines or machines
> equipped with tiered memory.
>
> Different containers in the system may experience drastically different
> memory tiering actions that cannot be distinguished from the global
> counters alone.
>
> For example, a container running a workload with much hotter memory
> accesses will likely see more promotions and fewer demotions,
> potentially depriving a colocated container of top tier memory to such
> an extent that its performance degrades unacceptably.
>
> For another example, some containers may exhibit longer periods between
> data reuse, causing many more numa_hint_faults than numa_pages_migrated.
> In this case, tuning hot_threshold_ms may be appropriate, but the signal
> can easily be lost if only global counters are available.
>
> This patch set adds six counters to
> memory.stat in a cgroup: numa_pages_migrated, numa_pte_updates,
> numa_hint_faults, pgdemote_kswapd, pgdemote_direct and
> pgdemote_khugepaged.
>
> count_memcg_events_mm() is added to count multiple event occurrences at
> once, and get_mem_cgroup_from_folio() is added because we need to get a
> reference to the memcg of a folio before it's migrated to track
> numa_pages_migrated. The accounting of PGDEMOTE_* is moved to
> shrink_inactive_list() before being changed to per-cgroup.
>
> Signed-off-by: Kaiyang Zhao <kaiyang2@xxxxxxxxxx>
> ---
>  include/linux/memcontrol.h | 24 +++++++++++++++++++++---
>  include/linux/vmstat.h     |  1 +
>  mm/memcontrol.c            | 33 +++++++++++++++++++++++++++++++++
>  mm/memory.c                |  3 +++
>  mm/mempolicy.c             |  4 +++-
>  mm/migrate.c               |  3 +++
>  mm/vmscan.c                |  8 ++++----
>  7 files changed, 68 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 44f7fb7dc0c8..90ecd2dbca06 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -768,6 +768,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
>
>  struct mem_cgroup *get_mem_cgroup_from_current(void);
>
> +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);
> +
>  struct lruvec *folio_lruvec_lock(struct folio *folio);
>  struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
>  struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
> @@ -1012,8 +1014,8 @@ static inline void count_memcg_folio_events(struct folio *folio,
>                 count_memcg_events(memcg, idx, nr);
>  }
>
> -static inline void count_memcg_event_mm(struct mm_struct *mm,
> -                                       enum vm_event_item idx)
> +static inline void count_memcg_events_mm(struct mm_struct *mm,
> +                                       enum vm_event_item idx, unsigned long count)
>  {
>         struct mem_cgroup *memcg;
>
> @@ -1023,10 +1025,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
>         rcu_read_lock();
>         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
>         if (likely(memcg))
> -               count_memcg_events(memcg, idx, 1);
> +               count_memcg_events(memcg, idx, count);
>         rcu_read_unlock();
>  }
>
> +static inline void count_memcg_event_mm(struct mm_struct *mm,
> +                                       enum vm_event_item idx)
> +{
> +       count_memcg_events_mm(mm, idx, 1);
> +}
> +
>  static inline void memcg_memory_event(struct mem_cgroup *memcg,
>                                       enum memcg_memory_event event)
>  {
> @@ -1246,6 +1254,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
>         return NULL;
>  }
>
> +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> +{
> +       return NULL;
> +}
> +
>  static inline
>  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
>  {
> @@ -1468,6 +1481,11 @@ static inline void count_memcg_folio_events(struct folio *folio,
>  {
>  }
>
> +static inline void count_memcg_events_mm(struct mm_struct *mm,
> +                                       enum vm_event_item idx, unsigned long count)
> +{
> +}
> +
>  static inline
>  void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
>  {
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index 596c050ed492..ff0b49f76ca4 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -32,6 +32,7 @@ struct reclaim_stat {
>         unsigned nr_ref_keep;
>         unsigned nr_unmap_fail;
>         unsigned nr_lazyfree_fail;
> +       unsigned nr_demoted;
>  };
>
>  /* Stat data for system wide items */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e1ffd2950393..fe7d057bbb67 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -307,6 +307,9 @@ static const unsigned int memcg_node_stat_items[] = {
>  #ifdef CONFIG_SWAP
>         NR_SWAPCACHE,
>  #endif
> +       PGDEMOTE_KSWAPD,
> +       PGDEMOTE_DIRECT,
> +       PGDEMOTE_KHUGEPAGED,
>  };
>
>  static const unsigned int memcg_stat_items[] = {
> @@ -437,6 +440,11 @@ static const unsigned int memcg_vm_event_stat[] = {
>         THP_SWPOUT,
>         THP_SWPOUT_FALLBACK,
>  #endif
> +#ifdef CONFIG_NUMA_BALANCING
> +       NUMA_PAGE_MIGRATE,
> +       NUMA_PTE_UPDATES,
> +       NUMA_HINT_FAULTS,
> +#endif
>  };
>
>  #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
> @@ -978,6 +986,24 @@ struct mem_cgroup *get_mem_cgroup_from_current(void)
>         return memcg;
>  }
>
> +/**
> + * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
> + * @folio: folio from which memcg should be extracted.
> + */
> +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> +{
> +       struct mem_cgroup *memcg = folio_memcg(folio);
> +
> +       if (mem_cgroup_disabled())
> +               return NULL;
> +
> +       rcu_read_lock();
> +       if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
> +               memcg = root_mem_cgroup;
> +       rcu_read_unlock();
> +       return memcg;
> +}
> +
>  /**
>   * mem_cgroup_iter - iterate over memory cgroup hierarchy
>   * @root: hierarchy root
> @@ -1383,6 +1409,10 @@ static const struct memory_stat memory_stats[] = {
>         { "workingset_restore_anon",    WORKINGSET_RESTORE_ANON         },
>         { "workingset_restore_file",    WORKINGSET_RESTORE_FILE         },
>         { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
> +
> +       { "pgdemote_kswapd",            PGDEMOTE_KSWAPD         },
> +       { "pgdemote_direct",            PGDEMOTE_DIRECT         },
> +       { "pgdemote_khugepaged",        PGDEMOTE_KHUGEPAGED     },
>  };
>
>  /* The actual unit of the state item, not the same as the output unit */
> @@ -1416,6 +1446,9 @@ static int memcg_page_state_output_unit(int item)
>         case WORKINGSET_RESTORE_ANON:
>         case WORKINGSET_RESTORE_FILE:
>         case WORKINGSET_NODERECLAIM:
> +       case PGDEMOTE_KSWAPD:
> +       case PGDEMOTE_DIRECT:
> +       case PGDEMOTE_KHUGEPAGED:
>                 return 1;
>         default:
>                 return memcg_page_state_unit(item);
> diff --git a/mm/memory.c b/mm/memory.c
> index d6af095d255b..7b6a3619fcce 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5373,6 +5373,9 @@ int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
>         vma_set_access_pid_bit(vma);
>
>         count_vm_numa_event(NUMA_HINT_FAULTS);
> +#ifdef CONFIG_NUMA_BALANCING
> +       count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
> +#endif
>         if (page_nid == numa_node_id()) {
>                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
>                 *flags |= TNF_FAULT_LOCAL;
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index b3b5f376471f..b646fab3e45e 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -676,8 +676,10 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
>         tlb_gather_mmu(&tlb, vma->vm_mm);
>
>         nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
> -       if (nr_updated > 0)
> +       if (nr_updated > 0) {
>                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
> +               count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
> +       }
>
>         tlb_finish_mmu(&tlb);
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 66a5f73ebfdf..7e1267042a56 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2614,6 +2614,7 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
>         int nr_remaining;
>         unsigned int nr_succeeded;
>         LIST_HEAD(migratepages);
> +       struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
>
>         list_add(&folio->lru, &migratepages);
>         nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
> @@ -2623,12 +2624,14 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
>                 putback_movable_pages(&migratepages);
>         if (nr_succeeded) {
>                 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
> +               count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
>                 if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
>                     && !node_is_toptier(folio_nid(folio))
>                     && node_is_toptier(node))
>                         mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
>                                             nr_succeeded);

Given that the motivating use case is for memory tiering, can we add
PGPROMOTE_SUCCESS to per-memcg stat here as well?

>         }
> +       mem_cgroup_put(memcg);
>         BUG_ON(!list_empty(&migratepages));
>         return nr_remaining ? -EAGAIN : 0;
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 25e43bb3b574..fd66789a413b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1008,9 +1008,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>                       (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>                       &nr_succeeded);
>
> -       mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
> -                           nr_succeeded);
> -
>         return nr_succeeded;
>  }
>
> @@ -1518,7 +1515,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>         /* 'folio_list' is always empty here */
>
>         /* Migrate folios selected for demotion */
> -       nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
> +       stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
> +       nr_reclaimed += stat->nr_demoted;
>         /* Folios that could not be demoted are still in @demote_folios */
>         if (!list_empty(&demote_folios)) {
>                 /* Folios which weren't demoted go back on @folio_list */
> @@ -1984,6 +1982,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>         spin_lock_irq(&lruvec->lru_lock);
>         move_folios_to_lru(lruvec, &folio_list);
>
> +       __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
> +                                       stat.nr_demoted);
>         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
>         item = PGSTEAL_KSWAPD + reclaimer_offset();
>         if (!cgroup_reclaim(sc))
> --
> 2.43.0
>
>





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux