Re: [PATCH] mm: vmscan: split khugepaged stats from direct reclaim stats

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 25, 2022 at 10:05 AM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> Direct reclaim stats are useful for identifying a potential source for
> application latency, as well as spotting issues with kswapd. However,
> khugepaged currently distorts the picture: as a kernel thread it
> doesn't impose allocation latencies on userspace, and it explicitly
> opts out of kswapd reclaim. Its activity showing up in the direct
> reclaim stats is misleading. Counting it as kswapd reclaim could also
> cause confusion when trying to understand actual kswapd behavior.
>
> Break out khugepaged from the direct reclaim counters into new
> pgsteal_khugepaged, pgdemote_khugepaged, pgscan_khugepaged counters.
>
> Test with a huge executable (CONFIG_READ_ONLY_THP_FOR_FS):
>
> pgsteal_kswapd 1342185
> pgsteal_direct 0
> pgsteal_khugepaged 3623
> pgscan_kswapd 1345025
> pgscan_direct 0
> pgscan_khugepaged 3623

Other kernel threads and work items may also allocate memory and
trigger memory reclaim, so they may have similar problems, and someone
may later want to add yet another stat for them. So how about we make
the stats more general, for example by calling them
"pg{steal|scan}_kthread"?

>
> Reported-by: Eric Bergen <ebergen@xxxxxxxx>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  Documentation/admin-guide/cgroup-v2.rst |  6 +++++
>  include/linux/khugepaged.h              |  6 +++++
>  include/linux/vm_event_item.h           |  3 +++
>  mm/khugepaged.c                         |  5 +++++
>  mm/memcontrol.c                         |  8 +++++--
>  mm/vmscan.c                             | 30 ++++++++++++++++++-------
>  mm/vmstat.c                             |  3 +++
>  7 files changed, 51 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index dc254a3cb956..74cec76be9f2 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1488,12 +1488,18 @@ PAGE_SIZE multiple when read back.
>           pgscan_direct (npn)
>                 Amount of scanned pages directly  (in an inactive LRU list)
>
> +         pgscan_khugepaged (npn)
> +               Amount of scanned pages by khugepaged  (in an inactive LRU list)
> +
>           pgsteal_kswapd (npn)
>                 Amount of reclaimed pages by kswapd
>
>           pgsteal_direct (npn)
>                 Amount of reclaimed pages directly
>
> +         pgsteal_khugepaged (npn)
> +               Amount of reclaimed pages by khugepaged
> +
>           pgfault (npn)
>                 Total number of page faults incurred
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index 70162d707caf..f68865e19b0b 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -15,6 +15,7 @@ extern void __khugepaged_exit(struct mm_struct *mm);
>  extern void khugepaged_enter_vma(struct vm_area_struct *vma,
>                                  unsigned long vm_flags);
>  extern void khugepaged_min_free_kbytes_update(void);
> +extern bool current_is_khugepaged(void);
>  #ifdef CONFIG_SHMEM
>  extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>                                    bool install_pmd);
> @@ -57,6 +58,11 @@ static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
>  static inline void khugepaged_min_free_kbytes_update(void)
>  {
>  }
> +
> +static inline bool current_is_khugepaged(void)
> +{
> +       return false;
> +}
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
>  #endif /* _LINUX_KHUGEPAGED_H */
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 3518dba1e02f..7f5d1caf5890 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -40,10 +40,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>                 PGREUSE,
>                 PGSTEAL_KSWAPD,
>                 PGSTEAL_DIRECT,
> +               PGSTEAL_KHUGEPAGED,
>                 PGDEMOTE_KSWAPD,
>                 PGDEMOTE_DIRECT,
> +               PGDEMOTE_KHUGEPAGED,
>                 PGSCAN_KSWAPD,
>                 PGSCAN_DIRECT,
> +               PGSCAN_KHUGEPAGED,
>                 PGSCAN_DIRECT_THROTTLE,
>                 PGSCAN_ANON,
>                 PGSCAN_FILE,
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4734315f7940..36318ebbf50d 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2528,6 +2528,11 @@ void khugepaged_min_free_kbytes_update(void)
>         mutex_unlock(&khugepaged_mutex);
>  }
>
> +bool current_is_khugepaged(void)
> +{
> +       return kthread_func(current) == khugepaged;
> +}
> +
>  static int madvise_collapse_errno(enum scan_result r)
>  {
>         /*
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2d8549ae1b30..a17a5cfa6a55 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -661,8 +661,10 @@ static const unsigned int memcg_vm_event_stat[] = {
>         PGPGOUT,
>         PGSCAN_KSWAPD,
>         PGSCAN_DIRECT,
> +       PGSCAN_KHUGEPAGED,
>         PGSTEAL_KSWAPD,
>         PGSTEAL_DIRECT,
> +       PGSTEAL_KHUGEPAGED,
>         PGFAULT,
>         PGMAJFAULT,
>         PGREFILL,
> @@ -1574,10 +1576,12 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
>         /* Accumulated memory events */
>         seq_buf_printf(&s, "pgscan %lu\n",
>                        memcg_events(memcg, PGSCAN_KSWAPD) +
> -                      memcg_events(memcg, PGSCAN_DIRECT));
> +                      memcg_events(memcg, PGSCAN_DIRECT) +
> +                      memcg_events(memcg, PGSCAN_KHUGEPAGED));
>         seq_buf_printf(&s, "pgsteal %lu\n",
>                        memcg_events(memcg, PGSTEAL_KSWAPD) +
> -                      memcg_events(memcg, PGSTEAL_DIRECT));
> +                      memcg_events(memcg, PGSTEAL_DIRECT) +
> +                      memcg_events(memcg, PGSTEAL_KHUGEPAGED));
>
>         for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
>                 if (memcg_vm_event_stat[i] == PGPGIN ||
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 04d8b88e5216..8ceae125bbf7 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -54,6 +54,7 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/ctype.h>
>  #include <linux/debugfs.h>
> +#include <linux/khugepaged.h>
>
>  #include <asm/tlbflush.h>
>  #include <asm/div64.h>
> @@ -1047,6 +1048,22 @@ void drop_slab(void)
>                 drop_slab_node(nid);
>  }
>
> +static int reclaimer_offset(void)
> +{
> +       BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1);
> +       BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 2);
> +       BUILD_BUG_ON(PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD != 1);
> +       BUILD_BUG_ON(PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD != 2);
> +       BUILD_BUG_ON(PGSCAN_DIRECT - PGSCAN_KSWAPD != 1);
> +       BUILD_BUG_ON(PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD != 2);
> +
> +       if (current_is_kswapd())
> +               return 0;
> +       if (current_is_khugepaged())
> +               return 2;
> +       return 1;
> +}
> +
>  static inline int is_page_cache_freeable(struct folio *folio)
>  {
>         /*
> @@ -1599,10 +1616,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>                       (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>                       &nr_succeeded);
>
> -       if (current_is_kswapd())
> -               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
> -       else
> -               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
> +       __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
>
>         return nr_succeeded;
>  }
> @@ -2475,7 +2489,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>                                      &nr_scanned, sc, lru);
>
>         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
> -       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
> +       item = PGSCAN_KSWAPD + reclaimer_offset();
>         if (!cgroup_reclaim(sc))
>                 __count_vm_events(item, nr_scanned);
>         __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
> @@ -2492,7 +2506,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>         move_folios_to_lru(lruvec, &folio_list);
>
>         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
> -       item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
> +       item = PGSTEAL_KSWAPD + reclaimer_offset();
>         if (!cgroup_reclaim(sc))
>                 __count_vm_events(item, nr_reclaimed);
>         __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
> @@ -4857,7 +4871,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>                         break;
>         }
>
> -       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
> +       item = PGSCAN_KSWAPD + reclaimer_offset();
>         if (!cgroup_reclaim(sc)) {
>                 __count_vm_events(item, isolated);
>                 __count_vm_events(PGREFILL, sorted);
> @@ -5015,7 +5029,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>         if (walk && walk->batched)
>                 reset_batch_size(lruvec, walk);
>
> -       item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
> +       item = PGSTEAL_KSWAPD + reclaimer_offset();
>         if (!cgroup_reclaim(sc))
>                 __count_vm_events(item, reclaimed);
>         __count_memcg_events(memcg, item, reclaimed);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index b2371d745e00..1ea6a5ce1c41 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1271,10 +1271,13 @@ const char * const vmstat_text[] = {
>         "pgreuse",
>         "pgsteal_kswapd",
>         "pgsteal_direct",
> +       "pgsteal_khugepaged",
>         "pgdemote_kswapd",
>         "pgdemote_direct",
> +       "pgdemote_khugepaged",
>         "pgscan_kswapd",
>         "pgscan_direct",
> +       "pgscan_khugepaged",
>         "pgscan_direct_throttle",
>         "pgscan_anon",
>         "pgscan_file",
> --
> 2.38.1
>
>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux