Re: [PATCH 1/1] mm: add swappiness= arg to memory.reclaim

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




在 2023/11/30 23:36, Dan Schatzberg 写道:
[You don't often get email from schatzberg.dan@xxxxxxxxx. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification]

Allow proactive reclaimers to submit an additional swappiness=<val>
argument to memory.reclaim. This overrides the global or per-memcg
swappiness setting for that reclaim attempt.

For example:

echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim

will perform reclaim on the rootcg with a swappiness setting of 0 (no
swap) regardless of the vm.swappiness sysctl setting.

Signed-off-by: Dan Schatzberg <schatzberg.dan@xxxxxxxxx>
---
 include/linux/swap.h |  3 ++-
 mm/memcontrol.c      | 55 +++++++++++++++++++++++++++++++++++---------
 mm/vmscan.c          | 13 +++++++++--
 3 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6dd6575b905..c6e309199f10 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -410,7 +410,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
-                                                 unsigned int reclaim_options);
+                                                 unsigned int reclaim_options,
+                                                 int *swappiness);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                gfp_t gfp_mask, bool noswap,
                                                pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c1061df9cd1..ba1c89455ab0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/parser.h>
 #include <linux/sched/isolation.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -2449,7 +2450,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
                psi_memstall_enter(&pflags);
                nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
                                                        gfp_mask,
-                                                       MEMCG_RECLAIM_MAY_SWAP);
+                                                       MEMCG_RECLAIM_MAY_SWAP, NULL);
                psi_memstall_leave(&pflags);
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
@@ -2740,7 +2741,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,

        psi_memstall_enter(&pflags);
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-                                                   gfp_mask, reclaim_options);
+                                                   gfp_mask, reclaim_options, NULL);
        psi_memstall_leave(&pflags);

        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3660,7 +3661,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
                }

                if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-                                       memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+                                       memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
                        ret = -EBUSY;
                        break;
                }
@@ -3774,7 +3775,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
                        return -EINTR;

                if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-                                                 MEMCG_RECLAIM_MAY_SWAP))
+                                                 MEMCG_RECLAIM_MAY_SWAP, NULL))
                        nr_retries--;
        }

@@ -6720,7 +6721,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                }

                reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);

                if (!reclaimed && !nr_retries--)
                        break;
@@ -6769,7 +6770,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,

                if (nr_reclaims) {
                        if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
                                nr_reclaims--;
                        continue;
                }
@@ -6895,6 +6896,16 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
        return nbytes;
 }

+enum {
+       MEMORY_RECLAIM_SWAPPINESS = 0,
+       MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t if_tokens = {
+       { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+       { MEMORY_RECLAIM_NULL, NULL },
+};
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
 {
@@ -6902,12 +6913,33 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
        unsigned int nr_retries = MAX_RECLAIM_RETRIES;
        unsigned long nr_to_reclaim, nr_reclaimed = 0;
        unsigned int reclaim_options;
-       int err;
+       char *old_buf, *start;
+       substring_t args[MAX_OPT_ARGS];
+       int swappiness = -1;

        buf = strstrip(buf);
-       err = page_counter_memparse(buf, "", &nr_to_reclaim);
-       if (err)
-               return err;
+
+       old_buf = buf;
+       nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+       if (buf == old_buf)
+               return -EINVAL;
+
+       buf = strstrip(buf);
+
+       while ((start = strsep(&buf, " ")) != NULL) {
+               if (!strlen(start))
+                       continue;
+               switch (match_token(start, if_tokens, args)) {
+               case MEMORY_RECLAIM_SWAPPINESS:
+                       if (match_int(&args[0], &swappiness))
+                               return -EINVAL;
+                       if (swappiness < 0 || swappiness > 200)
+                               return -EINVAL;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       }

        reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
        while (nr_reclaimed < nr_to_reclaim) {
@@ -6926,7 +6958,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,

                reclaimed = try_to_free_mem_cgroup_pages(memcg,
                                        min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
-                                       GFP_KERNEL, reclaim_options);
+                                       GFP_KERNEL, reclaim_options,
+                                       swappiness == -1 ? NULL : &swappiness);

                if (!reclaimed && !nr_retries--)
                        return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 506f8220c5fe..546704ea01e1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -136,6 +136,9 @@ struct scan_control {
        /* Always discard instead of demoting to lower tier memory */
        unsigned int no_demotion:1;

+       /* Swappiness value for reclaim, if NULL use memcg/global value */
+       int *swappiness;
+
        /* Allocation order */
        s8 order;

@@ -2327,7 +2330,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
-       int swappiness = mem_cgroup_swappiness(memcg);
+       int swappiness = sc->swappiness ?
+               *sc->swappiness : mem_cgroup_swappiness(memcg);
Should we wrap this check in unlikely() to indicate that a non-NULL sc->swappiness is the uncommon case?
Currently the only use case is proactive reclaim.
        u64 fraction[ANON_AND_FILE];
        u64 denominator = 0;    /* gcc */
        enum scan_balance scan_balance;
@@ -2608,6 +2612,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
            mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
                return 0;

+       if (sc->swappiness)
+               return *sc->swappiness;
The same applies here (the sc->swappiness check could use unlikely()).
+
        return mem_cgroup_swappiness(memcg);
 }

@@ -6433,7 +6440,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           unsigned long nr_pages,
                                           gfp_t gfp_mask,
-                                          unsigned int reclaim_options)
+                                          unsigned int reclaim_options,
+                                          int *swappiness)
 {
        unsigned long nr_reclaimed;
        unsigned int noreclaim_flag;
@@ -6448,6 +6456,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                .may_unmap = 1,
                .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
                .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+               .swappiness = swappiness,
        };
        /*
         * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
--
2.34.1
My previous patch attempted to ensure fully deterministic semantics under extreme swappiness values.
For example, when swappiness is set to 200, only anonymous pages should be reclaimed.
However, in MGLRU, isolate_folios() will try to scan anon first and, if nothing was scanned, will then try the
other type. (We do not want it to fall back like this.)
What do you think about handling extreme swappiness scenarios?

    

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux