+David Rientjes The attached test reproduces the problem on a cgroup v2 hierarchy mounted with memory_recursiveprot, and fails without this patch. On Tue, Nov 22, 2022 at 3:27 PM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote: > > During reclaim, mem_cgroup_calculate_protection() is used to determine > the effective protection (emin and elow) values of a memcg. The > protection of the reclaim target is ignored, but we cannot set their > effective protection to 0 due to a limitation of the current > implementation (see comment in mem_cgroup_protection()). Instead, > we leave their effective protection values unchanged, and later ignore it > in mem_cgroup_protection(). > > However, mem_cgroup_protection() is called later in > shrink_lruvec()->get_scan_count(), which is after the > mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a > result, the stale effective protection values of the target memcg may > lead us to skip reclaiming from the target memcg entirely, before > calling shrink_lruvec(). This can be even worse with recursive > protection, where the stale target memcg protection can be higher than > its standalone protection. > > An example where this can happen is as follows. Consider the following > hierarchy with memory_recursiveprot: > ROOT > | > A (memory.min = 50M) > | > B (memory.min = 10M, memory.high = 40M) > > Consider the following scenario: > - B has memory.current = 35M. > - The system undergoes global reclaim (target memcg is NULL). > - B will have an effective min of 50M (all of A's unclaimed protection). > - B will not be reclaimed from. > - Now allocate 10M more memory in B, pushing it above its high limit. > - The system undergoes memcg reclaim from B (target memcg is B) > - In shrink_node_memcgs(), we call mem_cgroup_calculate_protection(), > which immediately returns for B without doing anything, as B is the > target memcg, relying on mem_cgroup_protection() to ignore B's stale > effective min (still 50M).
> - Directly after mem_cgroup_calculate_protection(), we will call > mem_cgroup_below_min(), which will read the stale effective min for B > and skip it (instead of ignoring its protection as intended). In this > case, it's really bad because we are not just considering B's > standalone protection (10M), but we are reading a much higher stale > protection (50M) which will cause us to not reclaim from B at all. > > This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple > e{low,min} state mutations from protection checks") which made > mem_cgroup_calculate_protection() only change the state without > returning any value. Before that commit, we used to return > MEMCG_PROT_NONE for the target memcg, which would cause us to skip the > mem_cgroup_below_{min/low}() checks. After that commit we do not return > anything and we end up checking the min & low effective protections for > the target memcg, which are stale. > > Add mem_cgroup_ignore_protection() that checks if we are reclaiming from > the target memcg, and call it in mem_cgroup_below_{min/low}() to ignore > the stale protection of the target memcg. 
> > Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") > Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx> > --- > include/linux/memcontrol.h | 33 +++++++++++++++++++++++++++------ > mm/vmscan.c | 11 ++++++----- > 2 files changed, 33 insertions(+), 11 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index e1644a24009c..22c9c9f9c6b1 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -625,18 +625,32 @@ static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg) > > } > > -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) > +static inline bool mem_cgroup_ignore_protection(struct mem_cgroup *target, > + struct mem_cgroup *memcg) > { > - if (!mem_cgroup_supports_protection(memcg)) > + /* > + * The target memcg's protection is ignored, see > + * mem_cgroup_calculate_protection() and mem_cgroup_protection() > + */ > + return target == memcg; > +} > + > +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, > + struct mem_cgroup *memcg) > +{ > + if (!mem_cgroup_supports_protection(memcg) || > + mem_cgroup_ignore_protection(target, memcg)) > return false; > > return READ_ONCE(memcg->memory.elow) >= > page_counter_read(&memcg->memory); > } > > -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) > +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, > + struct mem_cgroup *memcg) > { > - if (!mem_cgroup_supports_protection(memcg)) > + if (!mem_cgroup_supports_protection(memcg) || > + mem_cgroup_ignore_protection(target, memcg)) > return false; > > return READ_ONCE(memcg->memory.emin) >= > @@ -1209,12 +1223,19 @@ static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, > { > } > > -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) > +static inline bool mem_cgroup_ignore_protection(struct mem_cgroup *target, > + struct mem_cgroup *memcg) > +{ > + return 
false; > +} > +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, > + struct mem_cgroup *memcg) > { > return false; > } > > -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) > +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, > + struct mem_cgroup *memcg) > { > return false; > } > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 04d8b88e5216..79ef0fe67518 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -4486,7 +4486,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned > > mem_cgroup_calculate_protection(NULL, memcg); > > - if (mem_cgroup_below_min(memcg)) > + if (mem_cgroup_below_min(NULL, memcg)) > return false; > > need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); > @@ -5047,8 +5047,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * > DEFINE_MAX_SEQ(lruvec); > DEFINE_MIN_SEQ(lruvec); > > - if (mem_cgroup_below_min(memcg) || > - (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) > + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || > + (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && > + !sc->memcg_low_reclaim)) > return 0; > > *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); > @@ -6048,13 +6049,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) > > mem_cgroup_calculate_protection(target_memcg, memcg); > > - if (mem_cgroup_below_min(memcg)) { > + if (mem_cgroup_below_min(target_memcg, memcg)) { > /* > * Hard protection. > * If there is no reclaimable memory, OOM. > */ > continue; > - } else if (mem_cgroup_below_low(memcg)) { > + } else if (mem_cgroup_below_low(target_memcg, memcg)) { > /* > * Soft protection. > * Respect the protection only as long as > -- > 2.38.1.584.g0f3c55d4c2-goog >
#!/bin/bash
#
# Test that memory.high is still enforced on a cgroup whose effective
# memory.min was inflated by recursive protection during an earlier global
# reclaim pass.
#
# Hierarchy used by both test cases:
#   ROOT
#    |
#    A (memory.min = 50M)
#    |
#    B (memory.min = 10M, memory.high = 40M)
#
# The cgroup v2 hierarchy must be mounted with:
#   mount -t cgroup2 -o memory_recursiveprot none $ROOT
# or remounted with:
#   mount -o remount,memory_recursiveprot $ROOT
#
# Environment overrides:
#   ROOT      cgroup v2 mount point (default: /sys/fs/cgroup)
#   TEST_DIR  directory for the swapfile, tmpfs mount and fifo (default: .)

# Default root path
: "${ROOT:="/sys/fs/cgroup"}"
A="$ROOT/A"
B="$A/B"

# Default to current directory
: "${TEST_DIR:=.}"

# Paths of the resources created by setup(); empty means "not created", which
# lets cleanup() skip them.
SWAPFILE=
TMPFS=
PIPE=

# Create the swapfile, tmpfs mount, fifo and the A/B cgroup hierarchy.
# Enables errexit so that any failing setup step aborts the test.
setup() {
	set -e

	# Setup 100M swapfile
	SWAPFILE="${TEST_DIR}/test_swapfile"
	dd if=/dev/zero of="$SWAPFILE" bs=1M count=100 status=none
	chmod 600 "$SWAPFILE"
	mkswap "$SWAPFILE" > /dev/null
	swapon "$SWAPFILE"

	# Setup tmpfs
	TMPFS="${TEST_DIR}/test_tmpfs"
	mkdir "$TMPFS"
	mount -t tmpfs tmpfs "$TMPFS"

	# Setup pipe
	PIPE="${TEST_DIR}/test_pipe"
	mkfifo "$PIPE"

	# Setup root
	enable_memcg_subtree "$ROOT"

	# Setup A
	mkdir "$A"
	enable_memcg_subtree "$A"
	echo 50M > "$A/memory.min"

	# Setup B
	mkdir "$B"
	echo 10M > "$B/memory.min"
	echo 40M > "$B/memory.high"
}

# Tear down everything setup() created. Runs between test cases and again
# from the EXIT trap, so every step tolerates the resource being absent.
cleanup() {
	local procs

	set +e

	# Kill any procs in B first, then remove it.
	if [[ -d $B ]]; then
		procs=$(cat "$B/cgroup.procs")
		if [[ -n $procs ]]; then
			# $procs is intentionally unquoted: one PID per word.
			kill -KILL $procs
			wait $procs 2>/dev/null
		fi
		# A cgroup can only be removed once it has no processes left.
		while [[ -n $(cat "$B/cgroup.procs") ]]; do
			sleep 0.1
		done
		rmdir "$B"
	fi

	# Remove A
	if [[ -d $A ]]; then
		rmdir "$A"
	fi

	# Cleanup everything else
	if [[ -n $TMPFS ]]; then
		umount "$TMPFS"
		rmdir "$TMPFS"
	fi
	if [[ -n $SWAPFILE ]]; then
		swapoff "$SWAPFILE"
		rm "$SWAPFILE"
	fi
	if [[ -n $PIPE ]]; then
		rm "$PIPE"
	fi

	# Forget the removed resources so a repeated cleanup is a no-op.
	SWAPFILE=
	TMPFS=
	PIPE=
}

# Report a failure and exit with status 1 (the EXIT trap runs cleanup).
fail() {
	echo "[FAIL] $*"
	exit 1
}

# Report a passing check.
pass() {
	echo "[PASS] $*"
}

# Move the calling (sub)shell into cgroup $1, append $2 MiB of zeroes to a
# tmpfs file, signal completion on the fifo, then sleep so the process stays
# in the cgroup. Meant to be run in a background subshell.
allocate_tmpfs() {
	# Writing 0 to cgroup.procs migrates the writing process itself.
	echo 0 > "$1/cgroup.procs"
	dd if=/dev/zero bs=1M count="$2" >> "$TMPFS/file" status=none
	echo "tmpfs" > "$PIPE"
	sleep 1000
}

# Block until allocate_tmpfs reports completion on the fifo; fail the test on
# unexpected input or when nothing arrives within the read timeout.
wait_for_pipe() {
	local line

	if read -r -t 10 line < "$PIPE"; then
		if [[ $line == "tmpfs" ]]; then
			return
		else
			fail "wrong input ($line) received on pipe, expected (tmpfs)"
		fi
	fi
	fail "pipe timeout"
}

# Print memory.current (bytes) of cgroup $1.
memcg_usage() {
	cat "$1/memory.current"
}

# Print memory.current of cgroup $1 in whole MiB (rounded down).
memcg_usage_mb() {
	echo $(( $(memcg_usage "$1") / (1024 * 1024) ))
}

# Enable the memory controller for the children of cgroup $1.
enable_memcg_subtree() {
	echo "+memory" > "$1/cgroup.subtree_control"
}

trap cleanup EXIT

echo "Reference case"
# A: memory.min = 50M
# A/B: memory.min = 10M, memory.high = 40M
# Allocate 35 M in B, nothing happens
# Allocate 10 M more in B, memory.high pushes us back below 40 M
setup

(allocate_tmpfs "$B" 35)&
wait_for_pipe
usage_mb=$(memcg_usage_mb "$B")
echo "B's usage after initial allocation: $usage_mb M"

(allocate_tmpfs "$B" 10)&
wait_for_pipe
usage_mb=$(memcg_usage_mb "$B")
echo "B's usage after allocation beyond high: $usage_mb M"

if [[ $usage_mb -le 40 ]]; then
	pass "memory.high enforced effectively"
else
	fail "memory.high not enforced"
fi

cleanup

echo "Corner case"
# Same setup as above
# Allocate 35 M in B, nothing happens
# Invoke global reclaim, all of B's memory should be protected by A's memory.min
# Allocate 10 M more in B, memory.high should push us back below 40 M
setup

(allocate_tmpfs "$B" 35)&
wait_for_pipe
usage_mb_before=$(memcg_usage_mb "$B")
echo "B's usage after initial allocation: $usage_mb_before M"

# Simulate global reclaim. The write is allowed to fail (errexit is
# suspended): only its effect on B's usage matters here.
set +e
echo 10M > "$ROOT/memory.reclaim"
set -e

usage_mb_after=$(memcg_usage_mb "$B")
echo "B's usage after global reclaim: $usage_mb_after M"

# B should be fully protected from global reclaim by A's min, so its usage
# must match the reading taken before the reclaim.
if [[ $usage_mb_after -ne $usage_mb_before ]]; then
	fail "A's memory.min did not protect B from global reclaim"
else
	pass "A's memory.min protected B from global reclaim"
fi

(allocate_tmpfs "$B" 10)&
wait_for_pipe

# B's memory.high should be enforced and it should remain below 40M
usage_mb=$(memcg_usage_mb "$B")
echo "B's usage after allocation beyond high: $usage_mb M"

if [[ $usage_mb -le 40 ]]; then
	pass "memory.high enforced effectively"
else
	fail "memory.high not enforced"
fi