This patch fixes an issue where the zswap global shrinker stopped iterating through the memcg tree. The problem was that `shrink_worker()` would stop iterating when a memcg was being offlined and restart from the tree root. Now, it properly handles the offlining memcg and continues shrinking with the next memcg. Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware") Signed-off-by: Takero Funaki <flintglass@xxxxxxxxx> --- mm/zswap.c | 76 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a50e2986cd2f..0b1052cee36c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -775,12 +775,27 @@ void zswap_folio_swapin(struct folio *folio) } } +/* + * This function should be called when a memcg is being offlined. + * + * Since the global shrinker shrink_worker() may hold a reference + * of the memcg, we must check and release the reference in + * zswap_next_shrink. + * + * shrink_worker() must handle the case where this function releases + * the reference of memcg being shrunk. + */ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ spin_lock(&zswap_shrink_lock); - if (zswap_next_shrink == memcg) - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + + if (READ_ONCE(zswap_next_shrink) == memcg) { + /* put back reference and advance the cursor */ + memcg = mem_cgroup_iter(NULL, memcg, NULL); + WRITE_ONCE(zswap_next_shrink, memcg); + } + spin_unlock(&zswap_shrink_lock); } @@ -1312,25 +1327,38 @@ static int shrink_memcg(struct mem_cgroup *memcg) static void shrink_worker(struct work_struct *w) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *next_memcg; int ret, failures = 0; unsigned long thr; /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); - /* global reclaim will select cgroup in a round-robin fashion. */ + /* global reclaim will select cgroup in a round-robin fashion. + * + * We save iteration cursor memcg into zswap_next_shrink, + * which can be modified by the offline memcg cleaner + * zswap_memcg_offline_cleanup(). + */ do { spin_lock(&zswap_shrink_lock); - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - memcg = zswap_next_shrink; + next_memcg = READ_ONCE(zswap_next_shrink); + + if (memcg != next_memcg) { + /* + * Ours was released by offlining. + * Use the saved memcg reference. + */ + memcg = next_memcg; + } else { +iternext: + /* advance cursor */ + memcg = mem_cgroup_iter(NULL, memcg, NULL); + WRITE_ONCE(zswap_next_shrink, memcg); + } /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. - * * Note that if we got an online memcg, we will keep the extra * reference in case the original reference obtained by mem_cgroup_iter * is dropped by the zswap memcg offlining callback, ensuring that the @@ -1345,16 +1373,18 @@ static void shrink_worker(struct work_struct *w) } if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - zswap_next_shrink = NULL; - spin_unlock(&zswap_shrink_lock); - - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; + /* + * It is an offline memcg which we cannot shrink + * until its pages are reparented. + * Put back the memcg reference before cleanup + * function reads it from zswap_next_shrink. + */ + goto iternext; } + /* + * We got an extra memcg reference before unlocking. + * The cleaner cannot free it using zswap_next_shrink. + */ spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); @@ -1368,6 +1398,12 @@ static void shrink_worker(struct work_struct *w) resched: cond_resched(); } while (zswap_total_pages() > thr); + + /* + * We can still hold the original memcg reference. + * The reference is stored in zswap_next_shrink, and then reused + * by the next shrink_worker(). + */ } /********************************* -- 2.43.0