Re: [PATCH v3 2/5] zswap: make shrinking memcg-aware

Yosry Ahmed <yosryahmed@xxxxxxxxxx> · Thu, 19 Oct 2023 09:14:40 -0700

[..]
> > >
> > > +/*********************************
> > > +* lru functions
> > > +**********************************/
> > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > > +{
> > > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> >
> > Could we avoid the need for get/put with an rcu_read_lock() instead?
>
> I think we can, I'm not entirely sure of the consequences though. By the
> look of it I'd say it's safe but I wouldn't trust my judgement on this.

It just seems like we have a pattern of short-lived get/put. If RCU
gives enough protection it should be simpler. IIUC taking a reference
does not protect against offlining or reparenting, so I am not sure if
taking a reference here would provide any more protection than

>
> >
[..]
> > > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > >         zswap_entry_put(tree, entry);
> > >  unlock:
> > >         spin_unlock(&tree->lock);
> > > -       return ret ? -EAGAIN : 0;
> > > +       spin_lock(lock);
> > > +       return ret;
> > > +}
> > > +
> > > +static int shrink_memcg(struct mem_cgroup *memcg)
> > > +{
> > > +       struct zswap_pool *pool;
> > > +       int nid, shrunk = 0;
> > > +
> > > +       pool = zswap_pool_current_get();
> > > +       if (!pool)
> > > +               return -EINVAL;
> > > +
> > > +       /*
> > > +        * Skip zombies because their LRUs are reparented and we would be
> > > +        * reclaiming from the parent instead of the dead memcgroup.
> >
> > nit: s/memcgroup/memcg.
> >
> > > +        */
> > > +       if (memcg && !mem_cgroup_online(memcg))
> > > +               goto out;
> >
> > If we move this above zswap_pool_current_get(), we can return directly
> > and remove the label. I noticed we will return -EAGAIN if memcg is
> > offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> > but I am wondering if a different errno would be clearer here.
>
> True, I remember spending some time staring at error codes but couldn't find a
> better one. What if we use -EINVAL for retryable errors, and use something else
> for the one where there is no pool? -ENODEV?

Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more
appropriate as a return for offline memcgs?

>
> >
[..]
> > >  static void shrink_worker(struct work_struct *w)
> > > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> > >                                                 shrink_work);
> > >         int ret, failures = 0;
> > >
> > > +       /* global reclaim will select cgroup in a round-robin fashion. */
> > >         do {
> > > -               ret = zswap_reclaim_entry(pool);
> > > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> >
> > Perhaps next_shrink_memcg is a better name here?
>
> Will change if you have a strong preference, I'd keep it shorter because it's
> always used in conjunction with a memcg type or function.

I'd rather have the more explicit name unless it causes some annoying
line breaks or so.

>
> >
> > > +
> > > +               ret = shrink_memcg(pool->next_shrink);
> > > +
> > >                 if (ret) {
> > > -                       zswap_reject_reclaim_fail++;
> > >                         if (ret != -EAGAIN)
> > >                                 break;
> > >                         if (++failures == MAX_RECLAIM_RETRIES)
> > > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > >          */
> > >         kref_init(&pool->kref);
> > >         INIT_LIST_HEAD(&pool->list);
> > > -       INIT_LIST_HEAD(&pool->lru);
> > > -       spin_lock_init(&pool->lru_lock);
> > > +       list_lru_init_memcg(&pool->list_lru, NULL);
> > >         INIT_WORK(&pool->shrink_work, shrink_worker);
> > >
> > >         zswap_pool_debug("created", pool);
> > > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> > >
> > >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> > >         free_percpu(pool->acomp_ctx);
> > > +       list_lru_destroy(&pool->list_lru);
> > > +       if (pool->next_shrink)
> > > +               mem_cgroup_put(pool->next_shrink);
> > >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > >                 zpool_destroy_pool(pool->zpools[i]);
> > >         kfree(pool);
> > > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >
> > >         /* try to allocate swap cache page */
> > >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > > -                                      &page_was_allocated);
> > > +                                      &page_was_allocated, true);
> > >         if (!page) {
> > >                 ret = -ENOMEM;
> > >                 goto fail;
> > > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >         /* start writeback */
> > >         __swap_writepage(page, &wbc);
> > >         put_page(page);
> > > -       zswap_written_back_pages++;
> > >
> > >         return ret;
> > >
> > > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> > >         struct scatterlist input, output;
> > >         struct crypto_acomp_ctx *acomp_ctx;
> > >         struct obj_cgroup *objcg = NULL;
> > > +       struct mem_cgroup *memcg = NULL;
> > >         struct zswap_pool *pool;
> > >         struct zpool *zpool;
> > > +       int lru_alloc_ret;
> > >         unsigned int dlen = PAGE_SIZE;
> > >         unsigned long handle, value;
> > >         char *buf;
> > > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, dupentry);
> > >         }
> > >         spin_unlock(&tree->lock);
> > > -
> > > -       /*
> > > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > > -        * local cgroup limits.
> > > -        */
> > >         objcg = get_obj_cgroup_from_folio(folio);
> > > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > > -               goto reject;
> > > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > +               if (shrink_memcg(memcg)) {
> > > +                       mem_cgroup_put(memcg);
> > > +                       goto reject;
> > > +               }
> > > +               mem_cgroup_put(memcg);
> > > +       }
> > >
> > >         /* reclaim space if needed */
> > >         if (zswap_is_full()) {
> > > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> > >                         zswap_pool_reached_full = false;
> > >         }
> > >
> > > +       pool = zswap_pool_current_get();
> > > +       if (!pool)
> > > +               goto reject;
> > > +
> >
> > Why do we need to move zswap_pool_current_get() up here?
>
> Ah, thanks. This is a leftover from a previous version where the pool was needed
> to allocate the entry.
>
>
> >
> > >         /* allocate entry */
> > > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > >         if (!entry) {
> > >                 zswap_reject_kmemcache_fail++;
> > > +               zswap_pool_put(pool);
> > >                 goto reject;
> > >         }
> > >
> > > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> > >                         entry->length = 0;
> > >                         entry->value = value;
> > >                         atomic_inc(&zswap_same_filled_pages);
> > > +                       zswap_pool_put(pool);
> > >                         goto insert_entry;
> > >                 }
> > >                 kunmap_atomic(src);
> > > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> > >                 goto freepage;
> > >
> > >         /* if entry is successfully added, it keeps the reference */
> > > -       entry->pool = zswap_pool_current_get();
> > > -       if (!entry->pool)
> > > -               goto freepage;
> > > +       entry->pool = pool;
> > > +       if (objcg) {
> > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > > +               mem_cgroup_put(memcg);
> > > +
> > > +               if (lru_alloc_ret)
> > > +                       goto freepage;
> > > +       }
> > >
> > >         /* compress */
> > >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, dupentry);
> > >         }
> > >         if (entry->length) {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_add(&entry->lru, &entry->pool->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               INIT_LIST_HEAD(&entry->lru);
> > > +               zswap_lru_add(&pool->list_lru, entry);
> > >         }
> > >         spin_unlock(&tree->lock);
> > >
> > > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> > >
> > >  put_dstmem:
> > >         mutex_unlock(acomp_ctx->mutex);
> > > -       zswap_pool_put(entry->pool);
> > >  freepage:
> > > +       zswap_pool_put(entry->pool);
> > >         zswap_entry_cache_free(entry);
> > >  reject:
> > >         if (objcg)
> > > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, entry);
> > >                 folio_mark_dirty(folio);
> > >         } else if (entry->length) {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_move(&entry->lru, &entry->pool->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > > +               zswap_lru_add(&entry->pool->list_lru, entry);
> > >         }
> > >         zswap_entry_put(tree, entry);
> > >         spin_unlock(&tree->lock);
> > > --
> > > 2.34.1