On Fri, Dec 20, 2024 at 10:31 PM Kanchana P Sridhar
<kanchana.p.sridhar@xxxxxxxxx> wrote:
>
> This patch introduces zswap_store_folio() that implements all the computes
> done earlier in zswap_store_page() for a single-page, for all the pages in
> a folio. This allows us to move the loop over the folio's pages from
> zswap_store() to zswap_store_folio().
>
> A distinct zswap_compress_folio() is also added, that simply calls
> zswap_compress() for each page in the folio it is called with.

The git diff looks funky; it may make things clearer to introduce
zswap_compress_folio() in a separate patch.

>
> zswap_store_folio() starts by allocating all zswap entries required to
> store the folio. Next, it calls zswap_compress_folio() and finally, adds
> the entries to the xarray and LRU.
>
> The error handling and cleanup required for all failure scenarios that can
> occur while storing a folio in zswap is now consolidated to a
> "store_folio_failed" label in zswap_store_folio().
>
> These changes facilitate developing support for compress batching in
> zswap_store_folio().
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx>
> ---
>  mm/zswap.c | 183 +++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 116 insertions(+), 67 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 99cd78891fd0..1be0f1807bfc 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1467,77 +1467,129 @@ static void shrink_worker(struct work_struct *w)
>   * main API
>   **********************************/
>
> -static ssize_t zswap_store_page(struct page *page,
> -                                struct obj_cgroup *objcg,
> -                                struct zswap_pool *pool)
> +static bool zswap_compress_folio(struct folio *folio,
> +                                 struct zswap_entry *entries[],
> +                                 struct zswap_pool *pool)
>  {
> -       swp_entry_t page_swpentry = page_swap_entry(page);
> -       struct zswap_entry *entry, *old;
> +       long index, nr_pages = folio_nr_pages(folio);
>
> -       /* allocate entry */
> -       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> -       if (!entry) {
> -               zswap_reject_kmemcache_fail++;
> -               return -EINVAL;
> +       for (index = 0; index < nr_pages; ++index) {
> +               struct page *page = folio_page(folio, index);
> +
> +               if (!zswap_compress(page, entries[index], pool))
> +                       return false;
>         }
>
> -       if (!zswap_compress(page, entry, pool))
> -               goto compress_failed;
> +       return true;
> +}
>
> -       old = xa_store(swap_zswap_tree(page_swpentry),
> -                      swp_offset(page_swpentry),
> -                      entry, GFP_KERNEL);
> -       if (xa_is_err(old)) {
> -               int err = xa_err(old);
> +/*
> + * Store all pages in a folio.
> + *
> + * The error handling from all failure points is consolidated to the
> + * "store_folio_failed" label, based on the initialization of the zswap entries'
> + * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
> + * entry's handle is subsequently modified only upon a successful zpool_malloc()
> + * after the page is compressed.
> + */
> +static ssize_t zswap_store_folio(struct folio *folio,
> +                                 struct obj_cgroup *objcg,
> +                                 struct zswap_pool *pool)
> +{
> +       long index, nr_pages = folio_nr_pages(folio);
> +       struct zswap_entry **entries = NULL;
> +       int node_id = folio_nid(folio);
> +       size_t compressed_bytes = 0;
>
> -               WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> -               zswap_reject_alloc_fail++;
> -               goto store_failed;
> +       entries = kmalloc(nr_pages * sizeof(*entries), GFP_KERNEL);

We can probably use kcalloc() here.
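i.e. something like this (untested); the zeroing isn't strictly needed,
but we get the overflow check on the multiplication for free. If the
zeroing is unwanted, kmalloc_array() gives the same check:

        entries = kcalloc(nr_pages, sizeof(*entries), GFP_KERNEL);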
> +       if (!entries)
> +               return -ENOMEM;
> +
> +       /* allocate entries */

This comment can be dropped.

> +       for (index = 0; index < nr_pages; ++index) {
> +               entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
> +
> +               if (!entries[index]) {
> +                       zswap_reject_kmemcache_fail++;
> +                       nr_pages = index;
> +                       goto store_folio_failed;
> +               }
> +
> +               entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
>         }
>
> -       /*
> -        * We may have had an existing entry that became stale when
> -        * the folio was redirtied and now the new version is being
> -        * swapped out. Get rid of the old.
> -        */
> -       if (old)
> -               zswap_entry_free(old);
> +       if (!zswap_compress_folio(folio, entries, pool))
> +               goto store_folio_failed;
>
> -       /*
> -        * The entry is successfully compressed and stored in the tree, there is
> -        * no further possibility of failure. Grab refs to the pool and objcg.
> -        * These refs will be dropped by zswap_entry_free() when the entry is
> -        * removed from the tree.
> -        */
> -       zswap_pool_get(pool);
> -       if (objcg)
> -               obj_cgroup_get(objcg);
> +       for (index = 0; index < nr_pages; ++index) {
> +               swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
> +               struct zswap_entry *old, *entry = entries[index];
> +
> +               old = xa_store(swap_zswap_tree(page_swpentry),
> +                              swp_offset(page_swpentry),
> +                              entry, GFP_KERNEL);
> +               if (xa_is_err(old)) {
> +                       int err = xa_err(old);
> +
> +                       WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> +                       zswap_reject_alloc_fail++;
> +                       goto store_folio_failed;
> +               }
>
> -       /*
> -        * We finish initializing the entry while it's already in xarray.
> -        * This is safe because:
> -        *
> -        * 1. Concurrent stores and invalidations are excluded by folio lock.
> -        *
> -        * 2. Writeback is excluded by the entry not being on the LRU yet.
> -        *    The publishing order matters to prevent writeback from seeing
> -        *    an incoherent entry.
> -        */
> -       entry->pool = pool;
> -       entry->swpentry = page_swpentry;
> -       entry->objcg = objcg;
> -       entry->referenced = true;
> -       if (entry->length) {
> -               INIT_LIST_HEAD(&entry->lru);
> -               zswap_lru_add(&zswap_list_lru, entry);
> +               /*
> +                * We may have had an existing entry that became stale when
> +                * the folio was redirtied and now the new version is being
> +                * swapped out. Get rid of the old.
> +                */
> +               if (old)
> +                       zswap_entry_free(old);
> +
> +               /*
> +                * The entry is successfully compressed and stored in the tree, there is
> +                * no further possibility of failure. Grab refs to the pool and objcg.
> +                * These refs will be dropped by zswap_entry_free() when the entry is
> +                * removed from the tree.
> +                */
> +               zswap_pool_get(pool);
> +               if (objcg)
> +                       obj_cgroup_get(objcg);
> +
> +               /*
> +                * We finish initializing the entry while it's already in xarray.
> +                * This is safe because:
> +                *
> +                * 1. Concurrent stores and invalidations are excluded by folio lock.
> +                *
> +                * 2. Writeback is excluded by the entry not being on the LRU yet.
> +                *    The publishing order matters to prevent writeback from seeing
> +                *    an incoherent entry.
> +                */
> +               entry->pool = pool;
> +               entry->swpentry = page_swpentry;
> +               entry->objcg = objcg;
> +               entry->referenced = true;
> +               if (entry->length) {
> +                       INIT_LIST_HEAD(&entry->lru);
> +                       zswap_lru_add(&zswap_list_lru, entry);
> +               }
> +
> +               compressed_bytes += entry->length;
>         }
>
> -       return entry->length;
> +       kfree(entries);
> +
> +       return compressed_bytes;
> +
> +store_folio_failed:
> +       for (index = 0; index < nr_pages; ++index) {
> +               if (!IS_ERR_VALUE(entries[index]->handle))
> +                       zpool_free(pool->zpool, entries[index]->handle);
> +
> +               zswap_entry_cache_free(entries[index]);
> +       }

If there is a failure in xa_store() halfway through the entries, this
loop will free all the compressed objects and entries. But some of the
entries are already in the xarray, and zswap_store() will try to free
them again. This seems like a bug, or did I miss something here?
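If it is a bug, one way out might be to remember how far the xa_store()
loop got and only tear down the entries that were never published, since
zswap_store()'s failure path already erases and frees whatever it finds
in the tree. A rough sketch (untested; store_fail_idx would be a new
local initialized to 0, and set to the failing index just before the
goto in the xa_store() loop):

store_folio_failed:
        /*
         * Sketch only: entries in [0, store_fail_idx) were already
         * published to the xarray, grabbed pool/objcg refs, and will
         * be erased and freed by zswap_store()'s error path, so skip
         * them here and only clean up the unpublished entries.
         */
        for (index = store_fail_idx; index < nr_pages; ++index) {
                if (!IS_ERR_VALUE(entries[index]->handle))
                        zpool_free(pool->zpool, entries[index]->handle);

                zswap_entry_cache_free(entries[index]);
        }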
> +
> +       kfree(entries);
>
> -store_failed:
> -       zpool_free(pool->zpool, entry->handle);
> -compress_failed:
> -       zswap_entry_cache_free(entry);
>         return -EINVAL;
>  }
>
> @@ -1549,8 +1601,8 @@ bool zswap_store(struct folio *folio)
>         struct mem_cgroup *memcg = NULL;
>         struct zswap_pool *pool;
>         size_t compressed_bytes = 0;
> +       ssize_t bytes;
>         bool ret = false;
> -       long index;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>         VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1584,15 +1636,11 @@ bool zswap_store(struct folio *folio)
>                 mem_cgroup_put(memcg);
>         }
>
> -       for (index = 0; index < nr_pages; ++index) {
> -               struct page *page = folio_page(folio, index);
> -               ssize_t bytes;
> +       bytes = zswap_store_folio(folio, objcg, pool);
> +       if (bytes < 0)
> +               goto put_pool;
>
> -               bytes = zswap_store_page(page, objcg, pool);
> -               if (bytes < 0)
> -                       goto put_pool;
> -               compressed_bytes += bytes;
> -       }
> +       compressed_bytes = bytes;

What's the point of having both compressed_bytes and bytes now?

>
>         if (objcg) {
>                 obj_cgroup_charge_zswap(objcg, compressed_bytes);
> @@ -1622,6 +1670,7 @@ bool zswap_store(struct folio *folio)
>                 pgoff_t offset = swp_offset(swp);
>                 struct zswap_entry *entry;
>                 struct xarray *tree;
> +               long index;
>
>                 for (index = 0; index < nr_pages; ++index) {
>                         tree = swap_zswap_tree(swp_entry(type, offset + index));
> --
> 2.27.0
>
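To elaborate on the compressed_bytes/bytes point above: since
zswap_store_folio() now returns the accumulated total itself, one of the
two locals can go away, e.g. keeping only the signed one (untested):

        bytes = zswap_store_folio(folio, objcg, pool);
        if (bytes < 0)
                goto put_pool;

        if (objcg) {
                obj_cgroup_charge_zswap(objcg, bytes);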