On Mon, Dec 18, 2023 at 12:22 AM Chengming Zhou
<zhouchengming@xxxxxxxxxxxxx> wrote:
>
> Since the introduction of dstmem reuse in the load path, it has become
> confusing that acomp_ctx->dstmem and acomp_ctx->mutex are now used for
> purposes other than what their names suggest.
>
> Yosry suggested removing these two fields from acomp_ctx, directly
> using zswap_dstmem and zswap_mutex in both the load and store paths,
> renaming them, and adding proper comments above their definitions to
> note that they are generic percpu buffers for the load and store paths.
>
> So this patch removes dstmem and mutex from acomp_ctx, renames
> zswap_dstmem to zswap_buffer, and uses the percpu mutex and buffer on
> the load and store paths. It also refactors out __zswap_store() to
> cover only the compress and store steps, since zswap_store() has grown
> too long.

To be honest, I am not sure refactoring out __zswap_store() is useful,
but I am not objecting to it; it mirrors __zswap_load() in a sense.

However, if you want to do so, please do it in a separate patch from
renaming the percpu buffers and mutex. This will make reviewing easier
(and make my Suggested-by correctly scoped).

Also, any reason why raw_smp_processor_id() is used here instead of
smp_processor_id()? (More on this below the quoted patch.)

>
> Suggested-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
> ---
>  mm/zswap.c | 193 +++++++++++++++++++++++++++++++++----------------------------
>  1 file changed, 104 insertions(+), 89 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 2c349fd88904..b7449294ec3a 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -166,8 +166,6 @@ struct crypto_acomp_ctx {
>  	struct crypto_acomp *acomp;
>  	struct acomp_req *req;
>  	struct crypto_wait wait;
> -	u8 *dstmem;
> -	struct mutex *mutex;
>  };
>
>  /*
> @@ -694,7 +692,7 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool)
>  /*********************************
>  * per-cpu code
>  **********************************/
> -static DEFINE_PER_CPU(u8 *, zswap_dstmem);
> +static DEFINE_PER_CPU(u8 *, zswap_buffer);
>  /*
>   * If users dynamically change the zpool type and compressor at runtime, i.e.
>   * zswap is running, zswap can have more than one zpool on one cpu, but they
> @@ -702,39 +700,39 @@ static DEFINE_PER_CPU(u8 *, zswap_dstmem);
>   */
>  static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
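The commit message promises "proper comments above their definitions"
saying these are generic percpu buffers for the load and store paths,
but unless I missed it elsewhere in the diff, no such comment is added
here. Maybe something like this (the wording is only a suggestion):

+/*
+ * The generic percpu buffer (one page per CPU) and the mutex
+ * protecting it, shared by the load and store paths to hold
+ * compressed/decompressed page data.
+ */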
>
> -static int zswap_dstmem_prepare(unsigned int cpu)
> +static int zswap_buffer_prepare(unsigned int cpu)
>  {
>  	struct mutex *mutex;
> -	u8 *dst;
> +	u8 *buf;
>
> -	dst = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> -	if (!dst)
> +	buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> +	if (!buf)
>  		return -ENOMEM;
>
>  	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
>  	if (!mutex) {
> -		kfree(dst);
> +		kfree(buf);
>  		return -ENOMEM;
>  	}
>
>  	mutex_init(mutex);
> -	per_cpu(zswap_dstmem, cpu) = dst;
> +	per_cpu(zswap_buffer, cpu) = buf;
>  	per_cpu(zswap_mutex, cpu) = mutex;
>  	return 0;
>  }
>
> -static int zswap_dstmem_dead(unsigned int cpu)
> +static int zswap_buffer_dead(unsigned int cpu)
>  {
>  	struct mutex *mutex;
> -	u8 *dst;
> +	u8 *buf;
>
>  	mutex = per_cpu(zswap_mutex, cpu);
>  	kfree(mutex);
>  	per_cpu(zswap_mutex, cpu) = NULL;
>
> -	dst = per_cpu(zswap_dstmem, cpu);
> -	kfree(dst);
> -	per_cpu(zswap_dstmem, cpu) = NULL;
> +	buf = per_cpu(zswap_buffer, cpu);
> +	kfree(buf);
> +	per_cpu(zswap_buffer, cpu) = NULL;
>
>  	return 0;
>  }
> @@ -772,9 +770,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>  	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
>  				   crypto_req_done, &acomp_ctx->wait);
>
> -	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
> -	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
> -
>  	return 0;
>  }
>
> @@ -1392,20 +1387,98 @@ static int zswap_enabled_param_set(const char *val,
>  	return ret;
>  }
>
> +static int __zswap_store(struct zswap_entry *entry, struct page *page)
> +{
> +	struct scatterlist input, output;
> +	struct crypto_acomp_ctx *acomp_ctx;
> +	struct zpool *zpool;
> +	unsigned long handle;
> +	unsigned int dlen;
> +	u8 *buf, *dst;
> +	gfp_t gfp;
> +	int ret;
> +	int cpu;
> +	struct mutex *mutex;
> +
> +	cpu = raw_smp_processor_id();
> +	mutex = per_cpu(zswap_mutex, cpu);
> +	mutex_lock(mutex);
> +
> +	acomp_ctx = per_cpu_ptr(entry->pool->acomp_ctx, cpu);
> +	buf = per_cpu(zswap_buffer, cpu);
> +
> +	sg_init_table(&input, 1);
> +	sg_set_page(&input, page, PAGE_SIZE, 0);
> +	sg_init_one(&output, buf, PAGE_SIZE);
> +	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, PAGE_SIZE);
> +	/*
> +	 * It may look a little bit silly that we send an asynchronous request,
> +	 * then wait for its completion synchronously. This makes the process
> +	 * synchronous in fact.
> +	 * Theoretically, acomp supports users sending multiple acomp requests
> +	 * to one acomp instance, then getting those requests done
> +	 * simultaneously. But in this case, zswap actually does store and load
> +	 * page by page; there is no existing method to send the second page
> +	 * before the first page is done in one thread doing zswap.
> +	 * But in different threads running on different cpus, we have
> +	 * different acomp instances, so multiple threads can do
> +	 * (de)compression in parallel.
> +	 */
> +	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> +	dlen = acomp_ctx->req->dlen;
> +
> +	if (ret) {
> +		zswap_reject_compress_fail++;
> +		goto unlock;
> +	}
> +
> +	/* store */
> +	zpool = zswap_find_zpool(entry);
> +	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> +	if (zpool_malloc_support_movable(zpool))
> +		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> +	ret = zpool_malloc(zpool, dlen, gfp, &handle);
> +	if (ret == -ENOSPC) {
> +		zswap_reject_compress_poor++;
> +		goto unlock;
> +	}
> +	if (ret) {
> +		zswap_reject_alloc_fail++;
> +		goto unlock;
> +	}
> +	dst = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> +	memcpy(dst, buf, dlen);
> +	zpool_unmap_handle(zpool, handle);
> +	mutex_unlock(mutex);
> +
> +	entry->handle = handle;
> +	entry->length = dlen;
> +	return 0;
> +
> +unlock:
> +	mutex_unlock(mutex);
> +	return ret;
> +}
> +
>  static void __zswap_load(struct zswap_entry *entry, struct page *page)
>  {
>  	struct zpool *zpool = zswap_find_zpool(entry);
>  	struct scatterlist input, output;
>  	struct crypto_acomp_ctx *acomp_ctx;
> -	u8 *src;
> +	u8 *src, *buf;
> +	int cpu;
> +	struct mutex *mutex;
> +
> +	cpu = raw_smp_processor_id();
> +	mutex = per_cpu(zswap_mutex, cpu);
> +	mutex_lock(mutex);
>
> -	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> -	mutex_lock(acomp_ctx->mutex);
> +	acomp_ctx = per_cpu_ptr(entry->pool->acomp_ctx, cpu);
>
>  	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>  	if (!zpool_can_sleep_mapped(zpool)) {
> -		memcpy(acomp_ctx->dstmem, src, entry->length);
> -		src = acomp_ctx->dstmem;
> +		buf = per_cpu(zswap_buffer, cpu);
> +		memcpy(buf, src, entry->length);
> +		src = buf;
>  		zpool_unmap_handle(zpool, entry->handle);
>  	}
>
> @@ -1415,7 +1488,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
>  	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
>  	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
>  	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> -	mutex_unlock(acomp_ctx->mutex);
> +	mutex_unlock(mutex);
>
>  	if (zpool_can_sleep_mapped(zpool))
>  		zpool_unmap_handle(zpool, entry->handle);
> @@ -1539,18 +1612,11 @@ bool zswap_store(struct folio *folio)
>  	struct page *page = &folio->page;
>  	struct zswap_tree *tree = zswap_trees[type];
>  	struct zswap_entry *entry, *dupentry;
> -	struct scatterlist input, output;
> -	struct crypto_acomp_ctx *acomp_ctx;
>  	struct obj_cgroup *objcg = NULL;
>  	struct mem_cgroup *memcg = NULL;
>  	struct zswap_pool *pool;
> -	struct zpool *zpool;
> -	unsigned int dlen = PAGE_SIZE;
> -	unsigned long handle, value;
> -	char *buf;
> -	u8 *src, *dst;
> -	gfp_t gfp;
> -	int ret;
> +	u8 *src;
> +	unsigned long value;
>
>  	VM_WARN_ON_ONCE(!folio_test_locked(folio));
>  	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1635,60 +1701,11 @@ bool zswap_store(struct folio *folio)
>  		mem_cgroup_put(memcg);
>  	}
>
> -	/* compress */
> -	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> -
> -	mutex_lock(acomp_ctx->mutex);
> -
> -	dst = acomp_ctx->dstmem;
> -	sg_init_table(&input, 1);
> -	sg_set_page(&input, page, PAGE_SIZE, 0);
> -
> -	sg_init_one(&output, dst, PAGE_SIZE);
> -	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
> -	/*
> -	 * it maybe looks a little bit silly that we send an asynchronous request,
> -	 * then wait for its completion synchronously. This makes the process look
> -	 * synchronous in fact.
> -	 * Theoretically, acomp supports users send multiple acomp requests in one
> -	 * acomp instance, then get those requests done simultaneously. but in this
> -	 * case, zswap actually does store and load page by page, there is no
> -	 * existing method to send the second page before the first page is done
> -	 * in one thread doing zwap.
> -	 * but in different threads running on different cpu, we have different
> -	 * acomp instance, so multiple threads can do (de)compression in parallel.
> -	 */
> -	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> -	dlen = acomp_ctx->req->dlen;
> -
> -	if (ret) {
> -		zswap_reject_compress_fail++;
> -		goto put_dstmem;
> -	}
> -
> -	/* store */
> -	zpool = zswap_find_zpool(entry);
> -	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> -	if (zpool_malloc_support_movable(zpool))
> -		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> -	ret = zpool_malloc(zpool, dlen, gfp, &handle);
> -	if (ret == -ENOSPC) {
> -		zswap_reject_compress_poor++;
> -		goto put_dstmem;
> -	}
> -	if (ret) {
> -		zswap_reject_alloc_fail++;
> -		goto put_dstmem;
> -	}
> -	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> -	memcpy(buf, dst, dlen);
> -	zpool_unmap_handle(zpool, handle);
> -	mutex_unlock(acomp_ctx->mutex);
> +	if (__zswap_store(entry, page))
> +		goto put_pool;
>
>  	/* populate entry */
>  	entry->swpentry = swp_entry(type, offset);
> -	entry->handle = handle;
> -	entry->length = dlen;
>
>  insert_entry:
>  	entry->objcg = objcg;
> @@ -1725,8 +1742,6 @@ bool zswap_store(struct folio *folio)
>
>  	return true;
>
> -put_dstmem:
> -	mutex_unlock(acomp_ctx->mutex);
>  put_pool:
>  	zswap_pool_put(entry->pool);
>  freepage:
> @@ -1902,10 +1917,10 @@ static int zswap_setup(void)
>  	}
>
>  	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
> -				zswap_dstmem_prepare, zswap_dstmem_dead);
> +				zswap_buffer_prepare, zswap_buffer_dead);
>  	if (ret) {
> -		pr_err("dstmem alloc failed\n");
> -		goto dstmem_fail;
> +		pr_err("buffer alloc failed\n");
> +		goto buffer_fail;
>  	}
>
>  	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
> @@ -1940,7 +1955,7 @@ static int zswap_setup(void)
>  	zswap_pool_destroy(pool);
>  hp_fail:
>  	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
> -dstmem_fail:
> +buffer_fail:
>  	kmem_cache_destroy(zswap_entry_cache);
>  cache_fail:
>  	/* if built-in, we aren't unloaded on failure; don't allow use */
>
> --
> b4 0.10.1
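On the raw_smp_processor_id() question above, some context for why I
am asking: the only difference I am aware of is that smp_processor_id()
carries a CONFIG_DEBUG_PREEMPT check that warns when it is called from
preemptible context, which this code is (mutex_lock() can sleep), while
raw_smp_processor_id() skips that check. If the reasoning is that
migration right after reading the CPU id is harmless, because whichever
buffer we picked stays exclusively ours through that CPU's mutex, then
that deserves a comment. A minimal sketch of what I mean, assuming a
helper in mm/zswap.c next to the percpu definitions
(zswap_lock_buffer() is a hypothetical name, not something in this
patch):

	/*
	 * Pick a CPU's buffer and lock it. Even if the task migrates
	 * right after raw_smp_processor_id(), it simply keeps using the
	 * buffer it picked, which it owns exclusively through that CPU's
	 * mutex, so correctness does not depend on staying on the same
	 * CPU. raw_smp_processor_id() is used only to avoid the
	 * CONFIG_DEBUG_PREEMPT warning; smp_processor_id() would require
	 * disabling preemption, which the sleeping mutex_lock() forbids.
	 */
	static int zswap_lock_buffer(u8 **bufp, struct mutex **mutexp)
	{
		int cpu = raw_smp_processor_id();

		*mutexp = per_cpu(zswap_mutex, cpu);
		mutex_lock(*mutexp);
		*bufp = per_cpu(zswap_buffer, cpu);
		return cpu;
	}

Returning the cpu id keeps per_cpu_ptr(entry->pool->acomp_ctx, cpu)
working in the callers, and the open-coded id/mutex/buffer sequence in
both __zswap_store() and __zswap_load() collapses into one call. This
is just a sketch either way; even a one-line comment at the
raw_smp_processor_id() call sites would answer the question in the
code itself.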