RE: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.

Hi Johannes,

> -----Original Message-----
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
> Sent: Thursday, November 7, 2024 9:21 AM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@xxxxxxxxx>
> Cc: linux-kernel@xxxxxxxxxxxxxxx; linux-mm@xxxxxxxxx;
> yosryahmed@xxxxxxxxxx; nphamcs@xxxxxxxxx;
> chengming.zhou@xxxxxxxxx; usamaarif642@xxxxxxxxx;
> ryan.roberts@xxxxxxx; Huang, Ying <ying.huang@xxxxxxxxx>;
> 21cnbao@xxxxxxxxx; akpm@xxxxxxxxxxxxxxxxxxxx; linux-
> crypto@xxxxxxxxxxxxxxx; herbert@xxxxxxxxxxxxxxxxxxx;
> davem@xxxxxxxxxxxxx; clabbe@xxxxxxxxxxxx; ardb@xxxxxxxxxx;
> ebiggers@xxxxxxxxxx; surenb@xxxxxxxxxx; Accardi, Kristen C
> <kristen.c.accardi@xxxxxxxxx>; zanussi@xxxxxxxxxx; Feghali, Wajdi K
> <wajdi.k.feghali@xxxxxxxxx>; Gopal, Vinodh <vinodh.gopal@xxxxxxxxx>
> Subject: Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx
> to be configurable in nr of acomp_reqs.
> 
> On Wed, Nov 06, 2024 at 11:21:01AM -0800, Kanchana P Sridhar wrote:
> > Modified the definition of "struct crypto_acomp_ctx" to represent a
> > configurable number of acomp_reqs and the required number of buffers.
> >
> > Accordingly, refactored the code that allocates/deallocates the acomp_ctx
> > resources, so that it can be called to create a regular acomp_ctx with
> > exactly one acomp_req/buffer, for use in the existing non-batching
> > zswap_store(), as well as to create a separate "batching acomp_ctx" with
> > multiple acomp_reqs/buffers for IAA compress batching.
> >
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx>
> > ---
> >  mm/zswap.c | 149 ++++++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 107 insertions(+), 42 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 3e899fa61445..02e031122fdf 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -143,9 +143,10 @@ bool zswap_never_enabled(void)
> >
> >  struct crypto_acomp_ctx {
> >  	struct crypto_acomp *acomp;
> > -	struct acomp_req *req;
> > +	struct acomp_req **reqs;
> > +	u8 **buffers;
> > +	unsigned int nr_reqs;
> >  	struct crypto_wait wait;
> > -	u8 *buffer;
> >  	struct mutex mutex;
> >  	bool is_sleepable;
> >  };
> > @@ -241,6 +242,11 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
> >  	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
> >  		 zpool_get_type((p)->zpool))
> >
> > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > +				  struct crypto_acomp_ctx *acomp_ctx,
> > +				  char *tfm_name,
> > +				  unsigned int nr_reqs);
> 
> This looks unnecessary.

Thanks for the code review comments. I will make sure to avoid the
forward declarations.

> 
> > +
> >  /*********************************
> >  * pool functions
> >  **********************************/
> > @@ -813,69 +819,128 @@ static void zswap_entry_free(struct zswap_entry *entry)
> >  /*********************************
> >  * compressed storage functions
> >  **********************************/
> > -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > +				  struct crypto_acomp_ctx *acomp_ctx,
> > +				  char *tfm_name,
> > +				  unsigned int nr_reqs)
> >  {
> > -	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> > -	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> >  	struct crypto_acomp *acomp;
> > -	struct acomp_req *req;
> > -	int ret;
> > +	int ret = -ENOMEM;
> > +	int i, j;
> >
> > +	acomp_ctx->nr_reqs = 0;
> >  	mutex_init(&acomp_ctx->mutex);
> >
> > -	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> > -	if (!acomp_ctx->buffer)
> > -		return -ENOMEM;
> > -
> > -	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> > +	acomp = crypto_alloc_acomp_node(tfm_name, 0, 0, cpu_to_node(cpu));
> >  	if (IS_ERR(acomp)) {
> >  		pr_err("could not alloc crypto acomp %s : %ld\n",
> > -				pool->tfm_name, PTR_ERR(acomp));
> > -		ret = PTR_ERR(acomp);
> > -		goto acomp_fail;
> > +				tfm_name, PTR_ERR(acomp));
> > +		return PTR_ERR(acomp);
> >  	}
> > +
> >  	acomp_ctx->acomp = acomp;
> >  	acomp_ctx->is_sleepable = acomp_is_async(acomp);
> >
> > -	req = acomp_request_alloc(acomp_ctx->acomp);
> > -	if (!req) {
> > -		pr_err("could not alloc crypto acomp_request %s\n",
> > -		       pool->tfm_name);
> > -		ret = -ENOMEM;
> > +	acomp_ctx->buffers = kmalloc_node(nr_reqs * sizeof(u8 *),
> > +					  GFP_KERNEL, cpu_to_node(cpu));
> > +	if (!acomp_ctx->buffers)
> > +		goto buf_fail;
> > +
> > +	for (i = 0; i < nr_reqs; ++i) {
> > +		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2,
> > +						     GFP_KERNEL, cpu_to_node(cpu));
> > +		if (!acomp_ctx->buffers[i]) {
> > +			for (j = 0; j < i; ++j)
> > +				kfree(acomp_ctx->buffers[j]);
> > +			kfree(acomp_ctx->buffers);
> > +			ret = -ENOMEM;
> > +			goto buf_fail;
> > +		}
> > +	}
> > +
> > +	acomp_ctx->reqs = kmalloc_node(nr_reqs * sizeof(struct acomp_req *),
> > +				       GFP_KERNEL, cpu_to_node(cpu));
> > +	if (!acomp_ctx->reqs)
> >  		goto req_fail;
> > +
> > +	for (i = 0; i < nr_reqs; ++i) {
> > +		acomp_ctx->reqs[i] = acomp_request_alloc(acomp_ctx->acomp);
> > +		if (!acomp_ctx->reqs[i]) {
> > +			pr_err("could not alloc crypto acomp_request reqs[%d] %s\n",
> > +			       i, tfm_name);
> > +			for (j = 0; j < i; ++j)
> > +				acomp_request_free(acomp_ctx->reqs[j]);
> > +			kfree(acomp_ctx->reqs);
> > +			ret = -ENOMEM;
> > +			goto req_fail;
> > +		}
> >  	}
> > -	acomp_ctx->req = req;
> >
> > +	/*
> > +	 * The crypto_wait is used only in fully synchronous, i.e., with scomp
> > +	 * or non-poll mode of acomp, hence there is only one "wait" per
> > +	 * acomp_ctx, with callback set to reqs[0], under the assumption that
> > +	 * there is at least 1 request per acomp_ctx.
> > +	 */
> >  	crypto_init_wait(&acomp_ctx->wait);
> >  	/*
> >  	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
> >  	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
> >  	 * won't be called, crypto_wait_req() will return without blocking.
> >  	 */
> > -	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> > +	acomp_request_set_callback(acomp_ctx->reqs[0], CRYPTO_TFM_REQ_MAY_BACKLOG,
> >  				   crypto_req_done, &acomp_ctx->wait);
> >
> > +	acomp_ctx->nr_reqs = nr_reqs;
> >  	return 0;
> >
> >  req_fail:
> > +	for (i = 0; i < nr_reqs; ++i)
> > +		kfree(acomp_ctx->buffers[i]);
> > +	kfree(acomp_ctx->buffers);
> > +buf_fail:
> >  	crypto_free_acomp(acomp_ctx->acomp);
> > -acomp_fail:
> > -	kfree(acomp_ctx->buffer);
> >  	return ret;
> >  }
> >
> > -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
> > +static void zswap_delete_acomp_ctx(struct crypto_acomp_ctx *acomp_ctx)
> >  {
> > -	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> > -	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > -
> >  	if (!IS_ERR_OR_NULL(acomp_ctx)) {
> > -		if (!IS_ERR_OR_NULL(acomp_ctx->req))
> > -			acomp_request_free(acomp_ctx->req);
> > +		int i;
> > +
> > +		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > +			if (!IS_ERR_OR_NULL(acomp_ctx->reqs[i]))
> > +				acomp_request_free(acomp_ctx->reqs[i]);
> > +		kfree(acomp_ctx->reqs);
> > +
> > +		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > +			kfree(acomp_ctx->buffers[i]);
> > +		kfree(acomp_ctx->buffers);
> > +
> >  		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >  			crypto_free_acomp(acomp_ctx->acomp);
> > -		kfree(acomp_ctx->buffer);
> > +
> > +		acomp_ctx->nr_reqs = 0;
> > +		acomp_ctx = NULL;
> >  	}
> > +}
> > +
> > +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> > +{
> > +	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> > +	struct crypto_acomp_ctx *acomp_ctx;
> > +
> > +	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > +	return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name, 1);
> > +}
> > +
> > +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
> > +{
> > +	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> > +	struct crypto_acomp_ctx *acomp_ctx;
> > +
> > +	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > +	zswap_delete_acomp_ctx(acomp_ctx);
> >
> >  	return 0;
> >  }
> 
> There are no other callers to these functions. Just do the work
> directly in the cpu callbacks here like it used to be.

There will be other callers to zswap_create_acomp_ctx() and
zswap_delete_acomp_ctx() in patches 10 and 11 of this series, when the
per-cpu "acomp_batch_ctx" is introduced in struct zswap_pool. I was trying
to modularize the code first, so as to split the changes into smaller commits.
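
For reference, this is the rough shape of the zswap_pool change in patch 10
(an illustrative sketch only, not the exact patch; the other existing members
are unchanged and omitted here):

	struct zswap_pool {
		struct zpool *zpool;
		struct crypto_acomp_ctx __percpu *acomp_ctx;
		/* New: per-cpu acomp_ctx with multiple reqs/buffers, for
		 * IAA compress batching; allocated lazily (see below). */
		struct crypto_acomp_ctx __percpu *acomp_batch_ctx;
		/* ... other existing members ... */
	};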

The per-cpu "acomp_batch_ctx" resources are allocated in patch 11 by
"zswap_pool_can_batch()", which sets up the batching resources for a given
cpu. This was to address Yosry's earlier comment about minimizing the
memory footprint cost of batching.

The way I decided to do this is to reuse the code that allocates the de-facto
pool->acomp_ctx for the selected compressor on all cpu's in zswap_pool_create().
However, I did not want to add the allocation of the acomp_batch_ctx's multiple
reqs/buffers to the cpuhp_state_add_instance() code path, since that would incur
the memory cost on all cpu's.

Instead, the approach I chose is to allocate the batching resources in patch 11
only as needed, on a given cpu that has to store a large folio; roughly, it looks
like the sketch below. Hope this explains the purpose of the modularization better.
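
This is a simplified sketch of the idea, not the exact patch 11 code: the
batch-size constant ZSWAP_MAX_BATCH_SIZE is a placeholder name, and
preemption/locking details are omitted:

	/* Called from zswap_store() before batch-compressing a large folio.
	 * Allocates this cpu's batching resources on first use, reusing
	 * zswap_create_acomp_ctx() with nr_reqs > 1. */
	static bool zswap_pool_can_batch(struct zswap_pool *pool)
	{
		struct crypto_acomp_ctx *acomp_batch_ctx =
			raw_cpu_ptr(pool->acomp_batch_ctx);

		/* Batching resources already set up for this cpu. */
		if (acomp_batch_ctx->nr_reqs == ZSWAP_MAX_BATCH_SIZE)
			return true;

		if (!zswap_create_acomp_ctx(raw_smp_processor_id(),
					    acomp_batch_ctx, pool->tfm_name,
					    ZSWAP_MAX_BATCH_SIZE))
			return true;

		return false;
	}

The key point is that the same create/delete helpers are reused, just with
nr_reqs > 1, and only on cpu's that actually store large folios.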

Other ideas towards accomplishing this are very welcome.

Thanks,
Kanchana

> 
> Otherwise it looks good to me.




