This patch introduces zswap_batch_compress(), which takes a starting index
within a folio and sets up a request chain for compressing multiple pages
of that folio as a batch. When batch compressing a request chain in
zswap_batch_compress(), the call into the crypto layer is exactly the same
as in zswap_compress().

zswap_store_folio() is modified to detect whether the pool's acomp_ctx has
more than one request, i.e. "nr_reqs" > 1, which will be the case if the
CPU onlining code has allocated multiple batching resources in the
acomp_ctx. If so, compress batching can be used with a batch size of
"acomp_ctx->nr_reqs".

If compress batching can be used, zswap_store_folio() invokes
zswap_batch_compress() to compress and store the folio in batches of
"acomp_ctx->nr_reqs" pages. With Intel IAA, the iaa_crypto driver will
compress each batch of pages in parallel in hardware. In other words,
zswap_batch_compress() performs the same computation for a batch that
zswap_compress() performs for a single page, and returns true if the batch
was successfully compressed and stored, false otherwise.

If the pool does not support compress batching, or the folio has only one
page, zswap_store_folio() calls zswap_compress() for each individual page
in the folio, as before.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx>
---
 mm/zswap.c | 296 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 224 insertions(+), 72 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index ab9167220cb6..626574bd84f6 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1051,9 +1051,9 @@ static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
 }
 
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
-			   struct zswap_pool *pool)
+			   struct zswap_pool *pool,
+			   struct crypto_acomp_ctx *acomp_ctx)
 {
-	struct crypto_acomp_ctx *acomp_ctx;
 	struct scatterlist input, output;
 	int comp_ret = 0, alloc_ret = 0;
 	unsigned int dlen = PAGE_SIZE;
@@ -1063,7 +1063,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	gfp_t gfp;
 	u8 *dst;
 
-	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+	lockdep_assert_held(&acomp_ctx->mutex);
+
 	dst = acomp_ctx->buffers[0];
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -1091,7 +1092,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
 	dlen = acomp_ctx->reqs[0]->dlen;
 	if (comp_ret)
-		goto unlock;
+		goto check_errors;
 
 	zpool = pool->zpool;
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -1099,7 +1100,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
 	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
 	if (alloc_ret)
-		goto unlock;
+		goto check_errors;
 
 	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, dst, dlen);
@@ -1108,7 +1109,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	entry->handle = handle;
 	entry->length = dlen;
 
-unlock:
+check_errors:
 	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
 		zswap_reject_compress_poor++;
 	else if (comp_ret)
@@ -1116,7 +1117,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	else if (alloc_ret)
 		zswap_reject_alloc_fail++;
 
-	acomp_ctx_put_unlock(acomp_ctx);
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -1580,6 +1580,106 @@ static void shrink_worker(struct work_struct *w)
  * main API
  **********************************/
 
+/*
+ * Batch compress @nr_pages pages of @folio, starting at @index.
+ */
+static bool zswap_batch_compress(struct folio *folio,
+				 long index,
+				 unsigned int nr_pages,
+				 struct zswap_entry *entries[],
+				 struct zswap_pool *pool,
+				 struct crypto_acomp_ctx *acomp_ctx)
+{
+	struct scatterlist inputs[ZSWAP_MAX_BATCH_SIZE];
+	struct scatterlist outputs[ZSWAP_MAX_BATCH_SIZE];
+	unsigned int i;
+	int err = 0;
+
+	lockdep_assert_held(&acomp_ctx->mutex);
+
+	for (i = 0; i < nr_pages; ++i) {
+		struct page *page = folio_page(folio, index + i);
+
+		sg_init_table(&inputs[i], 1);
+		sg_set_page(&inputs[i], page, PAGE_SIZE, 0);
+
+		/*
+		 * Each dst buffer should be of size (PAGE_SIZE * 2).
+		 * Reflect same in sg_list.
+		 */
+		sg_init_one(&outputs[i], acomp_ctx->buffers[i], PAGE_SIZE * 2);
+		acomp_request_set_params(acomp_ctx->reqs[i], &inputs[i],
+					 &outputs[i], PAGE_SIZE, PAGE_SIZE);
+
+		/* Use acomp request chaining. */
+		if (i)
+			acomp_request_chain(acomp_ctx->reqs[i], acomp_ctx->reqs[0]);
+		else
+			acomp_reqchain_init(acomp_ctx->reqs[0], 0, crypto_req_done,
+					    &acomp_ctx->wait);
+	}
+
+	err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
+
+	/*
+	 * Get the individual compress errors from request chaining.
+	 */
+	for (i = 0; i < nr_pages; ++i) {
+		if (unlikely(acomp_request_err(acomp_ctx->reqs[i]))) {
+			err = -EINVAL;
+			if (acomp_request_err(acomp_ctx->reqs[i]) == -ENOSPC)
+				zswap_reject_compress_poor++;
+			else
+				zswap_reject_compress_fail++;
+		}
+	}
+
+	if (likely(!err)) {
+		/*
+		 * All batch pages were successfully compressed.
+		 * Store the pages in zpool.
+		 */
+		struct zpool *zpool = pool->zpool;
+		gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+
+		if (zpool_malloc_support_movable(zpool))
+			gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+
+		for (i = 0; i < nr_pages; ++i) {
+			unsigned long handle;
+			char *buf;
+
+			err = zpool_malloc(zpool, acomp_ctx->reqs[i]->dlen, gfp, &handle);
+
+			if (err) {
+				if (err == -ENOSPC)
+					zswap_reject_compress_poor++;
+				else
+					zswap_reject_alloc_fail++;
+
+				break;
+			}
+
+			buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+			memcpy(buf, acomp_ctx->buffers[i], acomp_ctx->reqs[i]->dlen);
+			zpool_unmap_handle(zpool, handle);
+
+			entries[i]->handle = handle;
+			entries[i]->length = acomp_ctx->reqs[i]->dlen;
+		}
+	}
+
+	/*
+	 * Request chaining cleanup:
+	 *
+	 * - Clear the CRYPTO_TFM_REQ_CHAIN bit on acomp_ctx->reqs[0].
+	 * - Reset the acomp_ctx->wait to notify acomp_ctx->reqs[0].
+	 */
+	acomp_reqchain_clear(acomp_ctx->reqs[0], &acomp_ctx->wait);
+
+	return !err;
+}
+
 /*
  * Store all pages in a folio.
  *
@@ -1588,95 +1688,146 @@ static void shrink_worker(struct work_struct *w)
  * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
  * entry's handle is subsequently modified only upon a successful zpool_malloc()
  * after the page is compressed.
+ *
+ * For compressors that don't support batching, the following structure
+ * showed a performance regression with zstd using 64K as well as 2M folios:
+ *
+ * Batched stores:
+ * ---------------
+ * - Allocate all entries,
+ * - Compress all entries,
+ * - Store all entries in xarray/LRU.
+ *
+ * Hence, the above structure is maintained only for batched stores, and the
+ * following structure is implemented for sequential stores of large folio
+ * pages, which fixes the regression while preserving common code paths for
+ * batched and sequential stores of a folio:
+ *
+ * Sequential stores:
+ * ------------------
+ * For each page in folio:
+ * - allocate an entry,
+ * - compress the page,
+ * - store the entry in xarray/LRU.
  */
 static bool zswap_store_folio(struct folio *folio,
 			      struct obj_cgroup *objcg,
 			      struct zswap_pool *pool)
 {
-	long index, from_index = 0, nr_pages = folio_nr_pages(folio);
+	long index = 0, from_index = 0, nr_pages, nr_folio_pages = folio_nr_pages(folio);
 	struct zswap_entry **entries = NULL;
+	struct crypto_acomp_ctx *acomp_ctx;
 	int node_id = folio_nid(folio);
+	unsigned int batch_size;
+	bool batching;
 
-	entries = kmalloc(nr_pages * sizeof(*entries), GFP_KERNEL);
+	entries = kmalloc(nr_folio_pages * sizeof(*entries), GFP_KERNEL);
 	if (!entries)
 		return false;
 
-	for (index = from_index; index < nr_pages; ++index) {
-		entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
 
-		if (!entries[index]) {
-			zswap_reject_kmemcache_fail++;
-			nr_pages = index;
-			goto store_folio_failed;
-		}
+	batch_size = acomp_ctx->nr_reqs;
 
-		entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
-	}
+	nr_pages = (batch_size > 1) ? nr_folio_pages : 1;
+	batching = (nr_pages > 1) ? true : false;
 
-	for (index = from_index; index < nr_pages; ++index) {
-		struct page *page = folio_page(folio, index);
-		swp_entry_t page_swpentry = page_swap_entry(page);
-		struct zswap_entry *old, *entry = entries[index];
+	while (1) {
+		for (index = from_index; index < nr_pages; ++index) {
+			entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
 
-		if (!zswap_compress(page, entry, pool)) {
-			from_index = index;
-			goto store_folio_failed;
-		}
+			if (!entries[index]) {
+				zswap_reject_kmemcache_fail++;
+				nr_pages = index;
+				goto store_folio_failed;
+			}
 
-		old = xa_store(swap_zswap_tree(page_swpentry),
-			       swp_offset(page_swpentry),
-			       entry, GFP_KERNEL);
-		if (xa_is_err(old)) {
-			int err = xa_err(old);
+			entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
+		}
 
-			WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
-			zswap_reject_alloc_fail++;
-			from_index = index;
-			goto store_folio_failed;
+		if (batching) {
+			/* Batch compress the pages in the folio. */
+			for (index = from_index; index < nr_pages; index += batch_size) {
+
+				if (!zswap_batch_compress(folio, index,
+							  min((unsigned int)(nr_pages - index),
+							      batch_size),
+							  &entries[index], pool, acomp_ctx))
+					goto store_folio_failed;
+			}
+		} else {
+			/* Sequential compress the next page in the folio. */
+			struct page *page = folio_page(folio, from_index);
+
+			if (!zswap_compress(page, entries[from_index], pool, acomp_ctx))
+				goto store_folio_failed;
 		}
 
-		/*
-		 * We may have had an existing entry that became stale when
-		 * the folio was redirtied and now the new version is being
-		 * swapped out. Get rid of the old.
-		 */
-		if (old)
-			zswap_entry_free(old);
+		for (index = from_index; index < nr_pages; ++index) {
+			swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
+			struct zswap_entry *old, *entry = entries[index];
 
-		/*
-		 * The entry is successfully compressed and stored in the tree, there is
-		 * no further possibility of failure. Grab refs to the pool and objcg,
-		 * charge zswap memory, and increment zswap_stored_pages.
-		 * The opposite actions will be performed by zswap_entry_free()
-		 * when the entry is removed from the tree.
-		 */
-		zswap_pool_get(pool);
-		if (objcg) {
-			obj_cgroup_get(objcg);
-			obj_cgroup_charge_zswap(objcg, entry->length);
-		}
-		atomic_long_inc(&zswap_stored_pages);
+			old = xa_store(swap_zswap_tree(page_swpentry),
+				       swp_offset(page_swpentry),
+				       entry, GFP_KERNEL);
+			if (xa_is_err(old)) {
+				int err = xa_err(old);
 
-		/*
-		 * We finish initializing the entry while it's already in xarray.
-		 * This is safe because:
-		 *
-		 * 1. Concurrent stores and invalidations are excluded by folio lock.
-		 *
-		 * 2. Writeback is excluded by the entry not being on the LRU yet.
-		 *    The publishing order matters to prevent writeback from seeing
-		 *    an incoherent entry.
-		 */
-		entry->pool = pool;
-		entry->swpentry = page_swpentry;
-		entry->objcg = objcg;
-		entry->referenced = true;
-		if (entry->length) {
-			INIT_LIST_HEAD(&entry->lru);
-			zswap_lru_add(&zswap_list_lru, entry);
+				WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+				zswap_reject_alloc_fail++;
+				from_index = index;
+				goto store_folio_failed;
+			}
+
+			/*
+			 * We may have had an existing entry that became stale when
+			 * the folio was redirtied and now the new version is being
+			 * swapped out. Get rid of the old.
+			 */
+			if (old)
+				zswap_entry_free(old);
+
+			/*
+			 * The entry is successfully compressed and stored in the tree, there is
+			 * no further possibility of failure. Grab refs to the pool and objcg,
+			 * charge zswap memory, and increment zswap_stored_pages.
+			 * The opposite actions will be performed by zswap_entry_free()
+			 * when the entry is removed from the tree.
+			 */
+			zswap_pool_get(pool);
+			if (objcg) {
+				obj_cgroup_get(objcg);
+				obj_cgroup_charge_zswap(objcg, entry->length);
+			}
+			atomic_long_inc(&zswap_stored_pages);
+
+			/*
+			 * We finish initializing the entry while it's already in xarray.
+			 * This is safe because:
+			 *
+			 * 1. Concurrent stores and invalidations are excluded by folio lock.
+			 *
+			 * 2. Writeback is excluded by the entry not being on the LRU yet.
+			 *    The publishing order matters to prevent writeback from seeing
+			 *    an incoherent entry.
+			 */
+			entry->pool = pool;
+			entry->swpentry = page_swpentry;
+			entry->objcg = objcg;
+			entry->referenced = true;
+			if (entry->length) {
+				INIT_LIST_HEAD(&entry->lru);
+				zswap_lru_add(&zswap_list_lru, entry);
+			}
 		}
+
+		from_index = nr_pages++;
+
+		if (nr_pages > nr_folio_pages)
+			break;
 	}
 
+	acomp_ctx_put_unlock(acomp_ctx);
 	kfree(entries);
 	return true;
 
@@ -1688,6 +1839,7 @@ static bool zswap_store_folio(struct folio *folio,
 			zswap_entry_cache_free(entries[index]);
 	}
 
+	acomp_ctx_put_unlock(acomp_ctx);
 	kfree(entries);
 	return false;
 }
-- 
2.27.0
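For reference, below is a condensed, userspace-only sketch of the store-loop
sizing implemented in zswap_store_folio() above: with batching, the while (1)
loop makes a single pass over all folio pages; without batching, it advances
one page per pass, so both cases share the same code path. The process_range()
helper and the values in main() are placeholders for illustration only, not
zswap or kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/* Placeholder for the per-pass work (allocate entries, compress, store). */
static void process_range(long from, long to)
{
	printf("process pages [%ld, %ld)\n", from, to);
}

/*
 * Mirrors the loop sizing in zswap_store_folio(): one pass over the whole
 * folio when batching, one page per pass otherwise.
 */
static void store_folio(long nr_folio_pages, bool batching)
{
	long from_index = 0;
	long nr_pages = batching ? nr_folio_pages : 1;

	while (1) {
		process_range(from_index, nr_pages);

		from_index = nr_pages++;
		if (nr_pages > nr_folio_pages)
			break;
	}
}

int main(void)
{
	store_folio(4, true);	/* one pass: pages [0, 4) */
	store_folio(4, false);	/* four passes: [0,1) [1,2) [2,3) [3,4) */
	return 0;
}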