[PATCH v6 16/16] mm: zswap: Fix for zstd performance regression with 2M folios.

Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx> · Wed, 5 Feb 2025 23:21:02 -0800

With the previous patch that enables support for batch compressions in
zswap_compress_folio(), a 6.2% throughput regression was seen with zstd and
2M folios, using vm-scalability/usemem.

For compressors that don't support batching, this was root-caused to the
following zswap_store_folio() structure:

 Batched stores:
 ---------------
 - Allocate all entries,
 - Compress all entries,
 - Store all entries in xarray/LRU.

Hence, the above structure is maintained only for batched stores, and the
following structure is implemented for sequential stores of large folio pages,
that fixes the zstd regression, while preserving common code paths for batched
and sequential stores of a folio:

 Sequential stores:
 ------------------
 For each page in folio:
  - allocate an entry,
  - compress the page,
  - store the entry in xarray/LRU.

This is submitted as a separate patch only for code review purposes. I will
squash this with the previous commit in subsequent versions of this
patch-series.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx>
---
 mm/zswap.c | 193 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 111 insertions(+), 82 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index f1cba77eda62..7bfc720a6201 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1592,40 +1592,32 @@ static bool zswap_batch_compress(struct folio *folio,
 	return ret;
 }
 
-static bool zswap_compress_folio(struct folio *folio,
-				 struct zswap_entry *entries[],
-				 struct zswap_pool *pool)
+static __always_inline bool zswap_compress_folio(struct folio *folio,
+						 struct zswap_entry *entries[],
+						 long from_index,
+						 struct zswap_pool *pool,
+						 unsigned int batch_size,
+						 struct crypto_acomp_ctx *acomp_ctx)
 {
-	long index, nr_pages = folio_nr_pages(folio);
-	struct crypto_acomp_ctx *acomp_ctx;
-	unsigned int batch_size;
+	long index = from_index, nr_pages = folio_nr_pages(folio);
 	bool ret = true;
 
-	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
-	batch_size = acomp_ctx->nr_reqs;
-
 	if ((batch_size > 1) && (nr_pages > 1)) {
-		for (index = 0; index < nr_pages; index += batch_size) {
+		for (; index < nr_pages; index += batch_size) {
 
 			if (!zswap_batch_compress(folio, index, batch_size,
 						  &entries[index], pool, acomp_ctx)) {
 				ret = false;
-				goto unlock_acomp_ctx;
+				break;
 			}
 		}
 	} else {
-		for (index = 0; index < nr_pages; ++index) {
-			struct page *page = folio_page(folio, index);
+		struct page *page = folio_page(folio, index);
 
-			if (!zswap_compress(page, entries[index], pool, acomp_ctx)) {
-				ret = false;
-				goto unlock_acomp_ctx;
-			}
-		}
+		if (!zswap_compress(page, entries[index], pool, acomp_ctx))
+			ret = false;
 	}
 
-unlock_acomp_ctx:
-	acomp_ctx_put_unlock(acomp_ctx);
 	return ret;
 }
 
@@ -1637,92 +1629,128 @@ static bool zswap_compress_folio(struct folio *folio,
  * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
  * entry's handle is subsequently modified only upon a successful zpool_malloc()
  * after the page is compressed.
+ *
+ * For compressors that don't support batching, the following structure
+ * showed a performance regression with zstd/2M folios:
+ *
+ * Batched stores:
+ * ---------------
+ *  - Allocate all entries,
+ *  - Compress all entries,
+ *  - Store all entries in xarray/LRU.
+ *
+ * Hence, the above structure is maintained only for batched stores, and the
+ * following structure is implemented for sequential stores of large folio pages,
+ * that fixes the regression, while preserving common code paths for batched
+ * and sequential stores of a folio:
+ *
+ * Sequential stores:
+ * ------------------
+ * For each page in folio:
+ *  - allocate an entry,
+ *  - compress the page,
+ *  - store the entry in xarray/LRU.
  */
 static bool zswap_store_folio(struct folio *folio,
 			       struct obj_cgroup *objcg,
 			       struct zswap_pool *pool)
 {
-	long index, from_index = 0, nr_pages = folio_nr_pages(folio);
+	long index = 0, from_index = 0, nr_pages, nr_folio_pages = folio_nr_pages(folio);
 	struct zswap_entry **entries = NULL;
+	struct crypto_acomp_ctx *acomp_ctx;
 	int node_id = folio_nid(folio);
+	unsigned int batch_size;
 
-	entries = kmalloc(nr_pages * sizeof(*entries), GFP_KERNEL);
+	entries = kmalloc(nr_folio_pages * sizeof(*entries), GFP_KERNEL);
 	if (!entries)
 		return false;
 
-	for (index = 0; index < nr_pages; ++index) {
-		entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+	batch_size = acomp_ctx->nr_reqs;
 
-		if (!entries[index]) {
-			zswap_reject_kmemcache_fail++;
-			nr_pages = index;
-			goto store_folio_failed;
+	nr_pages = (batch_size > 1) ? nr_folio_pages : 1;
+
+	while (1) {
+		for (index = from_index; index < nr_pages; ++index) {
+			entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+
+			if (!entries[index]) {
+				zswap_reject_kmemcache_fail++;
+				nr_pages = index;
+				goto store_folio_failed;
+			}
+
+			entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
 		}
 
-		entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
-	}
+		if (!zswap_compress_folio(folio, entries, from_index, pool, batch_size, acomp_ctx))
+			goto store_folio_failed;
 
-	if (!zswap_compress_folio(folio, entries, pool))
-		goto store_folio_failed;
+		for (index = from_index; index < nr_pages; ++index) {
+			swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
+			struct zswap_entry *old, *entry = entries[index];
 
-	for (index = 0; index < nr_pages; ++index) {
-		swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
-		struct zswap_entry *old, *entry = entries[index];
+			old = xa_store(swap_zswap_tree(page_swpentry),
+				swp_offset(page_swpentry),
+				entry, GFP_KERNEL);
+			if (xa_is_err(old)) {
+				int err = xa_err(old);
 
-		old = xa_store(swap_zswap_tree(page_swpentry),
-			       swp_offset(page_swpentry),
-			       entry, GFP_KERNEL);
-		if (xa_is_err(old)) {
-			int err = xa_err(old);
+				WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+				zswap_reject_alloc_fail++;
+				from_index = index;
+				goto store_folio_failed;
+			}
 
-			WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
-			zswap_reject_alloc_fail++;
-			from_index = index;
-			goto store_folio_failed;
-		}
+			/*
+			 * We may have had an existing entry that became stale when
+			 * the folio was redirtied and now the new version is being
+			 * swapped out. Get rid of the old.
+			 */
+			if (old)
+				zswap_entry_free(old);
 
-		/*
-		 * We may have had an existing entry that became stale when
-		 * the folio was redirtied and now the new version is being
-		 * swapped out. Get rid of the old.
-		 */
-		if (old)
-			zswap_entry_free(old);
+			/*
+			 * The entry is successfully compressed and stored in the tree, there is
+			 * no further possibility of failure. Grab refs to the pool and objcg,
+			 * charge zswap memory, and increment zswap_stored_pages.
+			 * The opposite actions will be performed by zswap_entry_free()
+			 * when the entry is removed from the tree.
+			 */
+			zswap_pool_get(pool);
+			if (objcg) {
+				obj_cgroup_get(objcg);
+				obj_cgroup_charge_zswap(objcg, entry->length);
+			}
+			atomic_long_inc(&zswap_stored_pages);
 
-		/*
-		 * The entry is successfully compressed and stored in the tree, there is
-		 * no further possibility of failure. Grab refs to the pool and objcg,
-		 * charge zswap memory, and increment zswap_stored_pages.
-		 * The opposite actions will be performed by zswap_entry_free()
-		 * when the entry is removed from the tree.
-		 */
-		zswap_pool_get(pool);
-		if (objcg) {
-			obj_cgroup_get(objcg);
-			obj_cgroup_charge_zswap(objcg, entry->length);
+			/*
+			 * We finish initializing the entry while it's already in xarray.
+			 * This is safe because:
+			 *
+			 * 1. Concurrent stores and invalidations are excluded by folio lock.
+			 *
+			 * 2. Writeback is excluded by the entry not being on the LRU yet.
+			 *    The publishing order matters to prevent writeback from seeing
+			 *    an incoherent entry.
+			 */
+			entry->pool = pool;
+			entry->swpentry = page_swpentry;
+			entry->objcg = objcg;
+			entry->referenced = true;
+			if (entry->length) {
+				INIT_LIST_HEAD(&entry->lru);
+				zswap_lru_add(&zswap_list_lru, entry);
+			}
 		}
-		atomic_long_inc(&zswap_stored_pages);
 
-		/*
-		 * We finish initializing the entry while it's already in xarray.
-		 * This is safe because:
-		 *
-		 * 1. Concurrent stores and invalidations are excluded by folio lock.
-		 *
-		 * 2. Writeback is excluded by the entry not being on the LRU yet.
-		 *    The publishing order matters to prevent writeback from seeing
-		 *    an incoherent entry.
-		 */
-		entry->pool = pool;
-		entry->swpentry = page_swpentry;
-		entry->objcg = objcg;
-		entry->referenced = true;
-		if (entry->length) {
-			INIT_LIST_HEAD(&entry->lru);
-			zswap_lru_add(&zswap_list_lru, entry);
-		}
+		from_index = nr_pages++;
+
+		if (nr_pages > nr_folio_pages)
+			break;
 	}
 
+	acomp_ctx_put_unlock(acomp_ctx);
 	kfree(entries);
 	return true;
 
@@ -1734,6 +1762,7 @@ static bool zswap_store_folio(struct folio *folio,
 		zswap_entry_cache_free(entries[index]);
 	}
 
+	acomp_ctx_put_unlock(acomp_ctx);
 	kfree(entries);
 	return false;
 }
-- 
2.27.0