Currently, we crash the kernel when a decompression failure occurs in
zswap (either because of memory corruption, or a bug in the compression
algorithm). This is overkill. We should only SIGBUS the unfortunate
process asking for the zswap entry on zswap load, and skip the
corrupted entry in zswap writeback. See [1] for a recent upstream
discussion about this.

[1]: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@xxxxxxxxxxxxxxxxxxxx/

Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Suggested-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx>
---
 mm/zswap.c | 85 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 27 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..31d4397eed61 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
 static u64 zswap_reject_compress_fail;
 /* Compressed page was too big for the allocator to (optimally) store */
 static u64 zswap_reject_compress_poor;
+/* Load and writeback failed due to decompression failure */
+static u64 zswap_reject_decompress_fail;
 /* Store failed because underlying allocator could not get memory */
 static u64 zswap_reject_alloc_fail;
 /* Store failed because the entry metadata could not be allocated (rare) */
@@ -953,11 +955,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
-static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 {
 	struct zpool *zpool = entry->pool->zpool;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
+	bool ret = true;
 	u8 *src;
 
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -984,12 +987,19 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	sg_init_table(&output, 1);
 	sg_set_folio(&output, folio, PAGE_SIZE, 0);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
-	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
-	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+	if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait) ||
+	    acomp_ctx->req->dlen != PAGE_SIZE) {
+		ret = false;
+		zswap_reject_decompress_fail++;
+		pr_alert_ratelimited(
+			"decompression failed on zswap entry with offset %08lx\n",
+			entry->swpentry.val);
+	}
 	mutex_unlock(&acomp_ctx->mutex);
 
 	if (src != acomp_ctx->buffer)
 		zpool_unmap_handle(zpool, entry->handle);
+	return ret;
 }
 
 /*********************************
@@ -1018,6 +1028,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_NONE,
 	};
+	int ret = 0;
 
 	/* try to allocate swap cache folio */
 	mpol = get_task_policy(current);
@@ -1034,8 +1045,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	 * and freed when invalidated by the concurrent shrinker anyway.
 	 */
 	if (!folio_was_allocated) {
-		folio_put(folio);
-		return -EEXIST;
+		ret = -EEXIST;
+		goto put_folio;
 	}
 
 	/*
@@ -1048,14 +1059,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	 * be dereferenced.
 	 */
 	tree = swap_zswap_tree(swpentry);
-	if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
-		delete_from_swap_cache(folio);
-		folio_unlock(folio);
-		folio_put(folio);
-		return -ENOMEM;
+	if (entry != xa_load(tree, offset)) {
+		ret = -ENOMEM;
+		goto fail;
 	}
 
-	zswap_decompress(entry, folio);
+	if (!zswap_decompress(entry, folio)) {
+		ret = -EIO;
+		goto fail;
+	}
+
+	xa_erase(tree, offset);
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
@@ -1071,9 +1085,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 
 	/* start writeback */
 	__swap_writepage(folio, &wbc);
-	folio_put(folio);
+	goto put_folio;
 
-	return 0;
+fail:
+	delete_from_swap_cache(folio);
+	folio_unlock(folio);
+put_folio:
+	folio_put(folio);
+	return ret;
 }
 
 /*********************************
@@ -1600,6 +1619,29 @@ bool zswap_load(struct folio *folio)
 	if (WARN_ON_ONCE(folio_test_large(folio)))
 		return true;
 
+	/*
+	 * We cannot invalidate the zswap entry before decompressing it. If
+	 * decompression fails, we must keep the entry in the tree so that
+	 * a future read by another process on the same swap entry will also
+	 * have to go through zswap. Otherwise, we risk silently reading
+	 * corrupted data for the other process.
+	 */
+	entry = xa_load(tree, offset);
+	if (!entry)
+		return false;
+
+	/*
+	 * If decompression fails, we return true to notify the caller that the
+	 * folio's data were in zswap, but do not mark the folio as up-to-date.
+	 * This will effectively SIGBUS the calling process.
+	 */
+	if (!zswap_decompress(entry, folio))
+		return true;
+
+	count_vm_event(ZSWPIN);
+	if (entry->objcg)
+		count_objcg_events(entry->objcg, ZSWPIN, 1);
+
 	/*
 	 * When reading into the swapcache, invalidate our entry. The
 	 * swapcache can be the authoritative owner of the page and
@@ -1612,21 +1654,8 @@ bool zswap_load(struct folio *folio)
 	 * files, which reads into a private page and may free it if
 	 * the fault fails. We remain the primary owner of the entry.)
 	 */
-	if (swapcache)
-		entry = xa_erase(tree, offset);
-	else
-		entry = xa_load(tree, offset);
-
-	if (!entry)
-		return false;
-
-	zswap_decompress(entry, folio);
-
-	count_vm_event(ZSWPIN);
-	if (entry->objcg)
-		count_objcg_events(entry->objcg, ZSWPIN, 1);
-
 	if (swapcache) {
+		xa_erase(tree, offset);
 		zswap_entry_free(entry);
 		folio_mark_dirty(folio);
 	}
@@ -1727,6 +1756,8 @@ static int zswap_debugfs_init(void)
 			   zswap_debugfs_root, &zswap_reject_compress_fail);
 	debugfs_create_u64("reject_compress_poor", 0444,
 			   zswap_debugfs_root, &zswap_reject_compress_poor);
+	debugfs_create_u64("reject_decompress_fail", 0444,
+			   zswap_debugfs_root, &zswap_reject_decompress_fail);
 	debugfs_create_u64("written_back_pages", 0444,
 			   zswap_debugfs_root, &zswap_written_back_pages);
 	debugfs_create_file("pool_total_size", 0444,
-- 
2.43.5
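
[ Note for reviewers, not part of the patch: the "effectively SIGBUS"
  behavior described above is enforced by the page fault path, not by
  zswap itself. When zswap_load() returns true without marking the
  folio up-to-date, the failed read surfaces at the uptodate check in
  do_swap_page(). A heavily condensed sketch of that check, paraphrased
  from mm/memory.c (exact code varies by kernel version):

	swap_read_folio(folio, NULL);	/* may end up in zswap_load() */
	...
	if (unlikely(!folio_test_uptodate(folio))) {
		/*
		 * zswap hit a decompression failure and refused to
		 * mark the folio up-to-date, so the fault is failed
		 * with SIGBUS instead of mapping corrupted data.
		 */
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}
]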