Re: [PATCH] zswap: do not crash the kernel on decompression failure

On Tue, Feb 25, 2025 at 01:32:00PM -0800, Nhat Pham wrote:
> Currently, we crash the kernel when a decompression failure occurs in
> zswap (either because of memory corruption, or a bug in the compression
> algorithm). This is overkill. We should only SIGBUS the unfortunate
> process asking for the zswap entry on zswap load, and skip the corrupted
> entry in zswap writeback.

It's probably worth adding more details here. For example, how the
SIGBUS is delivered (it's not super clear in the code), and the
distinction between handling decompression failures for loads and
writeback.
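
(For my own understanding: I assume the SIGBUS isn't sent explicitly by
zswap, but falls out of the folio being left !uptodate on a failed load,
i.e. the existing check in do_swap_page(), roughly:

	if (unlikely(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS;
		...
	}

If that's indeed the mechanism, spelling it out in the changelog and/or
a comment would help.)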

> 
> See [1] for a recent upstream discussion about this.
> 
> [1]: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@xxxxxxxxxxxxxxxxxxxx/
> 
> Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Suggested-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>

It's Yosry Ahmed <yosry.ahmed@xxxxxxxxx> now :P

> Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx>
> ---
>  mm/zswap.c | 85 +++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 58 insertions(+), 27 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index f6316b66fb23..31d4397eed61 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
>  static u64 zswap_reject_compress_fail;
>  /* Compressed page was too big for the allocator to (optimally) store */
>  static u64 zswap_reject_compress_poor;
> +/* Load and writeback failed due to decompression failure */

Nit: Load or writeback?

> +static u64 zswap_reject_decompress_fail;
>  /* Store failed because underlying allocator could not get memory */
>  static u64 zswap_reject_alloc_fail;
>  /* Store failed because the entry metadata could not be allocated (rare) */
> @@ -953,11 +955,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>  	return comp_ret == 0 && alloc_ret == 0;
>  }
>  
> -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>  {
>  	struct zpool *zpool = entry->pool->zpool;
>  	struct scatterlist input, output;
>  	struct crypto_acomp_ctx *acomp_ctx;
> +	bool ret = true;
>  	u8 *src;
>  
>  	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -984,12 +987,19 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>  	sg_init_table(&output, 1);
>  	sg_set_folio(&output, folio, PAGE_SIZE, 0);
>  	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
> -	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
> -	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> +	if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait) ||
> +			acomp_ctx->req->dlen != PAGE_SIZE) {
> +		ret = false;
> +		zswap_reject_decompress_fail++;
> +		pr_alert_ratelimited(
> +			"decompression failed on zswap entry with offset %08lx\n",
> +			entry->swpentry.val);
> +	}

This can probably be prettier if we save the return value of
crypto_wait_req() in a variable, and then do the check at the end of the
function:

zswap_decompress()
{
	mutex_lock();
	...
	ret = crypto_wait_req(..);
	...
	mutex_unlock();
	...
	if (ret || acomp_ctx->req->dlen != PAGE_SIZE) {
		/* SHIT */
		return false;
	}
	return true;
}
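
i.e. roughly something like the below (untested, reusing the names from
your diff; note that req->dlen probably needs to be sampled before
dropping the mutex, since the acomp request is per-CPU and shared):

static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
	struct zpool *zpool = entry->pool->zpool;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	unsigned int dlen;
	int decomp_ret;
	u8 *src;

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(&acomp_ctx->mutex);

	/* ... map the handle, set up src and the input/output sgs as before ... */

	decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
				     &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;	/* sample before dropping the mutex */
	mutex_unlock(&acomp_ctx->mutex);

	if (src != acomp_ctx->buffer)
		zpool_unmap_handle(zpool, entry->handle);

	if (decomp_ret || dlen != PAGE_SIZE) {
		zswap_reject_decompress_fail++;
		pr_alert_ratelimited(
			"decompression failed on zswap entry with offset %08lx\n",
			entry->swpentry.val);
		return false;
	}
	return true;
}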


>  	mutex_unlock(&acomp_ctx->mutex);
>  
>  	if (src != acomp_ctx->buffer)
>  		zpool_unmap_handle(zpool, entry->handle);
> +	return ret;
>  }
>  
>  /*********************************
> @@ -1018,6 +1028,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	struct writeback_control wbc = {
>  		.sync_mode = WB_SYNC_NONE,
>  	};
> +	int ret = 0;
>  
>  	/* try to allocate swap cache folio */
>  	mpol = get_task_policy(current);
> @@ -1034,8 +1045,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	 * and freed when invalidated by the concurrent shrinker anyway.
>  	 */
>  	if (!folio_was_allocated) {
> -		folio_put(folio);
> -		return -EEXIST;
> +		ret = -EEXIST;
> +		goto put_folio;
>  	}
>  
>  	/*
> @@ -1048,14 +1059,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	 * be dereferenced.
>  	 */
>  	tree = swap_zswap_tree(swpentry);
> -	if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
> -		delete_from_swap_cache(folio);
> -		folio_unlock(folio);
> -		folio_put(folio);
> -		return -ENOMEM;
> +	if (entry != xa_load(tree, offset)) {
> +		ret = -ENOMEM;
> +		goto fail;
>  	}
>  
> -	zswap_decompress(entry, folio);
> +	if (!zswap_decompress(entry, folio)) {
> +		ret = -EIO;
> +		goto fail;
> +	}
> +
> +	xa_erase(tree, offset);
>  
>  	count_vm_event(ZSWPWB);
>  	if (entry->objcg)
> @@ -1071,9 +1085,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  
>  	/* start writeback */
>  	__swap_writepage(folio, &wbc);
> -	folio_put(folio);
> +	goto put_folio;
>  
> -	return 0;
> +fail:
> +	delete_from_swap_cache(folio);
> +	folio_unlock(folio);
> +put_folio:
> +	folio_put(folio);
> +	return ret;
>  }
>  
>  /*********************************
> @@ -1600,6 +1619,29 @@ bool zswap_load(struct folio *folio)
>  	if (WARN_ON_ONCE(folio_test_large(folio)))
>  		return true;
>  
> +	/*
> +	 * We cannot invalidate the zswap entry before decompressing it. If
> +	 * decompression fails, we must keep the entry in the tree so that
> +	 * a future read by another process on the same swap entry will also
> +	 * have to go through zswap. Otherwise, we risk silently reading
> +	 * corrupted data for the other process.
> +	 */
> +	entry = xa_load(tree, offset);

We are doing load + erase here and in the writeback path now, so two
xarray walks instead of one. How does this affect performance? We had a
similar discussion about the possibility of doing a lockless xas_load()
followed by a conditional xas_erase() for zswap_invalidate():

https://lore.kernel.org/lkml/20241018192525.95862-1-ryncsn@xxxxxxxxx/

Unfortunately it seems like we can't trivially do that unless we keep
the tree locked, which we probably don't want to do throughout
decompression.
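
IOW, a single-walk version on the load side would have to look
something like this (purely illustrative, untested), which is exactly
the shape we want to avoid:

	XA_STATE(xas, tree, offset);

	xas_lock(&xas);
	entry = xas_load(&xas);
	/*
	 * Can't actually do this: zswap_decompress() sleeps (acomp mutex,
	 * crypto wait) while the xarray spinlock is held.
	 */
	if (entry && zswap_decompress(entry, folio))
		xas_store(&xas, NULL);
	xas_unlock(&xas);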

How crazy would it be to remove the entry from the tree and re-add it if
decompression fails? Does swapcache_prepare() provide sufficient
protection for us to do this without anyone else looking at this entry
(seems like it)?
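
Something hand-wavy like:

	entry = xa_erase(tree, offset);
	...
	if (!zswap_decompress(entry, folio)) {
		/* put it back so future loads still go through zswap */
		xa_store(tree, offset, entry, GFP_KERNEL);
		...
	}

modulo xa_store() itself being able to fail with -ENOMEM, which would
need handling.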

Anyway, this is all moot if the second walk is not noticeable from a
perf perspective.



