On Tue, Feb 25, 2025 at 4:51 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote: > > On Tue, Feb 25, 2025 at 01:32:00PM -0800, Nhat Pham wrote: > > Currently, we crash the kernel when a decompression failure occurs in > > zswap (either because of memory corruption, or a bug in the compression > > algorithm). This is overkill. We should only SIGBUS the unfortunate > > process asking for the zswap entry on zswap load, and skip the corrupted > > entry in zswap writeback. > > > > See [1] for a recent upstream discussion about this. > > > > [1]: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@xxxxxxxxxxxxxxxxxxxx/ > > > > Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx> > > Suggested-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx> > > Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx> > > --- > > mm/zswap.c | 85 +++++++++++++++++++++++++++++++++++++----------------- > > 1 file changed, 58 insertions(+), 27 deletions(-) > > > > diff --git a/mm/zswap.c b/mm/zswap.c > > index f6316b66fb23..31d4397eed61 100644 > > --- a/mm/zswap.c > > +++ b/mm/zswap.c > > @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail; > > static u64 zswap_reject_compress_fail; > > /* Compressed page was too big for the allocator to (optimally) store */ > > static u64 zswap_reject_compress_poor; > > +/* Load and writeback failed due to decompression failure */ > > +static u64 zswap_reject_decompress_fail; > > "reject" refers to "rejected store", so the name doesn't quite make > sense. zswap_decompress_fail? SGTM :) > > > /* Store failed because underlying allocator could not get memory */ > > static u64 zswap_reject_alloc_fail; > > /* Store failed because the entry metadata could not be allocated (rare) */ > > @@ -953,11 +955,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, > > return comp_ret == 0 && alloc_ret == 0; > > } > > > > -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) > > +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) > > { > > struct zpool *zpool = entry->pool->zpool; > > struct scatterlist input, output; > > struct crypto_acomp_ctx *acomp_ctx; > > + bool ret = true; > > u8 *src; > > > > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); > > @@ -984,12 +987,19 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) > > sg_init_table(&output, 1); > > sg_set_folio(&output, folio, PAGE_SIZE, 0); > > acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); > > - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); > > - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); > > + if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait) || > > + acomp_ctx->req->dlen != PAGE_SIZE) { > > + ret = false; > > + zswap_reject_decompress_fail++; > > + pr_alert_ratelimited( > > + "decompression failed on zswap entry with offset %08lx\n", > > + entry->swpentry.val); > > Since this is a pretty dramatic failure scenario, IMO it would be > useful to dump as much info as possible. > > The exact return value of crypto_wait_req() could be useful, > entry->length and req->dlen too. > > entry->pool->tfm_name just to make absolute sure there is no > confusion, as the compressor can be switched for new stores. > > Maybe swp_type() and swp_offset() of entry->swpentry? Those could be > easy markers to see if the entry was corrupted for example. I'll include them all :) > > > + } > > mutex_unlock(&acomp_ctx->mutex); > > > > if (src != acomp_ctx->buffer) > > zpool_unmap_handle(zpool, entry->handle); > > + return ret; > > } > > > > /********************************* > > @@ -1018,6 +1028,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, > > struct writeback_control wbc = { > > .sync_mode = WB_SYNC_NONE, > > }; > > + int ret = 0; > > > > /* try to allocate swap cache folio */ > > mpol = get_task_policy(current); > > @@ -1034,8 +1045,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, > > * and freed when invalidated by the concurrent shrinker anyway. > > */ > > if (!folio_was_allocated) { > > - folio_put(folio); > > - return -EEXIST; > > + ret = -EEXIST; > > + goto put_folio; > > } > > > > /* > > @@ -1048,14 +1059,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry, > > * be dereferenced. > > */ > > tree = swap_zswap_tree(swpentry); > > - if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { > > - delete_from_swap_cache(folio); > > - folio_unlock(folio); > > - folio_put(folio); > > - return -ENOMEM; > > + if (entry != xa_load(tree, offset)) { > > + ret = -ENOMEM; > > + goto fail; > > } > > > > - zswap_decompress(entry, folio); > > + if (!zswap_decompress(entry, folio)) { > > + ret = -EIO; > > + goto fail; > > + } > > + > > + xa_erase(tree, offset); > > > > count_vm_event(ZSWPWB); > > if (entry->objcg) > > @@ -1071,9 +1085,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry, > > > > /* start writeback */ > > __swap_writepage(folio, &wbc); > > - folio_put(folio); > > + goto put_folio; > > > > - return 0; > > +fail: > > + delete_from_swap_cache(folio); > > + folio_unlock(folio); > > +put_folio: > > + folio_put(folio); > > + return ret; > > } > > Nice, yeah it's time for factoring out the error unwinding. If you > write it like this, you can save a jump in the main sequence: > > __swap_writepage(folio, &wbc); > ret = 0; > put: > folio_put(folio); > return ret; > delete_unlock: > delete_from_swap_cache(folio); > folio_unlock(folio); > goto put; SGTM :) > > > > > /********************************* > > @@ -1600,6 +1619,29 @@ bool zswap_load(struct folio *folio) > > if (WARN_ON_ONCE(folio_test_large(folio))) > > return true; > > > > + /* > > + * We cannot invalidate the zswap entry before decompressing it. If > > + * decompression fails, we must keep the entry in the tree so that > > + * a future read by another process on the same swap entry will also > > + * have to go through zswap. Otherwise, we risk silently reading > > + * corrupted data for the other process. > > + */ > > + entry = xa_load(tree, offset); > > + if (!entry) > > + return false; > > The explanation in the comment makes sense in the context of this > change. But fresh eyes reading this function and having never seen > that this *used to* open with xa_erase() will be confused. It answers > questions the reader doesn't have at this point - it's just a function > called zswap_load() beginning with an xa_load(), so what? > > At first I was going to suggest moving it down to the swapcache > branch. But honestly after reading *that* comment again, in the new > sequence, I don't think the question will arise there either. It's > pretty self-evident that the whole "we can invalidate when reading > into the swapcache" does not apply if the read failed. > > > + /* > > + * If decompression fails, we return true to notify the caller that the > > + * folio's data were in zswap, but do not mark the folio as up-to-date. > > + * This will effectively SIGBUS the calling process. > > + */ > > It would be good to put a lampshade on this weirdness that the return > value has nothing to do with success or not. This wasn't as important > a distinction, but with actual decompression failures I think it is. > > Something like this? > > if (!zswap_decompress(entry, folio)) { > /* > * The zswap_load() return value doesn't indicate success or > * failure, but whether zswap owns the swapped out contents. > * This MUST return true here, otherwise swap_readpage() will > * read garbage from the backend. > * > * Success is signaled by marking the folio uptodate. > */ > return true; > } Per Yosry's comment, I'll just move this into zswap_load()'s function documentation. > > folio_mark_uptodate(folio); > > > + if (!zswap_decompress(entry, folio)) > > + return true; > > + > > + count_vm_event(ZSWPIN); > > + if (entry->objcg) > > + count_objcg_events(entry->objcg, ZSWPIN, 1); > > + > > /* > > * When reading into the swapcache, invalidate our entry. The > > * swapcache can be the authoritative owner of the page and > > @@ -1612,21 +1654,8 @@ bool zswap_load(struct folio *folio) > > * files, which reads into a private page and may free it if > > * the fault fails. We remain the primary owner of the entry.) > > */ > > - if (swapcache) > > - entry = xa_erase(tree, offset); > > - else > > - entry = xa_load(tree, offset); > > - > > - if (!entry) > > - return false; > > - > > - zswap_decompress(entry, folio); > > - > > - count_vm_event(ZSWPIN); > > - if (entry->objcg) > > - count_objcg_events(entry->objcg, ZSWPIN, 1); > > - > > if (swapcache) { > > + xa_erase(tree, offset); > > zswap_entry_free(entry); > > folio_mark_dirty(folio); > > }