Re: [PATCH] zswap: do not crash the kernel on decompression failure

On Tue, Feb 25, 2025 at 4:51 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> On Tue, Feb 25, 2025 at 01:32:00PM -0800, Nhat Pham wrote:
> > Currently, we crash the kernel when a decompression failure occurs in
> > zswap (either because of memory corruption, or a bug in the compression
> > algorithm). This is overkill. We should only SIGBUS the unfortunate
> > process asking for the zswap entry on zswap load, and skip the corrupted
> > entry in zswap writeback.
> >
> > See [1] for a recent upstream discussion about this.
> >
> > [1]: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@xxxxxxxxxxxxxxxxxxxx/
> >
> > Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> > Suggested-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> > Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx>
> > ---
> >  mm/zswap.c | 85 +++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 58 insertions(+), 27 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index f6316b66fb23..31d4397eed61 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
> >  static u64 zswap_reject_compress_fail;
> >  /* Compressed page was too big for the allocator to (optimally) store */
> >  static u64 zswap_reject_compress_poor;
> > +/* Load and writeback failed due to decompression failure */
> > +static u64 zswap_reject_decompress_fail;
>
> "reject" refers to "rejected store", so the name doesn't quite make
> sense. zswap_decompress_fail?

SGTM :)
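
Something like this for v2, I suppose (a minimal sketch, assuming we
also drop the "reject" wording from the comment):

	/* Load or writeback failed due to decompression failure */
	static u64 zswap_decompress_fail;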

>
> >  /* Store failed because underlying allocator could not get memory */
> >  static u64 zswap_reject_alloc_fail;
> >  /* Store failed because the entry metadata could not be allocated (rare) */
> > @@ -953,11 +955,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> >       return comp_ret == 0 && alloc_ret == 0;
> >  }
> >
> > -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> > +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> >  {
> >       struct zpool *zpool = entry->pool->zpool;
> >       struct scatterlist input, output;
> >       struct crypto_acomp_ctx *acomp_ctx;
> > +     bool ret = true;
> >       u8 *src;
> >
> >       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > @@ -984,12 +987,19 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> >       sg_init_table(&output, 1);
> >       sg_set_folio(&output, folio, PAGE_SIZE, 0);
> >       acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
> > -     BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
> > -     BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> > +     if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait) ||
> > +                     acomp_ctx->req->dlen != PAGE_SIZE) {
> > +             ret = false;
> > +             zswap_reject_decompress_fail++;
> > +             pr_alert_ratelimited(
> > +                     "decompression failed on zswap entry with offset %08lx\n",
> > +                     entry->swpentry.val);
>
> Since this is a pretty dramatic failure scenario, IMO it would be
> useful to dump as much info as possible.
>
> The exact return value of crypto_wait_req() could be useful,
> entry->length and req->dlen too.
>
> entry->pool->tfm_name just to make absolute sure there is no
> confusion, as the compressor can be switched for new stores.
>
> Maybe swp_type() and swp_offset() of entry->swpentry? Those could be
> easy markers to see if the entry was corrupted for example.

I'll include them all :)
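
Something along these lines for v2 (an untested sketch - the exact
format string is up for debate, and it assumes the counter rename from
above plus a local int for the crypto_wait_req() return value):

	decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
				     &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;
	if (decomp_ret || dlen != PAGE_SIZE) {
		ret = false;
		zswap_decompress_fail++;
		pr_alert_ratelimited(
			"Decompression error from zswap (%d:%lu %s %u->%d)\n",
			swp_type(entry->swpentry),
			swp_offset(entry->swpentry),
			entry->pool->tfm_name, entry->length, dlen);
	}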

>
> > +     }
> >       mutex_unlock(&acomp_ctx->mutex);
> >
> >       if (src != acomp_ctx->buffer)
> >               zpool_unmap_handle(zpool, entry->handle);
> > +     return ret;
> >  }
> >
> >  /*********************************
> > @@ -1018,6 +1028,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >       struct writeback_control wbc = {
> >               .sync_mode = WB_SYNC_NONE,
> >       };
> > +     int ret = 0;
> >
> >       /* try to allocate swap cache folio */
> >       mpol = get_task_policy(current);
> > @@ -1034,8 +1045,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >        * and freed when invalidated by the concurrent shrinker anyway.
> >        */
> >       if (!folio_was_allocated) {
> > -             folio_put(folio);
> > -             return -EEXIST;
> > +             ret = -EEXIST;
> > +             goto put_folio;
> >       }
> >
> >       /*
> > @@ -1048,14 +1059,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >        * be dereferenced.
> >        */
> >       tree = swap_zswap_tree(swpentry);
> > -     if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
> > -             delete_from_swap_cache(folio);
> > -             folio_unlock(folio);
> > -             folio_put(folio);
> > -             return -ENOMEM;
> > +     if (entry != xa_load(tree, offset)) {
> > +             ret = -ENOMEM;
> > +             goto fail;
> >       }
> >
> > -     zswap_decompress(entry, folio);
> > +     if (!zswap_decompress(entry, folio)) {
> > +             ret = -EIO;
> > +             goto fail;
> > +     }
> > +
> > +     xa_erase(tree, offset);
> >
> >       count_vm_event(ZSWPWB);
> >       if (entry->objcg)
> > @@ -1071,9 +1085,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >
> >       /* start writeback */
> >       __swap_writepage(folio, &wbc);
> > -     folio_put(folio);
> > +     goto put_folio;
> >
> > -     return 0;
> > +fail:
> > +     delete_from_swap_cache(folio);
> > +     folio_unlock(folio);
> > +put_folio:
> > +     folio_put(folio);
> > +     return ret;
> >  }
>
> Nice, yeah it's time for factoring out the error unwinding. If you
> write it like this, you can save a jump in the main sequence:
>
>         __swap_writepage(folio, &wbc);
>         ret = 0;
> put:
>         folio_put(folio);
>         return ret;
> delete_unlock:
>         delete_from_swap_cache(folio);
>         folio_unlock(folio);
>         goto put;

SGTM :)
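
So the tail of zswap_writeback_entry() would become something like this
(a sketch; the accounting between xa_erase() and __swap_writepage()
stays as it is in the patch):

	if (entry != xa_load(tree, offset)) {
		ret = -ENOMEM;
		goto delete_unlock;
	}

	if (!zswap_decompress(entry, folio)) {
		ret = -EIO;
		goto delete_unlock;
	}

	xa_erase(tree, offset);

	/* ZSWPWB accounting, folio_mark_uptodate() etc. unchanged */

	/* start writeback */
	__swap_writepage(folio, &wbc);
	ret = 0;
put:
	folio_put(folio);
	return ret;
delete_unlock:
	delete_from_swap_cache(folio);
	folio_unlock(folio);
	goto put;
}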

>
> >
> >  /*********************************
> > @@ -1600,6 +1619,29 @@ bool zswap_load(struct folio *folio)
> >       if (WARN_ON_ONCE(folio_test_large(folio)))
> >               return true;
> >
> > +     /*
> > +      * We cannot invalidate the zswap entry before decompressing it. If
> > +      * decompression fails, we must keep the entry in the tree so that
> > +      * a future read by another process on the same swap entry will also
> > +      * have to go through zswap. Otherwise, we risk silently reading
> > +      * corrupted data for the other process.
> > +      */
> > +     entry = xa_load(tree, offset);
> > +     if (!entry)
> > +             return false;
>
> The explanation in the comment makes sense in the context of this
> change. But fresh eyes reading this function and having never seen
> that this *used to* open with xa_erase() will be confused. It answers
> questions the reader doesn't have at this point - it's just a function
> called zswap_load() beginning with an xa_load(), so what?
>
> At first I was going to suggest moving it down to the swapcache
> branch. But honestly after reading *that* comment again, in the new
> sequence, I don't think the question will arise there either. It's
> pretty self-evident that the whole "we can invalidate when reading
> into the swapcache" does not apply if the read failed.
>
> > +     /*
> > +      * If decompression fails, we return true to notify the caller that the
> > +      * folio's data were in zswap, but do not mark the folio as up-to-date.
> > +      * This will effectively SIGBUS the calling process.
> > +      */
>
> It would be good to put a lampshade on this weirdness that the return
> value has nothing to do with success or not. This wasn't as important
> a distinction, but with actual decompression failures I think it is.
>
> Something like this?
>
>         if (!zswap_decompress(entry, folio)) {
>                 /*
>                  * The zswap_load() return value doesn't indicate success or
>                  * failure, but whether zswap owns the swapped out contents.
>                  * This MUST return true here, otherwise swap_readpage() will
>                  * read garbage from the backend.
>                  *
>                  * Success is signaled by marking the folio uptodate.
>                  */
>                 return true;
>         }

Per Yosry's comment, I'll just move this into zswap_load()'s function
documentation.
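
i.e. something like this above zswap_load(), wording TBD:

/*
 * Note: the zswap_load() return value doesn't indicate success or
 * failure, but whether zswap owns the swapped out contents. This MUST
 * return true here, otherwise swap_readpage() will read garbage from
 * the backend. Success is signaled by marking the folio uptodate.
 */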

>
>         folio_mark_uptodate(folio);
>
> > +     if (!zswap_decompress(entry, folio))
> > +             return true;
> > +
> > +     count_vm_event(ZSWPIN);
> > +     if (entry->objcg)
> > +             count_objcg_events(entry->objcg, ZSWPIN, 1);
> > +
> >       /*
> >        * When reading into the swapcache, invalidate our entry. The
> >        * swapcache can be the authoritative owner of the page and
> > @@ -1612,21 +1654,8 @@ bool zswap_load(struct folio *folio)
> >        * files, which reads into a private page and may free it if
> >        * the fault fails. We remain the primary owner of the entry.)
> >        */
> > -     if (swapcache)
> > -             entry = xa_erase(tree, offset);
> > -     else
> > -             entry = xa_load(tree, offset);
> > -
> > -     if (!entry)
> > -             return false;
> > -
> > -     zswap_decompress(entry, folio);
> > -
> > -     count_vm_event(ZSWPIN);
> > -     if (entry->objcg)
> > -             count_objcg_events(entry->objcg, ZSWPIN, 1);
> > -
> >       if (swapcache) {
> > +             xa_erase(tree, offset);
> >               zswap_entry_free(entry);
> >               folio_mark_dirty(folio);
> >       }
