This patch provides the functionality that processes a "zswap_batch"
in which swap_read_folio() has previously stored swap entries found in
zswap, so that they can be loaded as a batch.

The newly added zswap_finish_load_batch() API implements the main zswap
load batching functionality. It uses the zswap_entry/xarray/page/
source-length sub-batches made available by zswap_add_load_batch(), and
processes these sub-batch arrays one at a time until the entire zswap
folio_batch has been loaded. The existing zswap_load() behavior of
deleting zswap_entries for folios found in the swapcache is preserved.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx>
---
 include/linux/zswap.h |  22 ++++++
 mm/page_io.c          |  35 +++++++++
 mm/swap.h             |  17 +++++
 mm/zswap.c            | 171 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 245 insertions(+)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 1d6de281f243..a0792c2b300a 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -110,6 +110,15 @@ struct zswap_store_pipeline_state {
 	u8 nr_comp_pages;
 };
 
+/* Note: If SWAP_CRYPTO_SUB_BATCH_SIZE exceeds 256, change the u8 to u16. */
+struct zswap_load_sub_batch_state {
+	struct xarray **trees;
+	struct zswap_entry **entries;
+	struct page **pages;
+	unsigned int *slens;
+	u8 nr_decomp;
+};
+
 bool zswap_store_batching_enabled(void);
 void __zswap_store_batch(struct swap_in_memory_cache_cb *simc);
 void __zswap_store_batch_single(struct swap_in_memory_cache_cb *simc);
@@ -136,6 +145,14 @@ static inline bool zswap_add_load_batch(
 	return false;
 }
 
+void __zswap_finish_load_batch(struct zswap_decomp_batch *zd_batch);
+static inline void zswap_finish_load_batch(
+	struct zswap_decomp_batch *zd_batch)
+{
+	if (zswap_load_batching_enabled())
+		__zswap_finish_load_batch(zd_batch);
+}
+
 unsigned long zswap_total_pages(void);
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
@@ -188,6 +205,11 @@ static inline bool zswap_add_load_batch(
 	return false;
 }
 
+static inline void zswap_finish_load_batch(
+	struct zswap_decomp_batch *zd_batch)
+{
+}
+
 static inline bool zswap_store(struct folio *folio)
 {
 	return false;
diff --git a/mm/page_io.c b/mm/page_io.c
index 9750302d193b..aa83221318ef 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -816,6 +816,41 @@ bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
 	return true;
 }
 
+static void __swap_post_process_zswap_load_batch(
+	struct zswap_decomp_batch *zswap_batch)
+{
+	u8 i;
+
+	for (i = 0; i < folio_batch_count(&zswap_batch->fbatch); ++i) {
+		struct folio *folio = zswap_batch->fbatch.folios[i];
+		folio_unlock(folio);
+	}
+}
+
+/*
+ * The swapin_readahead batching interface makes sure that the
+ * input zswap_batch consists of folios belonging to the same swap
+ * device type.
+ */
+void __swap_read_zswap_batch_unplug(struct zswap_decomp_batch *zswap_batch,
+				    struct swap_iocb **splug)
+{
+	unsigned long pflags;
+
+	if (!folio_batch_count(&zswap_batch->fbatch))
+		return;
+
+	psi_memstall_enter(&pflags);
+	delayacct_swapin_start();
+
+	/* Load the zswap batch. */
+	zswap_finish_load_batch(zswap_batch);
+	__swap_post_process_zswap_load_batch(zswap_batch);
+
+	psi_memstall_leave(&pflags);
+	delayacct_swapin_end();
+}
+
 void __swap_read_unplug(struct swap_iocb *sio)
 {
 	struct iov_iter from;
diff --git a/mm/swap.h b/mm/swap.h
index 310f99007fe6..2b82c8ed765c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -125,6 +125,16 @@ struct swap_iocb;
 bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
 		     struct zswap_decomp_batch *zswap_batch,
 		     struct folio_batch *non_zswap_batch);
+void __swap_read_zswap_batch_unplug(
+	struct zswap_decomp_batch *zswap_batch,
+	struct swap_iocb **splug);
+static inline void swap_read_zswap_batch_unplug(
+	struct zswap_decomp_batch *zswap_batch,
+	struct swap_iocb **splug)
+{
+	if (likely(zswap_batch))
+		__swap_read_zswap_batch_unplug(zswap_batch, splug);
+}
 void __swap_read_unplug(struct swap_iocb *plug);
 static inline void swap_read_unplug(struct swap_iocb *plug)
 {
@@ -268,6 +278,13 @@ static inline bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
 {
 	return false;
 }
+
+static inline void swap_read_zswap_batch_unplug(
+	struct zswap_decomp_batch *zswap_batch,
+	struct swap_iocb **splug)
+{
+}
+
 static inline void swap_write_unplug(struct swap_iocb *sio)
 {
 }
diff --git a/mm/zswap.c b/mm/zswap.c
index 1d293f95d525..39bf7d8810e9 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -35,6 +35,7 @@
 #include <linux/pagemap.h>
 #include <linux/workqueue.h>
 #include <linux/list_lru.h>
+#include <linux/delayacct.h>
 #include "swap.h"
 #include "internal.h"
 
@@ -2401,6 +2402,176 @@ bool __zswap_add_load_batch(struct zswap_decomp_batch *zd_batch,
 	return true;
 }
 
+static __always_inline void zswap_load_sub_batch_init(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls)
+{
+	zls->trees = zd_batch->trees[sb];
+	zls->entries = zd_batch->entries[sb];
+	zls->pages = zd_batch->pages[sb];
+	zls->slens = zd_batch->slens[sb];
+	zls->nr_decomp = zd_batch->nr_decomp[sb];
+}
+
+static void zswap_load_map_sources(
+	struct zswap_load_sub_batch_state *zls,
+	u8 *srcs[])
+{
+	u8 i;
+
+	for (i = 0; i < zls->nr_decomp; ++i) {
+		struct zswap_entry *entry = zls->entries[i];
+		struct zpool *zpool = entry->pool->zpool;
+		u8 *buf = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+		memcpy(srcs[i], buf, entry->length);
+		zpool_unmap_handle(zpool, entry->handle);
+	}
+}
+
+static void zswap_decompress_batch(
+	struct zswap_load_sub_batch_state *zls,
+	u8 *srcs[],
+	int decomp_errors[])
+{
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	acomp_ctx = raw_cpu_ptr(zls->entries[0]->pool->acomp_ctx);
+
+	swap_crypto_acomp_decompress_batch(
+		srcs,
+		zls->pages,
+		zls->slens,
+		decomp_errors,
+		zls->nr_decomp,
+		acomp_ctx);
+}
+
+static void zswap_load_batch_updates(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls,
+	int decomp_errors[])
+{
+	unsigned int j;
+	u8 i;
+
+	for (i = 0; i < zls->nr_decomp; ++i) {
+		j = (sb * SWAP_CRYPTO_SUB_BATCH_SIZE) + i;
+		struct folio *folio = zd_batch->fbatch.folios[j];
+		struct zswap_entry *entry = zls->entries[i];
+
+		BUG_ON(decomp_errors[i]);
+		count_vm_event(ZSWPIN);
+		if (entry->objcg)
+			count_objcg_events(entry->objcg, ZSWPIN, 1);
+
+		if (zd_batch->swapcache[j]) {
+			zswap_entry_free(entry);
+			folio_mark_dirty(folio);
+		}
+
+		folio_mark_uptodate(folio);
+	}
+}
+
+static void zswap_load_decomp_batch(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls)
+{
+	int decomp_errors[SWAP_CRYPTO_SUB_BATCH_SIZE];
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	acomp_ctx = raw_cpu_ptr(zls->entries[0]->pool->acomp_ctx);
+	mutex_lock(&acomp_ctx->mutex);
+
+	zswap_load_map_sources(zls, acomp_ctx->buffer);
+
+	zswap_decompress_batch(zls, acomp_ctx->buffer, decomp_errors);
+
+	mutex_unlock(&acomp_ctx->mutex);
+
+	zswap_load_batch_updates(zd_batch, sb, zls, decomp_errors);
+}
+
+static void zswap_load_start_accounting(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls,
+	bool workingset[],
+	bool in_thrashing[])
+{
+	unsigned int j;
+	u8 i;
+
+	for (i = 0; i < zls->nr_decomp; ++i) {
+		j = (sb * SWAP_CRYPTO_SUB_BATCH_SIZE) + i;
+		struct folio *folio = zd_batch->fbatch.folios[j];
+		workingset[i] = folio_test_workingset(folio);
+		if (workingset[i])
+			delayacct_thrashing_start(&in_thrashing[i]);
+	}
+}
+
+static void zswap_load_end_accounting(
+	struct zswap_decomp_batch *zd_batch,
+	struct zswap_load_sub_batch_state *zls,
+	bool workingset[],
+	bool in_thrashing[])
+{
+	u8 i;
+
+	for (i = 0; i < zls->nr_decomp; ++i)
+		if (workingset[i])
+			delayacct_thrashing_end(&in_thrashing[i]);
+}
+
+/*
+ * All entries in a zd_batch belong to the same swap device.
+ */
+void __zswap_finish_load_batch(struct zswap_decomp_batch *zd_batch)
+{
+	struct zswap_load_sub_batch_state zls;
+	unsigned int nr_folios = folio_batch_count(&zd_batch->fbatch);
+	unsigned int nr_sb = DIV_ROUND_UP(nr_folios, SWAP_CRYPTO_SUB_BATCH_SIZE);
+	unsigned int sb;
+
+	/*
+	 * Process the zd_batch in sub-batches of
+	 * SWAP_CRYPTO_SUB_BATCH_SIZE.
+	 */
+	for (sb = 0; sb < nr_sb; ++sb) {
+		bool workingset[SWAP_CRYPTO_SUB_BATCH_SIZE];
+		bool in_thrashing[SWAP_CRYPTO_SUB_BATCH_SIZE];
+
+		zswap_load_sub_batch_init(zd_batch, sb, &zls);
+
+		zswap_load_start_accounting(zd_batch, sb, &zls,
+					    workingset, in_thrashing);
+
+		/* Decompress the batch. */
+		if (zls.nr_decomp)
+			zswap_load_decomp_batch(zd_batch, sb, &zls);
+
+		/*
+		 * Should we free zswap_entries, as in zswap_load():
+		 * With the new swapin_readahead batching interface,
+		 * all prefetch entries are read into the swapcache.
+		 * Freeing the zswap entries here causes segfaults,
+		 * most probably because a page-fault occurred while
+		 * the buffer was being decompressed.
+		 * Allowing the regular folio_free_swap() sequence
+		 * in do_swap_page() appears to keep things stable
+		 * without duplicated zswap-swapcache memory, as far
+		 * as I can tell from my testing.
+		 */
+
+		zswap_load_end_accounting(zd_batch, &zls,
+					  workingset, in_thrashing);
+	}
+}
+
 void zswap_invalidate(swp_entry_t swp)
 {
 	pgoff_t offset = swp_offset(swp);
-- 
2.27.0
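
For context, a minimal, illustrative sketch (not part of this patch) of
how a batched swapin path is expected to drive the new interface:
swap_read_folio() defers folios whose entries are found in zswap into
the zswap_decomp_batch, and the caller later unplugs that batch so
zswap_finish_load_batch() can decompress the accumulated sub-batches in
one go. The names example_batched_swapin() and zswap_load_batch_init()
below are assumptions for illustration (the latter is expected from an
earlier patch in this series); the real swapin_readahead() changes are
in a separate patch.

static void example_batched_swapin(struct folio **folios, unsigned int nr,
				   struct swap_iocb **splug)
{
	struct zswap_decomp_batch zswap_batch;
	struct folio_batch non_zswap_batch;
	unsigned int i;

	/* Assumed init helper from an earlier patch in this series. */
	zswap_load_batch_init(&zswap_batch);
	folio_batch_init(&non_zswap_batch);

	for (i = 0; i < nr; i++) {
		/*
		 * Folios whose entries are found in zswap are deferred
		 * into zswap_batch; other folios go down the regular
		 * swap read path (non_zswap_batch submission omitted
		 * here for brevity).
		 */
		swap_read_folio(folios[i], splug, &zswap_batch,
				&non_zswap_batch);
	}

	/* Decompress the whole zswap batch and unlock its folios. */
	swap_read_zswap_batch_unplug(&zswap_batch, splug);
}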