On Wed, Feb 5, 2025 at 11:21 PM Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx> wrote:
>
> With the previous patch that enables support for batch compressions in
> zswap_compress_folio(), a 6.2% throughput regression was seen with zstd and
> 2M folios, using vm-scalability/usemem.
>
> For compressors that don't support batching, this was root-caused to the
> following zswap_store_folio() structure:
>
>  Batched stores:
>  ---------------
>  - Allocate all entries,
>  - Compress all entries,
>  - Store all entries in xarray/LRU.

Can you clarify why the above structure leads to a performance regression?

>
> Hence, the above structure is maintained only for batched stores, and the
> following structure is implemented for sequential stores of large folio pages,
> that fixes the zstd regression, while preserving common code paths for batched
> and sequential stores of a folio:
>
>  Sequential stores:
>  ------------------
>  For each page in folio:
>   - allocate an entry,
>   - compress the page,
>   - store the entry in xarray/LRU.
>
> This is submitted as a separate patch only for code review purposes. I will
> squash this with the previous commit in subsequent versions of this
> patch-series.
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@xxxxxxxxx>
> ---
>  mm/zswap.c | 193 ++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 111 insertions(+), 82 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index f1cba77eda62..7bfc720a6201 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1592,40 +1592,32 @@ static bool zswap_batch_compress(struct folio *folio,
>  	return ret;
>  }
>
> -static bool zswap_compress_folio(struct folio *folio,
> -				 struct zswap_entry *entries[],
> -				 struct zswap_pool *pool)
> +static __always_inline bool zswap_compress_folio(struct folio *folio,
> +						 struct zswap_entry *entries[],
> +						 long from_index,
> +						 struct zswap_pool *pool,
> +						 unsigned int batch_size,
> +						 struct crypto_acomp_ctx *acomp_ctx)
>  {
> -	long index, nr_pages = folio_nr_pages(folio);
> -	struct crypto_acomp_ctx *acomp_ctx;
> -	unsigned int batch_size;
> +	long index = from_index, nr_pages = folio_nr_pages(folio);
>  	bool ret = true;
>
> -	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
> -	batch_size = acomp_ctx->nr_reqs;
> -
>  	if ((batch_size > 1) && (nr_pages > 1)) {
> -		for (index = 0; index < nr_pages; index += batch_size) {
> +		for (; index < nr_pages; index += batch_size) {
>
>  			if (!zswap_batch_compress(folio, index, batch_size,
>  						  &entries[index], pool, acomp_ctx)) {
>  				ret = false;
> -				goto unlock_acomp_ctx;
> +				break;
>  			}
>  		}
>  	} else {
> -		for (index = 0; index < nr_pages; ++index) {
> -			struct page *page = folio_page(folio, index);
> +		struct page *page = folio_page(folio, index);
>
> -			if (!zswap_compress(page, entries[index], pool, acomp_ctx)) {
> -				ret = false;
> -				goto unlock_acomp_ctx;
> -			}
> -		}
> +		if (!zswap_compress(page, entries[index], pool, acomp_ctx))
> +			ret = false;
>  	}
>
> -unlock_acomp_ctx:
> -	acomp_ctx_put_unlock(acomp_ctx);
>  	return ret;
>  }
>
> @@ -1637,92 +1629,128 @@ static bool zswap_compress_folio(struct folio *folio,
>   * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
>   * entry's handle is subsequently modified only upon a successful zpool_malloc()
>   * after the page is compressed.
> + *
> + * For compressors that don't support batching, the following structure
> + * showed a performance regression with zstd/2M folios:
> + *
> + * Batched stores:
> + * ---------------
> + * - Allocate all entries,
> + * - Compress all entries,
> + * - Store all entries in xarray/LRU.
> + *
> + * Hence, the above structure is maintained only for batched stores, and the
> + * following structure is implemented for sequential stores of large folio pages,
> + * that fixes the regression, while preserving common code paths for batched
> + * and sequential stores of a folio:
> + *
> + * Sequential stores:
> + * ------------------
> + * For each page in folio:
> + *  - allocate an entry,
> + *  - compress the page,
> + *  - store the entry in xarray/LRU.
>   */
>  static bool zswap_store_folio(struct folio *folio,
>  			      struct obj_cgroup *objcg,
>  			      struct zswap_pool *pool)
>  {
> -	long index, from_index = 0, nr_pages = folio_nr_pages(folio);
> +	long index = 0, from_index = 0, nr_pages, nr_folio_pages = folio_nr_pages(folio);
>  	struct zswap_entry **entries = NULL;
> +	struct crypto_acomp_ctx *acomp_ctx;
>  	int node_id = folio_nid(folio);
> +	unsigned int batch_size;
>
> -	entries = kmalloc(nr_pages * sizeof(*entries), GFP_KERNEL);
> +	entries = kmalloc(nr_folio_pages * sizeof(*entries), GFP_KERNEL);
>  	if (!entries)
>  		return false;
>
> -	for (index = 0; index < nr_pages; ++index) {
> -		entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
> +	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
> +	batch_size = acomp_ctx->nr_reqs;
>
> -		if (!entries[index]) {
> -			zswap_reject_kmemcache_fail++;
> -			nr_pages = index;
> -			goto store_folio_failed;
> +	nr_pages = (batch_size > 1) ? nr_folio_pages : 1;
> +
> +	while (1) {
> +		for (index = from_index; index < nr_pages; ++index) {
> +			entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
> +
> +			if (!entries[index]) {
> +				zswap_reject_kmemcache_fail++;
> +				nr_pages = index;
> +				goto store_folio_failed;
> +			}
> +
> +			entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
>  		}
>
> -		entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
> -	}
> +		if (!zswap_compress_folio(folio, entries, from_index, pool, batch_size, acomp_ctx))
> +			goto store_folio_failed;
>
> -	if (!zswap_compress_folio(folio, entries, pool))
> -		goto store_folio_failed;
> +		for (index = from_index; index < nr_pages; ++index) {
> +			swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
> +			struct zswap_entry *old, *entry = entries[index];
>
> -	for (index = 0; index < nr_pages; ++index) {
> -		swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
> -		struct zswap_entry *old, *entry = entries[index];
> +			old = xa_store(swap_zswap_tree(page_swpentry),
> +				       swp_offset(page_swpentry),
> +				       entry, GFP_KERNEL);
> +			if (xa_is_err(old)) {
> +				int err = xa_err(old);
>
> -		old = xa_store(swap_zswap_tree(page_swpentry),
> -			       swp_offset(page_swpentry),
> -			       entry, GFP_KERNEL);
> -		if (xa_is_err(old)) {
> -			int err = xa_err(old);
> +				WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> +				zswap_reject_alloc_fail++;
> +				from_index = index;
> +				goto store_folio_failed;
> +			}
>
> -			WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> -			zswap_reject_alloc_fail++;
> -			from_index = index;
> -			goto store_folio_failed;
> -		}
> +			/*
> +			 * We may have had an existing entry that became stale when
> +			 * the folio was redirtied and now the new version is being
> +			 * swapped out. Get rid of the old.
> +			 */
> +			if (old)
> +				zswap_entry_free(old);
>
> -		/*
> -		 * We may have had an existing entry that became stale when
> -		 * the folio was redirtied and now the new version is being
> -		 * swapped out. Get rid of the old.
> -		 */
> -		if (old)
> -			zswap_entry_free(old);
> +			/*
> +			 * The entry is successfully compressed and stored in the tree, there is
> +			 * no further possibility of failure. Grab refs to the pool and objcg,
> +			 * charge zswap memory, and increment zswap_stored_pages.
> +			 * The opposite actions will be performed by zswap_entry_free()
> +			 * when the entry is removed from the tree.
> +			 */
> +			zswap_pool_get(pool);
> +			if (objcg) {
> +				obj_cgroup_get(objcg);
> +				obj_cgroup_charge_zswap(objcg, entry->length);
> +			}
> +			atomic_long_inc(&zswap_stored_pages);
>
> -		/*
> -		 * The entry is successfully compressed and stored in the tree, there is
> -		 * no further possibility of failure. Grab refs to the pool and objcg,
> -		 * charge zswap memory, and increment zswap_stored_pages.
> -		 * The opposite actions will be performed by zswap_entry_free()
> -		 * when the entry is removed from the tree.
> -		 */
> -		zswap_pool_get(pool);
> -		if (objcg) {
> -			obj_cgroup_get(objcg);
> -			obj_cgroup_charge_zswap(objcg, entry->length);
> +			/*
> +			 * We finish initializing the entry while it's already in xarray.
> +			 * This is safe because:
> +			 *
> +			 * 1. Concurrent stores and invalidations are excluded by folio lock.
> +			 *
> +			 * 2. Writeback is excluded by the entry not being on the LRU yet.
> +			 *    The publishing order matters to prevent writeback from seeing
> +			 *    an incoherent entry.
> +			 */
> +			entry->pool = pool;
> +			entry->swpentry = page_swpentry;
> +			entry->objcg = objcg;
> +			entry->referenced = true;
> +			if (entry->length) {
> +				INIT_LIST_HEAD(&entry->lru);
> +				zswap_lru_add(&zswap_list_lru, entry);
> +			}
>  		}
> -		atomic_long_inc(&zswap_stored_pages);
>
> -		/*
> -		 * We finish initializing the entry while it's already in xarray.
> -		 * This is safe because:
> -		 *
> -		 * 1. Concurrent stores and invalidations are excluded by folio lock.
> -		 *
> -		 * 2. Writeback is excluded by the entry not being on the LRU yet.
> -		 *    The publishing order matters to prevent writeback from seeing
> -		 *    an incoherent entry.
> -		 */
> -		entry->pool = pool;
> -		entry->swpentry = page_swpentry;
> -		entry->objcg = objcg;
> -		entry->referenced = true;
> -		if (entry->length) {
> -			INIT_LIST_HEAD(&entry->lru);
> -			zswap_lru_add(&zswap_list_lru, entry);
> -		}
> +		from_index = nr_pages++;
> +
> +		if (nr_pages > nr_folio_pages)
> +			break;
>  	}
>
> +	acomp_ctx_put_unlock(acomp_ctx);
>  	kfree(entries);
>  	return true;
>
> @@ -1734,6 +1762,7 @@ static bool zswap_store_folio(struct folio *folio,
>  			zswap_entry_cache_free(entries[index]);
>  	}
>
> +	acomp_ctx_put_unlock(acomp_ctx);
>  	kfree(entries);
>  	return false;
>  }
> --
> 2.27.0
>
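
To make sure I am reading the two structures correctly, here is a minimal
userspace sketch of how I understand the difference. This is illustrative
only: NR_PAGES, BATCH_SIZE and the helpers (alloc_entry, compress_page,
compress_batch, store_entry) are stand-ins I made up, not zswap APIs.

/* sketch.c: contrast the batched vs. sequential store structures. */
#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES   8
#define BATCH_SIZE 4

/* Stand-ins for entry allocation, compression and xarray/LRU insertion. */
static bool alloc_entry(long i)    { printf("alloc    %ld\n", i); return true; }
static bool compress_page(long i)  { printf("compress %ld\n", i); return true; }
static bool compress_batch(long i, long n)
{
	printf("compress batch [%ld, %ld)\n", i, i + n);
	return true;
}
static void store_entry(long i)    { printf("store    %ld\n", i); }

/* Batched stores: three separate passes over the folio's pages. */
static bool store_batched(void)
{
	long i;

	for (i = 0; i < NR_PAGES; i++)
		if (!alloc_entry(i))
			return false;

	for (i = 0; i < NR_PAGES; i += BATCH_SIZE)
		if (!compress_batch(i, BATCH_SIZE))
			return false;

	for (i = 0; i < NR_PAGES; i++)
		store_entry(i);

	return true;
}

/* Sequential stores: one pass, alloc/compress/store for each page. */
static bool store_sequential(void)
{
	long i;

	for (i = 0; i < NR_PAGES; i++) {
		if (!alloc_entry(i) || !compress_page(i))
			return false;
		store_entry(i);
	}

	return true;
}

int main(void)
{
	printf("-- batched --\n");
	store_batched();
	printf("-- sequential --\n");
	store_sequential();
	return 0;
}

Compiling and running this just prints the order of operations in each case;
I am mainly asking so the reasoning for why the first ordering regresses with
non-batching compressors like zstd can be captured in the commit message.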