Just a polite nudge on this if anyone has any feedback? Alternatively,
happy to repost against -rc1 if that's preferred?

On 19/10/2023 12:05, Ryan Roberts wrote:
> Previously zswap would refuse to store any folio bigger than order-0,
> and therefore all of those folios would be sent directly to the swap
> file. This is a minor inconvenience since swap can currently only
> support order-0 and PMD-sized THP, but with the pending introduction of
> "small-sized THP", and corresponding changes to swapfile to support any
> order of folio, these large folios will become more prevalent and,
> without this change, zswap will become effectively unusable.
> Independently of the "small-sized THP" feature, this change makes it
> possible to store existing PMD-sized THPs in zswap.
>
> Modify zswap_store() to allow storing large folios. The function is
> split into two parts: zswap_store() does all the per-folio operations
> (i.e. checking there is enough space, etc), then calls a new helper,
> zswap_store_page(), for each page in the folio; each page is stored
> as its own entry in the zswap pool. (These entries continue to be
> loaded back individually as single pages.) If a store fails for any
> single page, then all previously stored pages of the folio are
> invalidated.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
> ---
> I've tested this on arm64 (M2) with zswap enabled, by running
> vm-scalability's `usemem` across multiple cores from within a
> memory-constrained memcg to force high volumes of swap. I've also run
> mm selftests and observed no regressions (although there is nothing
> [z]swap specific there) - does zswap have any specific tests I should
> run?
>
> This is based on mm-stable, since mm-unstable contains a zswap patch
> known to be buggy [1]. I thought it would be best to get comments on
> the shape, then do the rebase after that patch has been fixed.
>
> For context, small-sized THP is being discussed here [2], and I'm
> working on changes to swapfile to support non-PMD-sized large folios
> here [3].
>
> [1] https://lore.kernel.org/linux-mm/21606fe5-fb9b-4d37-98ab-38c96819893b@xxxxxxx/
> [2] https://lore.kernel.org/linux-mm/20230929114421.3761121-1-ryan.roberts@xxxxxxx/
> [3] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@xxxxxxx/
>
> Thanks,
> Ryan
>
>
>  mm/zswap.c | 155 +++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 98 insertions(+), 57 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 37d2b1cb2ecb..51cbfc4e1ef8 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1188,18 +1188,17 @@ static void zswap_fill_page(void *ptr, unsigned long value)
>          memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
>  }
>
> -bool zswap_store(struct folio *folio)
> +static bool zswap_store_page(struct folio *folio, long index,
> +                             struct obj_cgroup *objcg, struct zswap_pool *pool)
>  {
>          swp_entry_t swp = folio->swap;
>          int type = swp_type(swp);
> -        pgoff_t offset = swp_offset(swp);
> -        struct page *page = &folio->page;
> +        pgoff_t offset = swp_offset(swp) + index;
> +        struct page *page = folio_page(folio, index);
>          struct zswap_tree *tree = zswap_trees[type];
>          struct zswap_entry *entry, *dupentry;
>          struct scatterlist input, output;
>          struct crypto_acomp_ctx *acomp_ctx;
> -        struct obj_cgroup *objcg = NULL;
> -        struct zswap_pool *pool;
>          struct zpool *zpool;
>          unsigned int dlen = PAGE_SIZE;
>          unsigned long handle, value;
> @@ -1208,51 +1207,11 @@ bool zswap_store(struct folio *folio)
>          gfp_t gfp;
>          int ret;
>
> -        VM_WARN_ON_ONCE(!folio_test_locked(folio));
> -        VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> -
> -        /* Large folios aren't supported */
> -        if (folio_test_large(folio))
> -                return false;
> -
> -        if (!zswap_enabled || !tree)
> +        /* entry keeps the references if successfully stored. */
> +        if (!zswap_pool_get(pool))
>                  return false;
> -
> -        /*
> -         * If this is a duplicate, it must be removed before attempting to store
> -         * it, otherwise, if the store fails the old page won't be removed from
> -         * the tree, and it might be written back overriding the new data.
> -         */
> -        spin_lock(&tree->lock);
> -        dupentry = zswap_rb_search(&tree->rbroot, offset);
> -        if (dupentry) {
> -                zswap_duplicate_entry++;
> -                zswap_invalidate_entry(tree, dupentry);
> -        }
> -        spin_unlock(&tree->lock);
> -
> -        /*
> -         * XXX: zswap reclaim does not work with cgroups yet. Without a
> -         * cgroup-aware entry LRU, we will push out entries system-wide based on
> -         * local cgroup limits.
> -         */
> -        objcg = get_obj_cgroup_from_folio(folio);
> -        if (objcg && !obj_cgroup_may_zswap(objcg))
> -                goto reject;
> -
> -        /* reclaim space if needed */
> -        if (zswap_is_full()) {
> -                zswap_pool_limit_hit++;
> -                zswap_pool_reached_full = true;
> -                goto shrink;
> -        }
> -
> -        if (zswap_pool_reached_full) {
> -                if (!zswap_can_accept())
> -                        goto shrink;
> -                else
> -                        zswap_pool_reached_full = false;
> -        }
> +        if (objcg)
> +                obj_cgroup_get(objcg);
>
>          /* allocate entry */
>          entry = zswap_entry_cache_alloc(GFP_KERNEL);
> @@ -1260,6 +1219,8 @@ bool zswap_store(struct folio *folio)
>                  zswap_reject_kmemcache_fail++;
>                  goto reject;
>          }
> +        entry->objcg = objcg;
> +        entry->pool = pool;
>
>          if (zswap_same_filled_pages_enabled) {
>                  src = kmap_atomic(page);
> @@ -1277,11 +1238,6 @@ bool zswap_store(struct folio *folio)
>          if (!zswap_non_same_filled_pages_enabled)
>                  goto freepage;
>
> -        /* if entry is successfully added, it keeps the reference */
> -        entry->pool = zswap_pool_current_get();
> -        if (!entry->pool)
> -                goto freepage;
> -
>          /* compress */
>          acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>
> @@ -1337,7 +1293,6 @@ bool zswap_store(struct folio *folio)
>          entry->length = dlen;
>
>  insert_entry:
> -        entry->objcg = objcg;
>          if (objcg) {
>                  obj_cgroup_charge_zswap(objcg, entry->length);
>                  /* Account before objcg ref is moved to tree */
> @@ -1373,19 +1328,105 @@ bool zswap_store(struct folio *folio)
>
>  put_dstmem:
>          mutex_unlock(acomp_ctx->mutex);
> -        zswap_pool_put(entry->pool);
>  freepage:
>          zswap_entry_cache_free(entry);
>  reject:
>          if (objcg)
>                  obj_cgroup_put(objcg);
> +        zswap_pool_put(pool);
>          return false;
> +}
>
> +bool zswap_store(struct folio *folio)
> +{
> +        long nr_pages = folio_nr_pages(folio);
> +        swp_entry_t swp = folio->swap;
> +        int type = swp_type(swp);
> +        pgoff_t offset = swp_offset(swp);
> +        struct zswap_tree *tree = zswap_trees[type];
> +        struct zswap_entry *entry;
> +        struct obj_cgroup *objcg = NULL;
> +        struct zswap_pool *pool;
> +        bool ret = false;
> +        long i;
> +
> +        VM_WARN_ON_ONCE(!folio_test_locked(folio));
> +        VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> +
> +        if (!zswap_enabled || !tree)
> +                return false;
> +
> +        /*
> +         * If this is a duplicate, it must be removed before attempting to store
> +         * it, otherwise, if the store fails the old page won't be removed from
> +         * the tree, and it might be written back overriding the new data.
> +         */
> +        spin_lock(&tree->lock);
> +        for (i = 0; i < nr_pages; i++) {
> +                entry = zswap_rb_search(&tree->rbroot, offset + i);
> +                if (entry) {
> +                        zswap_duplicate_entry++;
> +                        zswap_invalidate_entry(tree, entry);
> +                }
> +        }
> +        spin_unlock(&tree->lock);
> +
> +        /*
> +         * XXX: zswap reclaim does not work with cgroups yet. Without a
> +         * cgroup-aware entry LRU, we will push out entries system-wide based on
> +         * local cgroup limits.
> +         */
> +        objcg = get_obj_cgroup_from_folio(folio);
> +        if (objcg && !obj_cgroup_may_zswap(objcg))
> +                goto put_objcg;
> +
> +        /* reclaim space if needed */
> +        if (zswap_is_full()) {
> +                zswap_pool_limit_hit++;
> +                zswap_pool_reached_full = true;
> +                goto shrink;
> +        }
> +
> +        if (zswap_pool_reached_full) {
> +                if (!zswap_can_accept())
> +                        goto shrink;
> +                else
> +                        zswap_pool_reached_full = false;
> +        }
> +
> +        pool = zswap_pool_current_get();
> +        if (!pool)
> +                goto put_objcg;
> +
> +        /*
> +         * Store each page of the folio as a separate entry. If we fail to store
> +         * a page, unwind by removing all the previous pages we stored.
> +         */
> +        for (i = 0; i < nr_pages; i++) {
> +                if (!zswap_store_page(folio, i, objcg, pool)) {
> +                        spin_lock(&tree->lock);
> +                        for (i--; i >= 0; i--) {
> +                                entry = zswap_rb_search(&tree->rbroot, offset + i);
> +                                if (entry)
> +                                        zswap_invalidate_entry(tree, entry);
> +                        }
> +                        spin_unlock(&tree->lock);
> +                        goto put_pool;
> +                }
> +        }
> +
> +        ret = true;
> +put_pool:
> +        zswap_pool_put(pool);
> +put_objcg:
> +        if (objcg)
> +                obj_cgroup_put(objcg);
> +        return ret;
>  shrink:
>          pool = zswap_pool_last_get();
>          if (pool && !queue_work(shrink_wq, &pool->shrink_work))
>                  zswap_pool_put(pool);
> -        goto reject;
> +        goto put_objcg;
>  }
>
>  bool zswap_load(struct folio *folio)
>
> base-commit: 158978945f3173b8c1a88f8c5684a629736a57ac
> --
> 2.25.1
>
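
P.S. For anyone who wants the control flow without wading through the
diff context, here is a condensed, self-contained userspace sketch of
the store/unwind pattern the patch introduces. The names here
(store_page(), invalidate_page(), NR_PAGES) are illustrative stand-ins
for the kernel helpers, not the real API:

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 4        /* pretend this folio has 4 pages */

/* Stand-in for zswap_store_page(); fails on page 2 to show the unwind. */
static bool store_page(long i)
{
        printf("store page %ld\n", i);
        return i != 2;
}

/* Stand-in for invalidating a previously stored page's entry. */
static void invalidate_page(long i)
{
        printf("invalidate page %ld\n", i);
}

/*
 * Mirrors the new zswap_store() loop: store each page of the folio as
 * its own entry; on any failure, walk back and invalidate everything
 * stored so far, then report failure for the whole folio.
 */
static bool store_folio(void)
{
        long i;

        for (i = 0; i < NR_PAGES; i++) {
                if (!store_page(i)) {
                        for (i--; i >= 0; i--)
                                invalidate_page(i);
                        return false;
                }
        }
        return true;
}

int main(void)
{
        printf("stored whole folio: %s\n", store_folio() ? "yes" : "no");
        return 0;
}

The unwind loop reuses the same index variable, so it visits exactly
the pages stored before the failure; in the patch itself that walk
happens under tree->lock via zswap_rb_search()/zswap_invalidate_entry().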