Re: [PATCH v3 13/13] mm: zswap: Compress batching with Intel IAA in zswap_store() of large folios.

Johannes Weiner <hannes@xxxxxxxxxxx> · Thu, 7 Nov 2024 13:53:40 -0500

On Wed, Nov 06, 2024 at 11:21:05AM -0800, Kanchana P Sridhar wrote:
> +static void zswap_zpool_store_sub_batch(
> +	struct zswap_store_pipeline_state *zst)

There is a zswap_store_sub_batch() below, which does something
completely different. Naming is hard, but please invest a bit more
time into this to make this readable.

> +{
> +	u8 i;
> +
> +	for (i = 0; i < zst->nr_comp_pages; ++i) {
> +		struct zswap_store_sub_batch_page *sbp = &zst->sub_batch[i];
> +		struct zpool *zpool;
> +		unsigned long handle;
> +		char *buf;
> +		gfp_t gfp;
> +		int err;
> +
> +		/* Skip pages that had compress errors. */
> +		if (sbp->error)
> +			continue;
> +
> +		zpool = zst->pool->zpool;
> +		gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> +		if (zpool_malloc_support_movable(zpool))
> +			gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> +		err = zpool_malloc(zpool, zst->comp_dlens[i], gfp, &handle);
> +
> +		if (err) {
> +			if (err == -ENOSPC)
> +				zswap_reject_compress_poor++;
> +			else
> +				zswap_reject_alloc_fail++;
> +
> +			/*
> +			 * An error should be propagated to other pages of the
> +			 * same folio in the sub-batch, and zpool resources for
> +			 * those pages (in sub-batch order prior to this zpool
> +			 * error) should be de-allocated.
> +			 */
> +			zswap_store_propagate_errors(zst, sbp->batch_idx);
> +			continue;
> +		}
> +
> +		buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> +		memcpy(buf, zst->comp_dsts[i], zst->comp_dlens[i]);
> +		zpool_unmap_handle(zpool, handle);
> +
> +		sbp->entry->handle = handle;
> +		sbp->entry->length = zst->comp_dlens[i];
> +	}
> +}
> +
> +/*
> + * Returns true if the entry was successfully
> + * stored in the xarray, and false otherwise.
> + */
> +static bool zswap_store_entry(swp_entry_t page_swpentry,
> +			      struct zswap_entry *entry)
> +{
> +	struct zswap_entry *old = xa_store(swap_zswap_tree(page_swpentry),
> +					   swp_offset(page_swpentry),
> +					   entry, GFP_KERNEL);
> +	if (xa_is_err(old)) {
> +		int err = xa_err(old);
> +
> +		WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> +		zswap_reject_alloc_fail++;
> +		return false;
> +	}
> +
> +	/*
> +	 * We may have had an existing entry that became stale when
> +	 * the folio was redirtied and now the new version is being
> +	 * swapped out. Get rid of the old.
> +	 */
> +	if (old)
> +		zswap_entry_free(old);
> +
> +	return true;
> +}
> +
> +static void zswap_batch_compress_post_proc(
> +	struct zswap_store_pipeline_state *zst)
> +{
> +	int nr_objcg_pages = 0, nr_pages = 0;
> +	struct obj_cgroup *objcg = NULL;
> +	size_t compressed_bytes = 0;
> +	u8 i;
> +
> +	zswap_zpool_store_sub_batch(zst);
> +
> +	for (i = 0; i < zst->nr_comp_pages; ++i) {
> +		struct zswap_store_sub_batch_page *sbp = &zst->sub_batch[i];
> +
> +		if (sbp->error)
> +			continue;
> +
> +		if (!zswap_store_entry(sbp->swpentry, sbp->entry)) {
> +			zswap_store_propagate_errors(zst, sbp->batch_idx);
> +			continue;
> +		}
> +
> +		/*
> +		 * The entry is successfully compressed and stored in the tree,
> +		 * there is no further possibility of failure. Grab refs to the
> +		 * pool and objcg. These refs will be dropped by
> +		 * zswap_entry_free() when the entry is removed from the tree.
> +		 */
> +		zswap_pool_get(zst->pool);
> +		if (sbp->objcg)
> +			obj_cgroup_get(sbp->objcg);
> +
> +		/*
> +		 * We finish initializing the entry while it's already in xarray.
> +		 * This is safe because:
> +		 *
> +		 * 1. Concurrent stores and invalidations are excluded by folio
> +		 *    lock.
> +		 *
> +		 * 2. Writeback is excluded by the entry not being on the LRU yet.
> +		 *    The publishing order matters to prevent writeback from seeing
> +		 *    an incoherent entry.
> +		 */
> +		sbp->entry->pool = zst->pool;
> +		sbp->entry->swpentry = sbp->swpentry;
> +		sbp->entry->objcg = sbp->objcg;
> +		sbp->entry->referenced = true;
> +		if (sbp->entry->length) {
> +			INIT_LIST_HEAD(&sbp->entry->lru);
> +			zswap_lru_add(&zswap_list_lru, sbp->entry);
> +		}
> +
> +		if (!objcg && sbp->objcg) {
> +			objcg = sbp->objcg;
> +		} else if (objcg && sbp->objcg && (objcg != sbp->objcg)) {
> +			obj_cgroup_charge_zswap(objcg, compressed_bytes);
> +			count_objcg_events(objcg, ZSWPOUT, nr_objcg_pages);
> +			compressed_bytes = 0;
> +			nr_objcg_pages = 0;
> +			objcg = sbp->objcg;
> +		}
> +
> +		if (sbp->objcg) {
> +			compressed_bytes += sbp->entry->length;
> +			++nr_objcg_pages;
> +		}
> +
> +		++nr_pages;
> +	} /* for sub-batch pages. */
> +
> +	if (objcg) {
> +		obj_cgroup_charge_zswap(objcg, compressed_bytes);
> +		count_objcg_events(objcg, ZSWPOUT, nr_objcg_pages);
> +	}
> +
> +	atomic_long_add(nr_pages, &zswap_stored_pages);
> +	count_vm_events(ZSWPOUT, nr_pages);
> +}
> +
> +static void zswap_store_sub_batch(struct zswap_store_pipeline_state *zst)
> +{
> +	u8 i;
> +
> +	for (i = 0; i < zst->nr_comp_pages; ++i) {
> +		zst->comp_dsts[i] = zst->acomp_ctx->buffers[i];
> +		zst->comp_dlens[i] = PAGE_SIZE;
> +	} /* for sub-batch pages. */
> +
> +	/*
> +	 * Batch compress sub-batch "N". If IAA is the compressor, the
> +	 * hardware will compress multiple pages in parallel.
> +	 */
> +	zswap_compress_batch(zst);
> +
> +	zswap_batch_compress_post_proc(zst);

The control flow here is a mess. Keep loops over the same batch at the
same function level. IOW, pull the nr_comp_pages loop out of
zswap_batch_compress_post_proc() and call the function from the loop.

Also give it a more descriptive name. If that's hard to do, then
you're probably doing too many different things in it. Create
functions for a specific purpose, don't carve up sequences at
arbitrary points.

My impression after trying to read this is that the existing
zswap_store() sequence could be a subset of the batched store, where
you can reuse most code to get the pool, charge the cgroup, allocate
entries, store entries, bump the stats etc. for both cases. Alas, your
naming choices make it a bit difficult to be sure.

Please explore this direction. Don't worry about the CONFIG symbol for
now, we can still look at this later.

Right now, it's basically

	if (special case)
		lots of duplicative code in slightly different order
	regular store sequence

and that isn't going to be maintainable.

Look for a high-level sequence that makes sense for both cases. E.g.:

	if (!zswap_enabled)
		goto check_old;

	get objcg

	check limits

	allocate memcg list lru

	for each batch {
		for each entry {
			allocate entry
			acquire objcg ref
			acquire pool ref
		}
		compress
		for each entry {
			store in tree
			add to lru
			bump stats and counters
		}
	}

	put objcg

	return true;

check_error:
	...

and then set up the two loops such that they also makes sense when the
folio is just a single page.