Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API

On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> Introduce new API to map/unmap zsmalloc handle/object.  The key
> difference is that this API does not impose atomicity restrictions
> on its users, unlike zs_map_object() which returns with page-faults
> and preemption disabled - handle mapping API does not need a per-CPU
> vm-area because the users are required to provide an aux buffer for
> objects that span several physical pages.

I like the idea of supplying the buffer directly to zsmalloc, and zswap
already has per-CPU buffers allocated. This will help remove the special
case in zswap_decompress() that handles not being able to sleep while
the object is mapped.
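FWIW, my mental model for how the zswap load side would use this API is
roughly the sketch below (assuming either a direct zs_pool pointer or an
equivalent zpool wrapper, and reusing zswap's existing per-CPU
acomp_ctx->buffer as the aux buffer -- names from memory, treat them as
placeholders):

	struct zs_handle_mapping map = {
		.handle = entry->handle,
		.mode = ZS_MM_RO,
		/* pre-existing per-CPU buffer, large enough for any object */
		.local_copy = acomp_ctx->buffer,
	};

	if (zs_map_handle(pool, &map))
		return false;
	/* decompress from map.handle_mem, no atomicity constraints here */
	...
	zs_unmap_handle(pool, &map);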

That being said, I am not a big fan of the new API for several reasons:
- The interface seems complicated. Why do we need struct
zs_handle_mapping? Can't the user just pass an extra parameter to
zs_map_object()/zs_unmap_object() to supply the buffer, with the return
value being a pointer to the data within the buffer? (See the sketch
after this list.)

- This seems to require an additional buffer on the compress side. Right
now, zswap compresses the page into its own buffer, maps the handle,
and copies to it. Now the map operation will require an extra buffer.
I guess in the WO case the buffer is not needed and we can just pass
NULL?
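
For the first point, what I had in mind is roughly the following (just a
sketch of the shape, not a concrete proposal):

	void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			    enum zs_mapmode mm, void *local_copy);
	void zs_unmap_object(struct zs_pool *pool, unsigned long handle,
			     enum zs_mapmode mm, void *local_copy);

where the return value of zs_map_object() points at the object data,
either inside local_copy or inside the kmap'ed page. zs_unmap_object()
would need the buffer (and the mode) as well, so it can copy the data
back on writes without keeping any per-handle state around.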

Taking a step back, it actually seems to me that the mapping interface
may not be the best fit, at least from a zswap perspective. In both the
load and store paths we map, copy from/to the handle, then unmap; the
special casing here is essentially about the copy direction. Zram looks
fairly similar, but I didn't look too closely.

I wonder if the API should be store/load instead. You either pass a
buffer to be stored (equivalent to today's alloc + map + copy), or pass
a buffer to load into (equivalent to today's map + copy). What we really
need on the zswap side is zs_store() and zs_load(), not zs_map() with
different mapping types and an optional buffer for the case where we are
eventually going to store. I guess that's part of a larger overhaul, and
we'd need to update the other zpool allocators (or remove them; z3fold
removal should be coming soon).
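
In other words, something along these lines (signatures are completely
made up, just to illustrate):

	/* allocate an object and copy size bytes from src into it */
	unsigned long zs_store(struct zs_pool *pool, const void *src,
			       size_t size, gfp_t gfp);

	/* copy the object at handle into dst */
	void zs_load(struct zs_pool *pool, unsigned long handle, void *dst);

Whether the copy goes through a kmap of one page or two then becomes an
internal detail of zsmalloc, and no aux buffer or mapping mode leaks
into the caller.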

Anyway, this is mostly just me ranting: improving the interface to avoid
the atomicity requirement ends up making it even more complicated, when
it's really simple if you think about it in terms of what you actually
want to do (i.e. store and load).

> Keep zs_map_object/zs_unmap_object for the time being, as there are
> still users of it, but eventually old API will be removed.
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx>
> ---
>  include/linux/zsmalloc.h |  29 ++++++++
>  mm/zsmalloc.c            | 148 ++++++++++++++++++++++++++++-----------
>  2 files changed, 138 insertions(+), 39 deletions(-)
> 
> diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
> index a48cd0ffe57d..72d84537dd38 100644
> --- a/include/linux/zsmalloc.h
> +++ b/include/linux/zsmalloc.h
> @@ -58,4 +58,33 @@ unsigned long zs_compact(struct zs_pool *pool);
>  unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size);
>  
>  void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
> +
> +struct zs_handle_mapping {
> +	unsigned long handle;
> +	/* Points to start of the object data either within local_copy or
> +	 * within local_mapping. This is what callers should use to access
> +	 * or modify handle data.
> +	 */
> +	void *handle_mem;
> +
> +	enum zs_mapmode mode;
> +	union {
> +		/*
> +		 * Handle object data copied, because it spans across several
> +		 * (non-contiguous) physical pages. This pointer should be
> +		 * set by the zs_map_handle() caller beforehand and should
> +		 * never be accessed directly.
> +		 */
> +		void *local_copy;
> +		/*
> +		 * Handle object mapped directly. Should never be used
> +		 * directly.
> +		 */
> +		void *local_mapping;
> +	};
> +};
> +
> +int zs_map_handle(struct zs_pool *pool, struct zs_handle_mapping *map);
> +void zs_unmap_handle(struct zs_pool *pool, struct zs_handle_mapping *map);
> +
>  #endif
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index a5c1f9852072..281bba4a3277 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -1132,18 +1132,14 @@ static inline void __zs_cpu_down(struct mapping_area *area)
>  	area->vm_buf = NULL;
>  }
>  
> -static void *__zs_map_object(struct mapping_area *area,
> -			struct zpdesc *zpdescs[2], int off, int size)
> +static void zs_obj_copyin(void *buf, struct zpdesc *zpdesc, int off, int size)
>  {
> +	struct zpdesc *zpdescs[2];
>  	size_t sizes[2];
> -	char *buf = area->vm_buf;
> -
> -	/* disable page faults to match kmap_local_page() return conditions */
> -	pagefault_disable();
>  
> -	/* no read fastpath */
> -	if (area->vm_mm == ZS_MM_WO)
> -		goto out;
> +	zpdescs[0] = zpdesc;
> +	zpdescs[1] = get_next_zpdesc(zpdesc);
> +	BUG_ON(!zpdescs[1]);
>  
>  	sizes[0] = PAGE_SIZE - off;
>  	sizes[1] = size - sizes[0];
> @@ -1151,21 +1147,17 @@ static void *__zs_map_object(struct mapping_area *area,
>  	/* copy object to per-cpu buffer */
>  	memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]);
>  	memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]);
> -out:
> -	return area->vm_buf;
>  }
>  
> -static void __zs_unmap_object(struct mapping_area *area,
> -			struct zpdesc *zpdescs[2], int off, int size)
> +static void zs_obj_copyout(void *buf, struct zpdesc *zpdesc, int off, int size)
>  {
> +	struct zpdesc *zpdescs[2];
>  	size_t sizes[2];
> -	char *buf;
>  
> -	/* no write fastpath */
> -	if (area->vm_mm == ZS_MM_RO)
> -		goto out;
> +	zpdescs[0] = zpdesc;
> +	zpdescs[1] = get_next_zpdesc(zpdesc);
> +	BUG_ON(!zpdescs[1]);
>  
> -	buf = area->vm_buf;
>  	buf = buf + ZS_HANDLE_SIZE;
>  	size -= ZS_HANDLE_SIZE;
>  	off += ZS_HANDLE_SIZE;
> @@ -1176,10 +1168,6 @@ static void __zs_unmap_object(struct mapping_area *area,
>  	/* copy per-cpu buffer to object */
>  	memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]);
>  	memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]);
> -
> -out:
> -	/* enable page faults to match kunmap_local() return conditions */
> -	pagefault_enable();
>  }
>  
>  static int zs_cpu_prepare(unsigned int cpu)
> @@ -1260,6 +1248,8 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
>   * against nested mappings.
>   *
>   * This function returns with preemption and page faults disabled.
> + *
> + * NOTE: this function is deprecated and will be removed.
>   */
>  void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  			enum zs_mapmode mm)
> @@ -1268,10 +1258,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  	struct zpdesc *zpdesc;
>  	unsigned long obj, off;
>  	unsigned int obj_idx;
> -
>  	struct size_class *class;
>  	struct mapping_area *area;
> -	struct zpdesc *zpdescs[2];
>  	void *ret;
>  
>  	/*
> @@ -1309,12 +1297,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  		goto out;
>  	}
>  
> -	/* this object spans two pages */
> -	zpdescs[0] = zpdesc;
> -	zpdescs[1] = get_next_zpdesc(zpdesc);
> -	BUG_ON(!zpdescs[1]);
> +	ret = area->vm_buf;
> +	/* disable page faults to match kmap_local_page() return conditions */
> +	pagefault_disable();
> +	if (mm != ZS_MM_WO) {
> +		/* this object spans two pages */
> +		zs_obj_copyin(area->vm_buf, zpdesc, off, class->size);
> +	}
>  
> -	ret = __zs_map_object(area, zpdescs, off, class->size);
>  out:
>  	if (likely(!ZsHugePage(zspage)))
>  		ret += ZS_HANDLE_SIZE;
> @@ -1323,13 +1313,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  }
>  EXPORT_SYMBOL_GPL(zs_map_object);
>  
> +/* NOTE: this function is deprecated and will be removed. */
>  void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>  {
>  	struct zspage *zspage;
>  	struct zpdesc *zpdesc;
>  	unsigned long obj, off;
>  	unsigned int obj_idx;
> -
>  	struct size_class *class;
>  	struct mapping_area *area;
>  
> @@ -1340,23 +1330,103 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>  	off = offset_in_page(class->size * obj_idx);
>  
>  	area = this_cpu_ptr(&zs_map_area);
> -	if (off + class->size <= PAGE_SIZE)
> +	if (off + class->size <= PAGE_SIZE) {
>  		kunmap_local(area->vm_addr);
> -	else {
> -		struct zpdesc *zpdescs[2];
> +		goto out;
> +	}
>  
> -		zpdescs[0] = zpdesc;
> -		zpdescs[1] = get_next_zpdesc(zpdesc);
> -		BUG_ON(!zpdescs[1]);
> +	if (area->vm_mm != ZS_MM_RO)
> +		zs_obj_copyout(area->vm_buf, zpdesc, off, class->size);
> +	/* enable page faults to match kunmap_local() return conditions */
> +	pagefault_enable();
>  
> -		__zs_unmap_object(area, zpdescs, off, class->size);
> -	}
> +out:
>  	local_unlock(&zs_map_area.lock);
> -
>  	zspage_read_unlock(zspage);
>  }
>  EXPORT_SYMBOL_GPL(zs_unmap_object);
>  
> +void zs_unmap_handle(struct zs_pool *pool, struct zs_handle_mapping *map)
> +{
> +	struct zspage *zspage;
> +	struct zpdesc *zpdesc;
> +	unsigned long obj, off;
> +	unsigned int obj_idx;
> +	struct size_class *class;
> +
> +	obj = handle_to_obj(map->handle);
> +	obj_to_location(obj, &zpdesc, &obj_idx);
> +	zspage = get_zspage(zpdesc);
> +	class = zspage_class(pool, zspage);
> +	off = offset_in_page(class->size * obj_idx);
> +
> +	if (off + class->size <= PAGE_SIZE) {
> +		kunmap_local(map->local_mapping);
> +		goto out;
> +	}
> +
> +	if (map->mode != ZS_MM_RO)
> +		zs_obj_copyout(map->local_copy, zpdesc, off, class->size);
> +
> +out:
> +	zspage_read_unlock(zspage);
> +}
> +EXPORT_SYMBOL_GPL(zs_unmap_handle);
> +
> +int zs_map_handle(struct zs_pool *pool, struct zs_handle_mapping *map)
> +{
> +	struct zspage *zspage;
> +	struct zpdesc *zpdesc;
> +	unsigned long obj, off;
> +	unsigned int obj_idx;
> +	struct size_class *class;
> +
> +	WARN_ON(in_interrupt());
> +
> +	/* It guarantees it can get zspage from handle safely */
> +	pool_read_lock(pool);
> +	obj = handle_to_obj(map->handle);
> +	obj_to_location(obj, &zpdesc, &obj_idx);
> +	zspage = get_zspage(zpdesc);
> +
> +	/*
> +	 * migration cannot move any zpages in this zspage. Here, class->lock
> +	 * is too heavy since callers would take some time until they calls
> +	 * zs_unmap_object API so delegate the locking from class to zspage
> +	 * which is smaller granularity.
> +	 */
> +	zspage_read_lock(zspage);
> +	pool_read_unlock(pool);
> +
> +	class = zspage_class(pool, zspage);
> +	off = offset_in_page(class->size * obj_idx);
> +
> +	if (off + class->size <= PAGE_SIZE) {
> +		/* this object is contained entirely within a page */
> +		map->local_mapping = kmap_local_zpdesc(zpdesc);
> +		map->handle_mem = map->local_mapping + off;
> +		goto out;
> +	}
> +
> +	if (WARN_ON_ONCE(!map->local_copy)) {
> +		zspage_read_unlock(zspage);
> +		return -EINVAL;
> +	}
> +
> +	map->handle_mem = map->local_copy;
> +	if (map->mode != ZS_MM_WO) {
> +		/* this object spans two pages */
> +		zs_obj_copyin(map->local_copy, zpdesc, off, class->size);
> +	}
> +
> +out:
> +	if (likely(!ZsHugePage(zspage)))
> +		map->handle_mem += ZS_HANDLE_SIZE;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(zs_map_handle);
> +
>  /**
>   * zs_huge_class_size() - Returns the size (in bytes) of the first huge
>   *                        zsmalloc &size_class.
> -- 
> 2.48.1.262.g85cc9f2d1e-goog
> 



