Hi Sergey, On Fri, Jan 31, 2025 at 5:07 PM Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx> wrote: > > Similarly to per-entry spin-lock per-CPU compression streams > also have a number of shortcoming. > > First, per-CPU stream access has to be done from a non-preemptible > (atomic) section, which imposes the same atomicity requirements on > compression backends as entry spin-lock do and makes it impossible > to use algorithms that can schedule/wait/sleep during compression > and decompression. > > Second, per-CPU streams noticeably increase memory usage (actually > more like wastage) of secondary compression streams. The problem > is that secondary compression streams are allocated per-CPU, just > like the primary streams are. Yet we never use more that one > secondary stream at a time, because recompression is a single > threaded action. Which means that remaining num_online_cpu() - 1 > streams are allocated for nothing, and this is per-priority list > (we can have several secondary compression algorithms). Depending > on the algorithm this may lead to a significant memory wastage, in > addition each stream also carries a workmem buffer (2 physical > pages). > > Instead of per-CPU streams, maintain a list of idle compression > streams and allocate new streams on-demand (something that we > used to do many years ago). So that zram read() and write() become > non-atomic and ease requirements on the compression algorithm > implementation. This also means that we now should have only one > secondary stream per-priority list. > > Signed-off-by: Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx> > --- > drivers/block/zram/zcomp.c | 164 +++++++++++++++++++--------------- > drivers/block/zram/zcomp.h | 17 ++-- > drivers/block/zram/zram_drv.c | 29 +++--- > include/linux/cpuhotplug.h | 1 - > 4 files changed, 109 insertions(+), 102 deletions(-) > > diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c > index bb514403e305..982c769d5831 100644 > --- a/drivers/block/zram/zcomp.c > +++ b/drivers/block/zram/zcomp.c > @@ -6,7 +6,7 @@ > #include <linux/slab.h> > #include <linux/wait.h> > #include <linux/sched.h> > -#include <linux/cpu.h> > +#include <linux/cpumask.h> > #include <linux/crypto.h> > #include <linux/vmalloc.h> > > @@ -43,31 +43,40 @@ static const struct zcomp_ops *backends[] = { > NULL > }; > > -static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) > +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *strm) > { > - comp->ops->destroy_ctx(&zstrm->ctx); > - vfree(zstrm->buffer); > - zstrm->buffer = NULL; > + comp->ops->destroy_ctx(&strm->ctx); > + vfree(strm->buffer); > + kfree(strm); > } > > -static int zcomp_strm_init(struct zcomp *comp, struct zcomp_strm *zstrm) > +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) > { > + struct zcomp_strm *strm; > int ret; > > - ret = comp->ops->create_ctx(comp->params, &zstrm->ctx); > - if (ret) > - return ret; > + strm = kzalloc(sizeof(*strm), GFP_KERNEL); > + if (!strm) > + return NULL; > + > + INIT_LIST_HEAD(&strm->entry); > + > + ret = comp->ops->create_ctx(comp->params, &strm->ctx); > + if (ret) { > + kfree(strm); > + return NULL; > + } > > /* > - * allocate 2 pages. 1 for compressed data, plus 1 extra for the > - * case when compressed size is larger than the original one > + * allocate 2 pages. 1 for compressed data, plus 1 extra in case if > + * compressed data is larger than the original one. 
> */ > - zstrm->buffer = vzalloc(2 * PAGE_SIZE); > - if (!zstrm->buffer) { > - zcomp_strm_free(comp, zstrm); > - return -ENOMEM; > + strm->buffer = vzalloc(2 * PAGE_SIZE); > + if (!strm->buffer) { > + zcomp_strm_free(comp, strm); > + return NULL; > } > - return 0; > + return strm; > } > > static const struct zcomp_ops *lookup_backend_ops(const char *comp) > @@ -109,13 +118,59 @@ ssize_t zcomp_available_show(const char *comp, char *buf) > > struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) > { > - local_lock(&comp->stream->lock); > - return this_cpu_ptr(comp->stream); > + struct zcomp_strm *strm; > + > + might_sleep(); > + > + while (1) { > + spin_lock(&comp->strm_lock); > + if (!list_empty(&comp->idle_strm)) { > + strm = list_first_entry(&comp->idle_strm, > + struct zcomp_strm, > + entry); > + list_del(&strm->entry); > + spin_unlock(&comp->strm_lock); > + return strm; > + } > + > + /* cannot allocate new stream, wait for an idle one */ > + if (comp->avail_strm >= num_online_cpus()) { > + spin_unlock(&comp->strm_lock); > + wait_event(comp->strm_wait, > + !list_empty(&comp->idle_strm)); > + continue; > + } > + > + /* allocate new stream */ > + comp->avail_strm++; > + spin_unlock(&comp->strm_lock); > + > + strm = zcomp_strm_alloc(comp); > + if (strm) > + break; > + > + spin_lock(&comp->strm_lock); > + comp->avail_strm--; > + spin_unlock(&comp->strm_lock); > + wait_event(comp->strm_wait, !list_empty(&comp->idle_strm)); > + } > + > + return strm; > } > > -void zcomp_stream_put(struct zcomp *comp) > +void zcomp_stream_put(struct zcomp *comp, struct zcomp_strm *strm) > { > - local_unlock(&comp->stream->lock); > + spin_lock(&comp->strm_lock); > + if (comp->avail_strm <= num_online_cpus()) { > + list_add(&strm->entry, &comp->idle_strm); > + spin_unlock(&comp->strm_lock); > + wake_up(&comp->strm_wait); > + return; > + } > + > + comp->avail_strm--; > + spin_unlock(&comp->strm_lock); > + zcomp_strm_free(comp, strm); > } > > int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, > @@ -148,61 +203,19 @@ int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm, > return comp->ops->decompress(comp->params, &zstrm->ctx, &req); > } > > -int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) > -{ > - struct zcomp *comp = hlist_entry(node, struct zcomp, node); > - struct zcomp_strm *zstrm; > - int ret; > - > - zstrm = per_cpu_ptr(comp->stream, cpu); > - local_lock_init(&zstrm->lock); > - > - ret = zcomp_strm_init(comp, zstrm); > - if (ret) > - pr_err("Can't allocate a compression stream\n"); > - return ret; > -} > - > -int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) > -{ > - struct zcomp *comp = hlist_entry(node, struct zcomp, node); > - struct zcomp_strm *zstrm; > - > - zstrm = per_cpu_ptr(comp->stream, cpu); > - zcomp_strm_free(comp, zstrm); > - return 0; > -} > - > -static int zcomp_init(struct zcomp *comp, struct zcomp_params *params) > -{ > - int ret; > - > - comp->stream = alloc_percpu(struct zcomp_strm); > - if (!comp->stream) > - return -ENOMEM; > - > - comp->params = params; > - ret = comp->ops->setup_params(comp->params); > - if (ret) > - goto cleanup; > - > - ret = cpuhp_state_add_instance(CPUHP_ZCOMP_PREPARE, &comp->node); > - if (ret < 0) > - goto cleanup; > - > - return 0; > - > -cleanup: > - comp->ops->release_params(comp->params); > - free_percpu(comp->stream); > - return ret; > -} > - > void zcomp_destroy(struct zcomp *comp) > { > - cpuhp_state_remove_instance(CPUHP_ZCOMP_PREPARE, &comp->node); > + struct zcomp_strm *strm; > + > + while 
(!list_empty(&comp->idle_strm)) { > + strm = list_first_entry(&comp->idle_strm, > + struct zcomp_strm, > + entry); > + list_del(&strm->entry); > + zcomp_strm_free(comp, strm); > + } > + > comp->ops->release_params(comp->params); > - free_percpu(comp->stream); > kfree(comp); > } > > @@ -229,7 +242,12 @@ struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params) > return ERR_PTR(-EINVAL); > } > > - error = zcomp_init(comp, params); > + INIT_LIST_HEAD(&comp->idle_strm); > + init_waitqueue_head(&comp->strm_wait); > + spin_lock_init(&comp->strm_lock); > + > + comp->params = params; > + error = comp->ops->setup_params(comp->params); > if (error) { > kfree(comp); > return ERR_PTR(error); > diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h > index ad5762813842..62330829db3f 100644 > --- a/drivers/block/zram/zcomp.h > +++ b/drivers/block/zram/zcomp.h > @@ -3,10 +3,10 @@ > #ifndef _ZCOMP_H_ > #define _ZCOMP_H_ > > -#include <linux/local_lock.h> > - > #define ZCOMP_PARAM_NO_LEVEL INT_MIN > > +#include <linux/wait.h> > + > /* > * Immutable driver (backend) parameters. The driver may attach private > * data to it (e.g. driver representation of the dictionary, etc.). > @@ -31,7 +31,7 @@ struct zcomp_ctx { > }; > > struct zcomp_strm { > - local_lock_t lock; > + struct list_head entry; > /* compression buffer */ > void *buffer; > struct zcomp_ctx ctx; > @@ -60,16 +60,15 @@ struct zcomp_ops { > const char *name; > }; > > -/* dynamic per-device compression frontend */ > struct zcomp { > - struct zcomp_strm __percpu *stream; > + struct list_head idle_strm; > + spinlock_t strm_lock; > + u32 avail_strm; > + wait_queue_head_t strm_wait; > const struct zcomp_ops *ops; > struct zcomp_params *params; > - struct hlist_node node; > }; > > -int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node); > -int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); > ssize_t zcomp_available_show(const char *comp, char *buf); > bool zcomp_available_algorithm(const char *comp); > > @@ -77,7 +76,7 @@ struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params); > void zcomp_destroy(struct zcomp *comp); > > struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); > -void zcomp_stream_put(struct zcomp *comp); > +void zcomp_stream_put(struct zcomp *comp, struct zcomp_strm *strm); > > int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, > const void *src, unsigned int *dst_len); > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c > index 1c2df2341704..8d5974ea8ff8 100644 > --- a/drivers/block/zram/zram_drv.c > +++ b/drivers/block/zram/zram_drv.c > @@ -31,7 +31,6 @@ > #include <linux/idr.h> > #include <linux/sysfs.h> > #include <linux/debugfs.h> > -#include <linux/cpuhotplug.h> > #include <linux/part_stat.h> > #include <linux/kernel_read_file.h> > > @@ -1601,7 +1600,7 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index) > ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); > kunmap_local(dst); > zs_unmap_object(zram->mem_pool, handle); > - zcomp_stream_put(zram->comps[prio]); > + zcomp_stream_put(zram->comps[prio], zstrm); > > return ret; > } > @@ -1762,14 +1761,14 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) > kunmap_local(mem); > > if (unlikely(ret)) { > - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); > + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm); > pr_err("Compression failed! 
err=%d\n", ret); > zs_free(zram->mem_pool, handle); > return ret; > } > > if (comp_len >= huge_class_size) { > - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); > + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm); > return write_incompressible_page(zram, page, index); > } > > @@ -1793,7 +1792,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) > __GFP_HIGHMEM | > __GFP_MOVABLE); > if (IS_ERR_VALUE(handle)) { > - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); > + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm); > atomic64_inc(&zram->stats.writestall); > handle = zs_malloc(zram->mem_pool, comp_len, > GFP_NOIO | __GFP_HIGHMEM | > @@ -1805,7 +1804,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) > } > > if (!zram_can_store_page(zram)) { > - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); > + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm); > zs_free(zram->mem_pool, handle); > return -ENOMEM; > } > @@ -1813,7 +1812,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) > dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); > > memcpy(dst, zstrm->buffer, comp_len); > - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); > + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm); > zs_unmap_object(zram->mem_pool, handle); > > zram_slot_write_lock(zram, index); > @@ -1972,7 +1971,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, > kunmap_local(src); > > if (ret) { > - zcomp_stream_put(zram->comps[prio]); > + zcomp_stream_put(zram->comps[prio], zstrm); > return ret; > } > > @@ -1982,7 +1981,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, > /* Continue until we make progress */ > if (class_index_new >= class_index_old || > (threshold && comp_len_new >= threshold)) { > - zcomp_stream_put(zram->comps[prio]); > + zcomp_stream_put(zram->comps[prio], zstrm); > continue; > } > > @@ -2040,13 +2039,13 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, > __GFP_HIGHMEM | > __GFP_MOVABLE); > if (IS_ERR_VALUE(handle_new)) { > - zcomp_stream_put(zram->comps[prio]); > + zcomp_stream_put(zram->comps[prio], zstrm); > return PTR_ERR((void *)handle_new); > } > > dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO); > memcpy(dst, zstrm->buffer, comp_len_new); > - zcomp_stream_put(zram->comps[prio]); > + zcomp_stream_put(zram->comps[prio], zstrm); > > zs_unmap_object(zram->mem_pool, handle_new); > > @@ -2794,7 +2793,6 @@ static void destroy_devices(void) > zram_debugfs_destroy(); > idr_destroy(&zram_index_idr); > unregister_blkdev(zram_major, "zram"); > - cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); > } > > static int __init zram_init(void) > @@ -2804,15 +2802,9 @@ static int __init zram_init(void) > > BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); > > - ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", > - zcomp_cpu_up_prepare, zcomp_cpu_dead); > - if (ret < 0) > - return ret; > - > ret = class_register(&zram_control_class); > if (ret) { > pr_err("Unable to register zram-control class\n"); > - cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); > return ret; > } > > @@ -2821,7 +2813,6 @@ static int __init zram_init(void) > if (zram_major <= 0) { > pr_err("Unable to get major number\n"); > class_unregister(&zram_control_class); > - cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); > return -EBUSY; > } > > diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h > 
index 6cc5e484547c..092ace7db8ee 100644 > --- a/include/linux/cpuhotplug.h > +++ b/include/linux/cpuhotplug.h > @@ -119,7 +119,6 @@ enum cpuhp_state { > CPUHP_MM_ZS_PREPARE, > CPUHP_MM_ZSWP_POOL_PREPARE, > CPUHP_KVM_PPC_BOOK3S_PREPARE, > - CPUHP_ZCOMP_PREPARE, > CPUHP_TIMERS_PREPARE, > CPUHP_TMIGR_PREPARE, > CPUHP_MIPS_SOC_PREPARE, > -- > 2.48.1.362.g079036d154-goog

This seems like it will cause a huge performance regression on multi-core systems, and it becomes especially significant as the number of concurrent tasks increases.

Test: building the Linux kernel with ZRAM as SWAP (1G memcg):

Before:
+ /usr/bin/time make -s -j48
2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata 863304maxresident)k

After:
+ /usr/bin/time make -s -j48
2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata 863276maxresident)k

`perf lock contention -ab sleep 3` also indicates that the big spin lock in zcomp_stream_get/put sees significant contention:

 contended   total wait     max wait     avg wait         type   caller
    793357      28.71 s      2.66 ms     36.19 us     spinlock   zcomp_stream_get+0x37
    793170      28.60 s      2.65 ms     36.06 us     spinlock   zcomp_stream_put+0x1f
    444007      15.26 s      2.58 ms     34.37 us     spinlock   zcomp_stream_put+0x1f
    443960      15.21 s      2.68 ms     34.25 us     spinlock   zcomp_stream_get+0x37
      5516    152.50 ms      3.30 ms     27.65 us     spinlock   evict_folios+0x7e
      4523    137.47 ms      3.66 ms     30.39 us     spinlock   folio_lruvec_lock_irqsave+0xc3
      4253    108.93 ms      2.92 ms     25.61 us     spinlock   folio_lruvec_lock_irqsave+0xc3
     49294     71.73 ms     15.87 us      1.46 us     spinlock   list_lru_del+0x7c
      2327     51.35 ms      3.48 ms     22.07 us     spinlock   evict_folios+0x5c0
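
For what it's worth, here is a rough sketch of one possible direction, purely illustrative and not something the patch proposes: keep a small per-CPU cache of idle streams in front of the shared pool, so the common get/put path avoids comp->strm_lock entirely and only a cache miss falls back to the locked idle_strm list from the patch. The idle_cache field and the zcomp_cache_get()/zcomp_cache_put() helpers are made-up names for illustration.

/*
 * Illustrative sketch only, not a tested patch. Assumes the zcomp and
 * zcomp_strm structures from the patch above, plus a hypothetical
 * per-CPU field in struct zcomp:
 *
 *	struct zcomp_cache __percpu *idle_cache;
 */
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct zcomp_cache {
	struct zcomp_strm *strm;	/* one locally cached idle stream, or NULL */
};

/*
 * Fast path for get: try to grab an idle stream from the current CPU's
 * cache without touching comp->strm_lock. Returns NULL on a miss, in
 * which case the caller falls back to the locked idle_strm list.
 */
static struct zcomp_strm *zcomp_cache_get(struct zcomp *comp)
{
	struct zcomp_cache *cache = get_cpu_ptr(comp->idle_cache);
	struct zcomp_strm *strm = cache->strm;

	cache->strm = NULL;
	put_cpu_ptr(comp->idle_cache);
	return strm;
}

/*
 * Fast path for put: stash the stream in the local cache if the slot is
 * free. Returns false if the caller should instead return the stream to
 * the shared pool and wake up waiters, as the patch does today.
 */
static bool zcomp_cache_put(struct zcomp *comp, struct zcomp_strm *strm)
{
	struct zcomp_cache *cache = get_cpu_ptr(comp->idle_cache);
	bool cached = false;

	if (!cache->strm) {
		cache->strm = strm;
		cached = true;
	}
	put_cpu_ptr(comp->idle_cache);
	return cached;
}

This obviously gives back a little of the memory saving in exchange for keeping the global lock off the per-page hot path; I only mention it to illustrate where the cycles appear to be going in the profile above.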