On Wed, Nov 23, 2022 at 04:25:00PM +0800, Chen Wandun wrote:
> 
> On 2022/11/22 3:00, Alexey Romanov wrote:
> > zram maps each page (struct page) to a zsmalloc object that
> > contains a compressed buffer of that page's data. This fact
> > creates data redundancy: for example, two identical pages
> > will be stored twice, in compressed form, in the zsmalloc
> > allocator.
> >
> > This patch adds a mechanism that scans the zram_table_entry
> > array and frees all identical objects in the zsmalloc
> > allocator, leaving only one. All zram_table_entry elements
> > that referenced the freed objects now refer to the single
> > remaining object.
> >
> > To implement this mechanism, we sequentially scan the
> > zram_table_entry array, compute a hash of the contents of each
> > compressed page (zsmalloc object), and enter the object's index
> > into a hash table (hlist_head). If the hash matches an existing
> > entry, and a full memcmp() confirms the objects are identical,
> > we free the duplicate zsmalloc object (zs_free) and update the
> > reference rbtree.
> >
> > Also, we can't simply call zs_free() during zram_free_page():
> > before freeing, we have to make sure that no one else still
> > refers to the zsmalloc object.
> >
> > To implement the mechanism for merging identical compressed
> > pages (zsmalloc objects), an rbtree is needed.
> >
> > The tree node key is a reference to the zsmalloc object
> > (handle), and the value is the number of references to this
> > object (a reference counter). This is necessary for data
> > consistency, so that we do not zs_free() an object that is
> > still referenced by some element of the zram_table_entry array.
> >
> > Signed-off-by: Alexey Romanov <avromanov@xxxxxxxxxxxxxx>
> > ---
> >  drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
> >  drivers/block/zram/zram_drv.h |   6 +
> >  2 files changed, 283 insertions(+), 1 deletion(-)
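As an aside, for anyone following the thread: the merge pass is easier to
reason about in isolation. Below is a minimal standalone userspace model
of it (plain C; the djb2-style hash and the O(n^2) walk merely stand in
for xxhash() and the hash table, and all names here are illustrative
only, not the kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct obj {
        char data[8];
        unsigned long cnt;              /* models zram_rbtree_node.cnt */
};

/* djb2; stands in for zram_calc_hash()/xxhash() */
static unsigned long hash_buf(const void *p, size_t len)
{
        const unsigned char *s = p;
        unsigned long h = 5381;

        while (len--)
                h = h * 33 + *s++;
        return h;
}

int main(void)
{
        struct obj *slots[4];
        size_t i, j;

        /* slot0/slot2 and slot1/slot3 get identical "compressed" data */
        for (i = 0; i < 4; i++) {
                slots[i] = calloc(1, sizeof(*slots[i]));
                slots[i]->cnt = 1;
                strcpy(slots[i]->data, (i % 2) ? "BBB" : "AAA");
        }

        /* hash first, then confirm with memcmp() before merging */
        for (i = 0; i < 4; i++) {
                for (j = 0; j < i; j++) {
                        if (slots[i] == slots[j])
                                continue;
                        if (hash_buf(slots[i]->data, 8) != hash_buf(slots[j]->data, 8))
                                continue;
                        if (memcmp(slots[i]->data, slots[j]->data, 8))
                                continue;       /* hash collision: no merge */
                        free(slots[i]);         /* models zs_free() of the duplicate */
                        slots[i] = slots[j];    /* models zram_set_handle() */
                        slots[j]->cnt++;
                        break;
                }
        }

        for (i = 0; i < 4; i++)
                printf("slot%zu -> %p (cnt=%lu)\n", i,
                       (void *)slots[i], slots[i]->cnt);
        return 0;
}

As in the patch, a hash match alone never triggers a merge; the full
memcmp() is what protects against hash collisions.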
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index e290d6d97047..716c2f72805e 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -33,12 +33,15 @@
> >  #include <linux/debugfs.h>
> >  #include <linux/cpuhotplug.h>
> >  #include <linux/part_stat.h>
> > +#include <linux/hashtable.h>
> > +#include <linux/xxhash.h>
> >  
> >  #include "zram_drv.h"
> >  
> >  static DEFINE_IDR(zram_index_idr);
> >  /* idr index must be protected */
> >  static DEFINE_MUTEX(zram_index_mutex);
> > +static DEFINE_MUTEX(zram_rbtree_mutex);
> >  
> >  static int zram_major;
> >  static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
> > @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
> >  static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
> >  			  u32 index, int offset, struct bio *bio);
> >  
> > +struct zram_rbtree_node {
> > +	struct rb_node node;
> > +	unsigned long key;
> > +	unsigned long cnt;
> > +};
> > +
> > +struct zram_hash_node {
> > +	unsigned long index;
> > +	struct hlist_node next;
> > +};
> > +
> >  static int zram_slot_trylock(struct zram *zram, u32 index)
> >  {
> > @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
> >  #endif
> >  static DEVICE_ATTR_RO(debug_stat);
> >  
> > +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
> > +{
> > +	struct rb_node **new = &(root->rb_node), *parent = NULL;
> > +	struct zram_rbtree_node *this;
> > +
> > +	while (*new) {
> > +		this = rb_entry(*new, struct zram_rbtree_node, node);
> > +		parent = *new;
> > +		if (data->key < this->key)
> > +			new = &((*new)->rb_left);
> > +		else if (data->key > this->key)
> > +			new = &((*new)->rb_right);
> > +		else
> > +			return false;
> > +	}
> > +
> > +	rb_link_node(&data->node, parent, new);
> > +	rb_insert_color(&data->node, root);
> > +	return true;
> > +}
> > +
> > +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
> > +		unsigned long key)
> > +{
> > +	struct rb_node *node = root->rb_node;
> > +	struct zram_rbtree_node *data;
> > +
> > +	while (node) {
> > +		data = rb_entry(node, struct zram_rbtree_node, node);
> > +		if (key < data->key)
> > +			node = node->rb_left;
> > +		else if (key > data->key)
> > +			node = node->rb_right;
> > +		else
> > +			return data;
> > +	}
> > +
> > +	return NULL;
> > +}
> > +
> > +static unsigned long zram_calc_hash(void *src, size_t len)
> > +{
> > +	return xxhash(src, len, 0);
> > +}
> > +
> > +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
> > +		size_t htable_size, size_t index)
> > +{
> > +	struct zram_rbtree_node *rb_node;
> > +	struct zram_hash_node *node;
> > +	unsigned long handle, cur_handle;
> > +	size_t obj_size;
> > +	char *src, *buf;
> > +	unsigned long hash;
> > +	int ret = 0;
> > +
> > +	handle = zram_get_handle(zram, index);
> > +	if (!handle)
> > +		return ret;
> > +
> > +	obj_size = zram_get_obj_size(zram, index);
> > +	buf = kmalloc(obj_size, GFP_KERNEL);
> > +	if (!buf) {
> > +		pr_err("Failed to allocate zs_map_object buffer\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
> > +	memcpy(buf, src, obj_size);
> > +	zs_unmap_object(zram->mem_pool, handle);
> > +	hash = zram_calc_hash(buf, obj_size);
> > +
> > +	mutex_lock(&zram_rbtree_mutex);
> > +	hlist_for_each_entry(node, &htable[hash % htable_size], next) {
> > +		int cmp;
> > +
> > +		zram_slot_lock(zram, node->index);
> > +
> > +		/*
> > +		 * Page may change as the hash table is being formed,
> > +		 * so the checks below are necessary.
> > +		 */
> > +		cur_handle = zram_get_handle(zram, node->index);
> > +		if (handle == cur_handle ||
> > +		    obj_size != zram_get_obj_size(zram, node->index)) {
> > +			zram_slot_unlock(zram, node->index);
> > +			continue;
> > +		}
> > +
> > +		src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
> > +		cmp = memcmp(buf, src, obj_size);
> > +		zs_unmap_object(zram->mem_pool, cur_handle);
> > +
> > +		if (!cmp) {
> > +			rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
> > +
> > +			/*
> > +			 * This check is necessary so that we do not zs_free()
> > +			 * an object that someone else still refers to. This
> > +			 * situation is possible with repeated calls to
> > +			 * zram_do_scan(). For example:
> > +			 *
> > +			 * [slot0] [slot1] [slot2] [slot3] [slot4]
> > +			 * [obj0]  [obj1]  [obj2]  [obj3]  [obj4]
> > +			 *
> > +			 * Let's imagine that obj2 and obj3 are equal, and we
> > +			 * called the zram_do_scan() function:
> > +			 *
> > +			 * [slot0] [slot1] [slot2] [slot3] [slot4]
> > +			 * [obj0]  [obj1]  [obj2]  [obj2]  [obj4]
> > +			 *
> > +			 * Now slot2 and slot3 refer to the obj2 zsmalloc
> > +			 * object. Time passes, and now slot0 refers to obj0_n,
> > +			 * which is equal to obj2:
> > +			 *
> > +			 * [slot0]  [slot1] [slot2] [slot3] [slot4]
> > +			 * [obj0_n] [obj1]  [obj2]  [obj2]  [obj4]
> > +			 *
> > +			 * Now we call the zram_do_scan() function again. We
> > +			 * get to slot2 and see that the obj2 and obj0_n hashes
> > +			 * are the same. We try to zs_free(obj2), but slot3
> > +			 * still refers to it.
> > +			 *
> > +			 * This is not correct!
> > +			 */
> 
> In this case, it seems like it can't merge all identical objects, is
> that right?

Yes.

> How about letting slot2 point to obj0_n and decreasing the rb_node
> ref of slot2/slot3 in the first loop, then letting slot3 point to
> obj0_n and decreasing the ref in the next loop, so that obj2 can be
> freed?

Sure. I will address this in the next series.
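Concretely, the v2 logic could look something like this rough standalone
userspace sketch of the repoint-and-put idea (untested; names and
structure are illustrative only, not the final kernel code):

#include <stdio.h>
#include <stdlib.h>

struct obj {
        unsigned long cnt;      /* models zram_rbtree_node.cnt */
};

static void put_obj(struct obj *o)
{
        if (--o->cnt == 0)
                free(o);        /* models rb_erase() + zs_free() */
}

static void repoint(struct obj **slot, struct obj *newer)
{
        struct obj *old = *slot;

        newer->cnt++;           /* take the new reference first... */
        *slot = newer;
        put_obj(old);           /* ...then drop the old one */
}

int main(void)
{
        struct obj *obj2 = malloc(sizeof(*obj2));
        struct obj *obj0_n = malloc(sizeof(*obj0_n));
        struct obj *slot2, *slot3;

        obj2->cnt = 2;          /* slot2 and slot3 share obj2 */
        obj0_n->cnt = 1;        /* slot0 refers to obj0_n */
        slot2 = obj2;
        slot3 = obj2;

        repoint(&slot2, obj0_n);        /* first loop iteration */
        repoint(&slot3, obj0_n);        /* next iteration frees obj2 */

        /* 3 references: slot0, slot2, slot3 */
        printf("obj0_n->cnt = %lu\n", obj0_n->cnt);
        free(obj0_n);
        return 0;
}

This way each scan pass repoints one slot and drops one reference, so
still-shared duplicates converge on a single object instead of the
merge being skipped.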
> > +			if (unlikely(rb_node) && rb_node->cnt > 1) {
> > +				zram_slot_unlock(zram, node->index);
> > +				continue;
> > +			}
> > +
> > +			zram_set_handle(zram, index, cur_handle);
> > +			zs_free(zram->mem_pool, handle);
> > +
> > +			rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
> > +
> > +			if (!rb_node) {
> > +				rb_node = kzalloc(sizeof(struct zram_rbtree_node),
> > +						GFP_KERNEL);
> > +				if (!rb_node) {
> > +					pr_err("Failed to allocate rb_node\n");
> > +					ret = -ENOMEM;
> > +					zram_slot_unlock(zram, node->index);
> > +					mutex_unlock(&zram_rbtree_mutex);
> > +					goto merged_or_err;
> > +				}
> > +
> > +				rb_node->key = cur_handle;
> > +				/* Two slots refer to the object with key cur_handle */
> > +				rb_node->cnt = 2;
> > +				zram_rbtree_insert(&zram->sph_rbtree, rb_node);
> > +			} else {
> > +				rb_node->cnt++;
> > +			}
> > +
> > +			atomic64_sub(obj_size, &zram->stats.compr_data_size);
> > +			zram_set_flag(zram, index, ZRAM_MERGED);
> > +			zram_set_flag(zram, node->index, ZRAM_MERGED);
> > +
> > +			zram_slot_unlock(zram, node->index);
> > +			mutex_unlock(&zram_rbtree_mutex);
> > +			goto merged_or_err;
> > +		}
> > +
> > +		zram_slot_unlock(zram, node->index);
> > +	}
> > +
> > +	mutex_unlock(&zram_rbtree_mutex);
> > +
> > +	node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
> > +	if (!node) {
> > +		ret = -ENOMEM;
> > +		goto merged_or_err;
> > +	}
> > +
> > +	node->index = index;
> > +	hlist_add_head(&node->next, &htable[hash % htable_size]);
> > +
> > +merged_or_err:
> > +	kfree(buf);
> > +	return ret;
> > +}
> > +
> > +static void zram_free_htable_entries(struct hlist_head *htable,
> > +		size_t htable_size)
> > +{
> > +	struct hlist_node *n;
> > +	struct zram_hash_node *node;
> > +	size_t i;
> > +
> > +	for (i = 0; i < htable_size; i++) {
> > +		hlist_for_each_entry_safe(node, n, &htable[i], next) {
> > +			hlist_del(&node->next);
> > +			kfree(node);
> > +		}
> > +	}
> > +}
> > +
> > +static int zram_do_scan(struct zram *zram)
> > +{
> > +	size_t num_pages = zram->disksize >> PAGE_SHIFT;
> > +	size_t htable_size = num_pages;
> > +	size_t index, i;
> > +	struct hlist_head *htable;
> > +	int ret = 0;
> > +
> > +	htable = vzalloc(htable_size * sizeof(struct hlist_head));
> > +	if (!htable) {
> > +		pr_err("Failed to allocate hash table\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	for (i = 0; i < htable_size; i++)
> > +		INIT_HLIST_HEAD(&htable[i]);
> > +
> > +	for (index = 0; index < num_pages; index++) {
> > +		zram_slot_lock(zram, index);
> > +
> > +		if (!zram_allocated(zram, index)) {
> > +			zram_slot_unlock(zram, index);
> > +			continue;
> > +		}
> > +
> > +		if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> > +		    zram_test_flag(zram, index, ZRAM_WB) ||
> > +		    zram_test_flag(zram, index, ZRAM_SAME)) {
> > +			zram_slot_unlock(zram, index);
> > +			continue;
> > +		}
> > +
> > +		/* Ignore pages that have been recompressed */
> > +		if (zram_get_priority(zram, index) != 0) {
> > +			zram_slot_unlock(zram, index);
> > +			continue;
> > +		}
> > +
> > +		ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
> > +		zram_slot_unlock(zram, index);
> > +		if (ret != 0)
> > +			goto out;
> > +	}
> > +
> > +out:
> > +	zram_free_htable_entries(htable, htable_size);
> > +	vfree(htable);
> > +	return ret;
> > +}
> > +
> >  static void zram_meta_free(struct zram *zram, u64 disksize)
> >  {
> >  	size_t num_pages = disksize >> PAGE_SHIFT;
> > @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
> >  static void zram_free_page(struct zram *zram, size_t index)
> >  {
> >  	unsigned long handle;
> > +	struct zram_rbtree_node *node;
> >  
> >  #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> >  	zram->table[index].ac_time = 0;
> > @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
> >  	if (!handle)
> >  		return;
> >  
> > -	zs_free(zram->mem_pool, handle);
> > +	if (zram_test_flag(zram, index, ZRAM_MERGED)) {
> > +		zram_clear_flag(zram, index, ZRAM_MERGED);
> > +		mutex_lock(&zram_rbtree_mutex);
> > +
> > +		node = zram_rbtree_search(&zram->sph_rbtree, handle);
> > +		BUG_ON(!node);
> > +
> > +		node->cnt--;
> > +		if (node->cnt == 0) {
> > +			rb_erase(&node->node, &zram->sph_rbtree);
> > +			mutex_unlock(&zram_rbtree_mutex);
> > +
> > +			zs_free(zram->mem_pool, handle);
> > +			kfree(node);
> > +		} else {
> > +			mutex_unlock(&zram_rbtree_mutex);
> > +		}
> > +	} else {
> > +		zs_free(zram->mem_pool, handle);
> > +	}
> >  
> >  	atomic64_sub(zram_get_obj_size(zram, index),
> >  			&zram->stats.compr_data_size);
> > @@ -2421,6 +2695,8 @@ static int zram_add(void)
> >  
> >  	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
> >  
> > +	zram->sph_rbtree = RB_ROOT;
> > +
> >  	zram_debugfs_register(zram);
> >  	pr_info("Added device: %s\n", zram->disk->disk_name);
> >  	return device_id;
> > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > index c5254626f051..4a7151c94523 100644
> > --- a/drivers/block/zram/zram_drv.h
> > +++ b/drivers/block/zram/zram_drv.h
> > @@ -56,6 +56,7 @@ enum zram_pageflags {
> >  	ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
> >  	ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
> >  
> > +	ZRAM_MERGED, /* page was merged */
> >  	__NR_ZRAM_PAGEFLAGS,
> >  };
> >  
> > @@ -140,5 +141,10 @@ struct zram {
> >  #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> >  	struct dentry *debugfs_dir;
> >  #endif
> > +	/*
> > +	 * Same-pages handle rbtree: the key is the zsmalloc handle of a
> > +	 * merged (same) page and the value is its reference counter.
> > +	 */
> > +	struct rb_root sph_rbtree;
> >  };
> >  
> >  #endif

-- 
Thank you,
Alexey